From: Andreas Tille
Date: Wed, 13 Dec 2017 11:59:12 +0000 (+0100)
Subject: New upstream version 0.13.0+ds
X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2~13
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=f3bd90ac692a53de9dee562e82a11c5cda33b542;p=python-pysam.git

New upstream version 0.13.0+ds
---

diff --git a/.travis.yml b/.travis.yml
index bfc5d1c..f874a90 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,8 @@ env:
    - CONDA_PY=3.4
    - CONDA_PY=3.5
    - CONDA_PY=3.6
+  global:
+    - PYSAM_LINKING_TEST=1
 addons:
   apt:
diff --git a/INSTALL b/INSTALL
index a1edd45..9636125 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,57 +1,102 @@
-System Requirements
-===================
+An online version of the installation instructions can be found here:
+http://pysam.readthedocs.io/en/latest/installation.html
-SAMtools depends on the zlib library . The latest
-version 1.2.3 is preferred and with the latest version you can compile
-razip and use it to compress a FASTA file. SAMtools' faidx is able to
-index a razip-compressed FASTA file to save diskspace. Older zlib also
-works with SAMtools, but razip cannot be compiled.
+================
+Installing pysam
+================
-The text-based viewer (tview) requires the GNU ncurses library
-, which comes with Mac OS X and
-most of the modern Linux/Unix distributions. If you do not have this
-library installed, you can still compile the rest of SAMtools by
-manually modifying one line in Makefile.
+Pysam can be installed through conda_, pypi_ and from the repository.
+The recommended way to install pysam is through conda/bioconda.
-curl
+Conda installation
+==================
-Pysam requires Python (2.7 or greater) and Cython (0.22 or greater).
-It has not been tested on many other platforms.
+To install pysam in your current conda_ environment, type::
-Windows support does not work yet.
+    conda config --add channels r
+    conda config --add channels bioconda
+    conda install pysam
-Compilation
-===========
+This will install pysam from the bioconda_ channel and automatically
+makes sure that dependencies are installed. Also, compilation flags
+will be set automatically, which will potentially save a lot of
+trouble on OS X.
-Unpack the distribution and enter the pysam directory. Type
+Pypi installation
+=================
-python setup.py build
+Pysam provides a python interface to the functionality contained
+within the htslib_ C library. There are two ways that these two
+can be combined, ``builtin`` and ``external``.
-to compile.
+Builtin
+-------
-Installation
-============
+The typical installation will be through pypi_::
+
+    pip install pysam
+
+This will compile the ``builtin`` htslib source code within pysam.
+
+htslib_ can be configured at compilation to turn on additional
+features such as support for encrypted configurations, plugins,
+and more. See the htslib_ project for more information on these.
+
+Pysam will attempt to configure htslib_ to turn on some advanced
+features. If these fail, for example due to missing library
+dependencies (`libcurl`, `libcrypto`), it will fall back to
+conservative defaults.
+
+Options can be passed to the configure script explicitly by
+setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
+For example::
-Type
+    export HTSLIB_CONFIGURE_OPTIONS=--enable-plugins
+    pip install pysam
- python setup.py install
+External
+--------
-to install it within the site-packages directory of your python
-distribution. Type
+pysam can be combined with an externally installed htslib_
+library. This is a good way to avoid duplication of libraries. To link
+against an externally installed library, set the environment variables
+`HTSLIB_LIBRARY_DIR` and `HTSLIB_INCLUDE_DIR` before installing::
- python setup.py install --help
+    export HTSLIB_LIBRARY_DIR=/usr/local/lib
+    export HTSLIB_INCLUDE_DIR=/usr/local/include
+    pip install pysam
-for more options.
+Note that the location of the file :file:`libhts.so` needs to be known
+to the linker once you run pysam, for example by setting the
+environment variable `LD_LIBRARY_PATH`.
-Build the documentation
-=======================
+Note that generally the pysam and htslib versions need to be compatible. See
+the release notes for more information.
-Install a version of Sphinx that matches your Python version (2 or 3) and run
+Installation from repository
+============================
- python setup.py build_sphinx
+pysam depends on cython_ to provide the connectivity to the htslib_ C
+library. The installation of the source tarball (:file:`.tar.gz`)
+contains pre-built C-files and cython need not be present
+during installation. However, when installing from the repository,
+cython needs to be installed beforehand.
+
+To install from the repository, type::
+
+    python setup.py install
+
+For compilation options, see the section on Pypi installation above.
+
+Requirements
+============
-or
+Depending on the installation method, requirements for building pysam differ.
- python3 setup.py build_sphinx
+When installing through conda_, dependencies will be resolved by the
+package manager. The pip_ installation and installation from source
+require a C compiler and its standard libraries as well as all
+requirements for building htslib. Htslib requirements are listed in
+the htslib/INSTALL file.
-The documentation will be put into build/sphinx.
+Installing from the repository will require cython_ to be installed.
diff --git a/MANIFEST.in b/MANIFEST.in
index 20b7777..4c431ec 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,6 +5,7 @@ #
 # include MANIFEST.in
 include COPYING
+include NEWS
 include INSTALL
 include KNOWN_BUGS
 include THANKS
@@ -31,6 +32,8 @@ exclude bcftools/config.h
 # htslib
 include htslib/*.c
 include htslib/*.h
+include htslib/INSTALL
+include htslib/NEWS
 exclude htslib/config.h
 include htslib/Makefile
 include htslib/htslib_vars.mk
@@ -41,6 +44,8 @@ include htslib/htslib.pc.in
 include htslib/htslib/*.h
 include htslib/cram/*.c
 include htslib/cram/*.h
+include htslib/win/*.c
+include htslib/win/*.h
 include cy_build.py
 include pysam.py
 include requirements.txt
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..528d750
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,604 @@
+An online version of the release notes can be found here:
+http://pysam.readthedocs.io/en/latest/release.html
+
+=============
+Release notes
+=============
+
+Release 0.13.0
+===============
+
+This release wraps htslib/samtools/bcftools versions 1.6.0 and
+contains a series of bugfixes.
+
+* [#544] reading header from remote TabixFiles now works.
+* [#531] add missing tag types H and A. A python float will now be
+  added as 'f' type instead of 'd' type.
+* [#543] use FastaFile instead of Fastafile in pileup.
+* [#546] set is_modified flag in setAttribute so updated attributes
+  are output.
+* [#537] allow tabix index files to be created in a custom location.
+* [#530] add get_index_statistics() method (see the sketch below)
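As a rough illustration (the BAM file name is a placeholder and an index file is assumed to exist), the new get_index_statistics() call can be used along these lines::

    import pysam

    # Per-contig mapped/unmapped counts are taken from the BAM index (.bai/.csi).
    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        for stat in bam.get_index_statistics():
            print(stat.contig, stat.mapped, stat.unmapped, stat.total)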
+
+
+Release 0.12.0.1
+================
+
+Bugfix release to solve compilation issue due to missing
+bcftools/config.h file.
+
+
+Release 0.12.0
+==============
+
+This release wraps htslib/samtools/bcftools versions 1.5.0 and
+contains a series of bugfixes.
+
+* [#473] A new FastxRecord class that can be instantiated from class and
+  modified in-place. Replaces PersistentFastqProxy.
+* [#521] In AlignmentFile, simplify file detection logic and allow remote index files
+  * Removed attempts to guess data and index file names; this is magic left
+    to htslib.
+  * Removed file existence check prior to opening files with htslib
+  * Better error checking after opening files that raise the appropriate
+    error (IOError for when errno is set, ValueError otherwise for backward
+    compatibility).
+  * Report IO errors when loading an index by name.
+  * Allow remote indices (tested using S3 signed URLs).
+  * Document filepath_index and make it an alias for index_filename.
+  * Added a require_index parameter to AlignmentFile
+* [#526] handle unset ref when creating new records
+* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
+  segfaults
+* [#516] expose IO errors via IOError exceptions
+* [#487] add tabix line_skip, remove 'pileup' preset
+* add FastxRecord, replaces PersistentFastqProxy (still present for
+  backwards compatibility)
+* [#496] upgrade to htslib/samtools/bcftools versions 1.5
+* add start/stop to AlignmentFile.fetch() to be consistent with
+  VariantFile.fetch(). "end" is kept for backwards compatibility
+  (see the sketch at the end of this section).
+* [#512] add get_index_statistics() method to AlignmentFile.
+
+Upcoming changes:
+
+In the next release we are planning to separate the header information
+from AlignmentFile into a separate class AlignmentHeader. This layout
+is similar to VariantFile/VariantHeader. With this change we will
+ensure that an AlignedSegment record will be linked to a header so
+that chromosome names can be automatically translated from the numeric
+representation. As a consequence, the way new AlignedSegment records
+are created will need to change as the constructor requires a header::
+
+    header = pysam.AlignmentHeader(
+        reference_names=["chr1", "chr2"],
+        reference_lengths=[1000, 1000])
+
+    read = pysam.AlignedSegment(header)
+
+This will affect all code that instantiates AlignedSegment objects
+directly. We have not yet merged this change, to allow users to provide
+feedback. The pull-request is here: https://github.com/pysam-developers/pysam/pull/518
+Please comment on github.
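As a rough illustration (file and contig names are placeholders and an index is assumed to exist), the new start/stop arguments can be used along these lines::

    import pysam

    # start/stop mirror VariantFile.fetch(); the older "end" keyword remains accepted.
    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        for read in bam.fetch("chr1", start=100, stop=200):
            print(read.query_name, read.reference_start)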
+
+Release 0.11.2.2
+================
+
+Bugfix release to address two issues:
+
+* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and
+  more tests have been added.
+* [#479] Correct VariantRecord edge cases described in issue
+
+Release 0.11.2.1
+================
+
+Release to fix release tar-ball containing 0.11.1 pre-compiled
+C-files.
+
+Release 0.11.2
+==============
+
+This release wraps htslib/samtools/bcftools versions 1.4.1 in response
+to a security fix in these libraries. Additionally the following
+issues have been fixed:
+
+* [#452] add GFF3 support for tabix parsers
+* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END
+* [#447] limit query name to 251 characters (only partially addresses issue)
+
+VariantFile and related object fixes
+
+* Restore VariantFile.\_\_dealloc\_\_
+* Correct handling of bcf_str_missing in bcf_array_to_object and
+  bcf_object_to_array
+* Added update() and pop() methods to some dict-like proxy objects
+* scalar INFO entries could not be set again after being deleted
+* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without
+  raising a KeyError
+* Multiple other fixes for VariantRecordInfo methods
+* INFO/END is now accessible only via VariantRecord.stop and
+  VariantRecord.rlen. Even if present behind the scenes, it is no longer
+  accessible via VariantRecordInfo.
+* Add argument to issue a warning instead of an exception if input appears
+  to be truncated
+
+Other features and fixes:
+
+* Make AlignmentFile \_\_dealloc\_\_ and close more
+  stringent
+* Add argument to AlignmentFile to issue a warning instead of an
+  exception if input appears to be truncated
+
+Release 0.11.1
+==============
+
+Bugfix release
+
+* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility.
+
+Release 0.11.0
+==============
+
+This release wraps the latest versions of htslib/samtools/bcftools and
+implements a few bugfixes.
+
+* [#413] Wrap HTSlib/Samtools/BCFtools 1.4
+* [#422] Fix missing pysam.sort.usage() message
+* [#411] Fix BGZFile initialization bug
+* [#412] Add seek support for BGZFile
+* [#395] Make BGZFile iterable
+* [#433] Correct getQueryEnd
+* [#419] Export SAM enums such as pysam.CMATCH
+* [#415] Fix access by tid in AlignmentFile.fetch()
+* [#405] Writing SAM now outputs a header by default.
+* [#332] split infer_query_length(always) into infer_query_length and infer_read_length
+
+Release 0.10.0
+==============
+
+This release implements further functionality in the VariantFile API
+and includes several bugfixes:
+
+* treat special case -c option in samtools view outputs to stdout even
+  if -o given, fixes #315
+* permit reading BAM files with CSI index, closes #370
+* raise Error if query name exceeds maximum length, fixes #373
+* new method to compute hash value for AlignedSegment
+* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
+* Avoid segfault by detecting out of range reference_id and
+  next_reference in AlignedSegment.tostring
+* Issue #355: Implement streams using file descriptors for VariantFile
+* upgrade to htslib 1.3.2
+* fix compilation with musl libc
+* Issue #316, #360: Rename all Cython modules to have lib as a prefix
+* Issue #332, hardclipped bases in cigar included by
+  pysam.AlignedSegment.infer_query_length()
+* Added support for Python 3.6 filename encoding protocol
+* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
+* Issue #331, fix failure in VariantFile.reset() method
+* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
+  VariantRecord.copy() methods to create new VariantRecord objects
+* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
+* Preliminary (potentially unsafe) support for removing and altering header metadata
+* Many minor fixes and improvements to VariantFile and related objects
+
+Please note that all internal cython extensions now have a lib prefix
+to facilitate linking against pysam extension modules. Any user cython
+extensions using cimport to import pysam definitions will need
+changes, for example::
+
+    cimport pysam.csamtools
+
+will become::
+
+    cimport pysam.libcsamtools
+
+Release 0.9.1
+=============
+
+This is a bugfix release addressing some installation problems
+in pysam 0.9.0, in particular:
+
+* patch included htslib to work with older libcurl versions, fixes #262.
+* do not require cython for python 3 install, fixes #260
+* FastaFile does not accept filepath_index any more, see #270
+* add AlignedSegment.get_cigar_stats method.
+* py3 bugfix in VariantFile.subset_samples, fixes #272
+* add missing sysconfig import, fixes #278
+* do not redirect stdout, but instead write to a separately
+  created file. This should resolve issues when pysam is used
+  in notebooks or other environments that redirect stdout.
+* wrap htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1
+* use bgzf throughout instead of gzip
+* allow specifying a fasta reference for CRAM file when opening
+  for both read and write, fixes #280
+
+Release 0.9.0
+=============
+
+Overview
+--------
+
+The 0.9.0 release upgrades htslib to htslib 1.3 and brings numerous other
+enhancements and bugfixes. See below for a detailed list.
+
+`Htslib 1.3 `_
+comes with additional capabilities for remote file access which depend
+on the presence of optional system libraries. As a consequence, the
+installation script :file:`setup.py` has become more complex. For an
+overview, see :ref:`installation`. We have tested installation on
+linux and OS X, but could not capture all variations. It is possible
+that a 0.9.1 release might follow soon addressing installation issues.
+
+The :py:class:`~.pysam.VariantFile` class provides access to
+:term:`vcf` and :term:`bcf` formatted files. The class is certainly
+usable and its interface is reaching completion, but the API and the
+functionality are subject to change.
+
+Detailed release notes
+----------------------
+
+* upgrade to htslib 1.3
+* python 3 compatibility tested throughout.
+* added a first set of bcftools commands in the pysam.bcftools
+  submodule.
+* samtools commands are now in the pysam.samtools module. For
+  backwards compatibility they are still imported into the pysam
+  namespace.
+* samtools/bcftools return stdout as a single (byte) string. As output
+  can be binary (VCF.gz, BAM) this is necessary to ensure py2/py3
+  compatibility. To replicate the previous behaviour in py2.7, use::
+
+    pysam.samtools.view(self.filename).splitlines(True)
+
+* get_tags() returns the tag type as a character, not an integer (#214);
+  see the sketch after this list
+* TabixFile now raises ValueError on indices created by tabix <1.0 (#206)
+* improve OSX installation and develop mode
+* FastxIterator now handles empty sequences (#204)
+* TabixFile.isremote is now TabixFile.is_remote, in line with AlignmentFile
+* AlignmentFile.count() has extra optional argument read_callback
+* setup.py has been changed to:
+  * install a single builtin htslib library. Previously, each pysam
+    module contained its own version. This reduces compilation time
+    and code bloat.
+  * run configure for the builtin htslib library in order to detect
+    optional libraries such as libcurl. Configure behaviour can be
+    controlled by setting the environment variable
+    HTSLIB_CONFIGURE_OPTIONS.
+* get_reference_sequence() now returns the reference sequence and not
+  something looking like it. This bug had effects on
+  get_aligned_pairs(with_seq=True), see #225. If you have relied on
+  get_aligned_pairs(with_seq=True) in pysam-0.8.4, please check your
+  results.
+* improved autodetection of file formats in AlignmentFile and VariantFile.
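As a rough illustration (the BAM file name is a placeholder; with_value_type is assumed to be available as in current pysam), the per-tag type characters can be inspected along these lines::

    import pysam

    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        read = next(bam)
        # With with_value_type=True each entry is (tag, value, type), e.g. ('NM', 1, 'i').
        for tag, value, value_type in read.get_tags(with_value_type=True):
            print(tag, value, value_type)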
+
+Release 0.8.4
+=============
+
+This release contains numerous bugfixes and a first implementation of
+a pythonic interface to VCF/BCF files. Note that this code is still
+incomplete and preliminary, but does offer a nearly complete immutable
+Pythonic interface to VCF/BCF metadata and data with reading and
+writing capability.
+
+Potential issues when upgrading from v0.8.3:
+
+* binary tags are now returned as python arrays
+
+* renamed several methods for pep8 compatibility, old names still retained for
+  backwards compatibility, but should be considered deprecated.
+  * gettid() is now get_tid()
+  * getrname() is now get_reference_name()
+  * parseRegion() is now parse_region()
+
+* some methods have changed for pep8 compatibility without the old
+  names being present:
+  * fromQualityString() is now qualitystring_to_array()
+  * toQualityString() is now qualities_to_qualitystring()
+
+* faidx now returns strings and not binary strings in py3.
+
+* The cython components have been broken up into smaller files with
+  more specific content. This will affect users using the cython
+  interfaces.
+
+Edited list of commit log changes:
+
+* fixes AlignmentFile.check_index to return True
+* add RG/PM header tag - closes #179
+* add with_seq option to get_aligned_pairs (see the sketch after this list)
+* use char * inside reconsituteReferenceSequence
+* add soft clipping for get_reference_sequence
+* add get_reference_sequence
+* queryEnd now computes length from cigar string if no sequence present, closes #176
+* tolerate missing space at end of gtf files, closes #162
+* do not raise Error when receiving output on stderr
+* add documentation about fetching without index, closes #170
+* FastaFile and FastxFile now return strings in python3, closes #173
+* py3 compat: relative -> absolute imports.
+* add reference_name and next_reference_name attributes to AlignedSegment
+* add function signatures to cvcf cython. Added note about other VCF code.
+* add context manager functions to FastaFile
+* add reference_name and next_reference_name attributes to AlignedSegment
+* PileupColumn also gets a reference_name attribute.
+* add context manager functions to FastaFile
+* TabixFile.header for remote files raises AttributeError, fixes #157
+* add context manager interface to TabixFile, closes #165
+* change ctypedef enum to typedef enum for cython 0.23
+* add function signatures to cvcf cython, also added note about other VCF code
+* remove exception for custom upper-case header record tags.
+* rename VALID_HEADER_FIELDS to KNOWN_HEADER_FIELDS
+* fix header record tag parsing for custom tags.
+* use cython.str in count_coverage, fixes #141
+* avoid maketrans (issues with python3)
+* refactoring: AlignedSegment now in separate module
+* do not execute remote tests if URL not available
+* fix the unmapped count, including reads with no SQ group
+* add raw output to tags
+* added write access for binary tags
+* bugfix in call to resize
+* implemented writing of binary tags from arrays
+* implemented convert_binary_tag to use arrays
+* add special cases for reads that are unmapped or whose mates are unmapped.
+* rename TabProxies to ctabixproxies
+* remove underscores from utility functions
+* move utility methods into cutils
+* remove callback argument to fetch - closes #128
+* avoid calling close in dealloc
+* add unit tests for File object opening
+* change AlignmentFile.open to filepath_or_object
+* implement copy.copy, close #65
+* add caching of array attributes in AlignedSegment, closes #121
+* add export of Fastafile
+* remove superfluous pysam_dispatch
+* use persist option in FastqFile
+* get_tag: expose tag type if requested with `with_value_type`
+* fix to allow reading vcf record info via tabix-based vcf reader
+* add pFastqProxy and pFastqFile objects to make it possible to work with
+  multiple fastq records per file handle, unlike FastqProxy/FastqFile.
+* release GIL around htslib IO operations
+* More work on read/write support, API improvements
+* add `phased` property on `VariantRecordSample`
+* add mutable properties to VariantRecord
+* BCF fixes and start of read/write support
+* VariantHeaderRecord objects now act like mappings for attributes.
+* add VariantHeader.alts dict from alt ID->Record.
+* Bug fix to string representation of structured header records.
+* VariantHeader is now mutable
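As a rough illustration (the BAM file name is a placeholder; reads are assumed to carry MD tags so the reference sequence can be reconstructed), with_seq can be used along these lines::

    import pysam

    with pysam.AlignmentFile("ex1.bam", "rb") as bam:
        read = next(bam)
        # Each tuple is (query_pos, ref_pos, ref_base); mismatching reference bases are lower case.
        for query_pos, ref_pos, ref_base in read.get_aligned_pairs(with_seq=True):
            print(query_pos, ref_pos, ref_base)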
+
+
+Release 0.8.3
+=============
+
+* samtools commands now accept the "catch_stdout" option.
+
+* get_aligned_pairs now works for soft-clipped reads.
+
+* query_position is now None when a PileupRead is not aligned
+  to a particular position.
+
+* AlignedSegments are now comparable and hashable.
+
+Release 0.8.2.1
+===============
+
+* Installation bugfix release.
+
+Release 0.8.2
+=============
+
+* Pysam now wraps htslib 1.2.1 and samtools version 1.2.
+
+* Added CRAM file support to pysam.
+
+* New alignment info interface.
+  * opt() and setTag are deprecated, use get_tag() and set_tag()
+    instead.
+  * added has_tag()
+  * tags is deprecated, use get_tags() and set_tags() instead.
+
+* FastqFile is now FastxFile to reflect that the latter permits
+  iteration over both fastq- and fasta-formatted files.
+
+* A Cython wrapper for htslib VCF/BCF reader/writer. The wrapper
+  provides a nearly complete Pythonic interface to VCF/BCF metadata
+  with reading and writing capability. However, the interface is still
+  incomplete and preliminary and lacks capability to mutate the
+  resulting data.
+
+Release 0.8.1
+=============
+
+* Pysam now wraps htslib and samtools versions 1.1.
+
+* Bugfixes, most notable:
+  * issue #43: uncompressed BAM output
+  * issue #42: skip tests requiring network if none available
+  * issue #19: multiple iterators can now be made to work on the same tabix file
+  * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3
+  * issue #5: type guessing for lists of integers fixed
+
+* API changes for consistency. The old API is still present,
+  but deprecated. In particular:
+
+  * Tabixfile -> TabixFile
+  * Fastafile -> FastaFile
+  * Fastqfile -> FastqFile
+  * Samfile -> AlignmentFile
+  * AlignedRead -> AlignedSegment
+  * qname -> query_name
+  * tid -> reference_id
+  * pos -> reference_start
+  * mapq -> mapping_quality
+  * rnext -> next_reference_id
+  * pnext -> next_reference_start
+  * cigar -> cigartuples
+  * cigarstring -> cigarstring
+  * tlen -> template_length
+  * seq -> query_sequence
+  * qual -> query_qualities, now returns array
+  * qqual -> query_alignment_qualities, now returns array
+  * tags -> tags
+  * alen -> reference_length, reference is always "alignment", so removed
+  * aend -> reference_end
+  * rlen -> query_length
+  * query -> query_alignment_sequence
+  * qstart -> query_alignment_start
+  * qend -> query_alignment_end
+  * qlen -> query_alignment_length
+  * mrnm -> next_reference_id
+  * mpos -> next_reference_start
+  * rname -> reference_id
+  * isize -> template_length
+  * blocks -> get_blocks()
+  * aligned_pairs -> get_aligned_pairs()
+  * inferred_length -> infer_query_length()
+  * positions -> get_reference_positions()
+  * overlap() -> get_overlap()
+
+  * All strings are now passed to or received from the pysam API
+    as strings, no more bytes.
+
+Other changes:
+  * AlignmentFile.fetch(reopen) option is now multiple_iterators. The
+    default changed to not reopen a file unless requested by the user.
+  * FastaFile.getReferenceLength is now FastaFile.get_reference_length
+
+Backwards incompatible changes
+
+* Empty cigarstring now returns None (instead of '')
+* Empty cigar now returns None (instead of [])
+* When using the extension classes in cython modules, AlignedRead
+  needs to be substituted with AlignedSegment.
+* fancy_str() has been removed
+* qual, qqual now return arrays
+
+Release 0.8.0
+=============
+
+* Disabled features
+  * IteratorColumn.setMask() disabled as htslib does not implement
+    this functionality?
+
+* Not implemented yet:
+  * reading SAM files without header
+
+Tabix files between version 0.7.8 and 0.8.0 are
+not compatible and need to be re-indexed.
+
+While versions 0.7.8 and 0.8.0 should be mostly
+compatible, there are some notable exceptions:
+
+* tabix iterators will fail if there are comments
+  in the middle or the end of a file.
+
+* tabix always raises ValueError for invalid intervals.
+  Previously, different types of errors were raised
+  (KeyError, IndexError, ValueError) depending on
+  the type of invalid intervals (missing chromosome,
+  out-of-range, malformatted interval).
+
+
+Release 0.7.8
+=============
+
+* added AlignedRead.setTag method
+* added AlignedRead.blocks
+* unsetting CIGAR strings is now possible
+* empty CIGAR string returns empty list
+* added reopen flag to Samfile.fetch()
+* various bugfixes
+
+Release 0.7.7
+=============
+
+* added Fastafile.references, .nreferences and .lengths
+* tabix_iterator now uses kseq.h for python 2.7
+
+Release 0.7.6
+=============
+
+* added inferred_length property
+* issue 122: MACOSX getline missing, now it works?
+* seq and qual can be set None +* added Fastqfile + +Release 0.7.5 +============= + +* switch to samtools 0.1.19 +* issue 122: MACOSX getline missing +* issue 130: clean up tempfiles +* various other bugfixes + +Release 0.7.4 +============= + +* further bugfixes to setup.py and package layout + +Release 0.7.3 +============= + +* further bugfixes to setup.py +* upgraded distribute_setup.py to 0.6.34 + +Release 0.7.2 +============= + +* bugfix in installer - failed when cython not present +* changed installation locations of shared libraries + +Release 0.7.1 +============= + +* bugfix: missing PP tag PG records in header +* added pre-built .c files to distribution + +Release 0.7 +=========== + +* switch to tabix 0.2.6 +* added cigarstring field +* python3 compatibility +* added B tag handling +* added check_sq and check_header options to Samfile.__init__ +* added lazy GTF parsing to tabix +* reworked support for VCF format parsing +* bugfixes + +Release 0.6 +=========== + +* switch to samtools 0.1.18 +* various bugfixes +* removed references to deprecated 'samtools pileup' functionality +* AlignedRead.tags now returns an empty list if there are no tags. +* added pnext, rnext and tlen + +Release 0.5 +=========== + +* switch to samtools 0.1.16 and tabix 0.2.5 +* improved tabix parsing, added vcf support +* re-organized code to permit linking against pysam +* various bugfixes +* added Samfile.positions and Samfile.overlap + +Release 0.4 +=========== + +* switch to samtools 0.1.12a and tabix 0.2.3 +* added snp and indel calling. +* switch from pyrex to cython +* changed handling of samtools stderr +* various bugfixes +* added Samfile.count and Samfile.mate +* deprecated AlignedRead.rname, added AlignedRead.tid + +Release 0.3 +=========== + +* switch to samtools 0.1.8 +* added support for tabix files +* numerous bugfixes including +* permit simultaneous iterators on the same file +* working access to remote files diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 258ef14..544eca6 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014 Genome Research Ltd. + Copyright (c) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -39,6 +39,16 @@ #include "regidx.h" #include "bcftools.h" #include "rbuf.h" +#include "filter.h" + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + +#define PICK_REF 1 +#define PICK_ALT 2 +#define PICK_LONG 4 +#define PICK_SHORT 8 typedef struct { @@ -75,12 +85,16 @@ typedef struct chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences // Note that the chain is re-initialised for each chromosome/seq_region + filter_t *filter; + char *filter_str; + int filter_logic; // include or exclude sites which match the filters? 
One of FLT_INCLUDE/FLT_EXCLUDE + bcf_srs_t *files; bcf_hdr_t *hdr; FILE *fp_out; FILE *fp_chain; char **argv; - int argc, output_iupac, haplotype, isample; + int argc, output_iupac, haplotype, allele, isample; char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname; } args_t; @@ -195,7 +209,7 @@ static void init_data(args_t *args) args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample); if ( args->isample<0 ) error("No such sample: %s\n", args->sample); } - if ( args->haplotype && args->isample<0 ) + if ( (args->haplotype || args->allele) && args->isample<0 ) { if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); args->isample = 0; @@ -220,10 +234,14 @@ static void init_data(args_t *args) if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } else args->fp_out = stdout; + if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n"); + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); } static void destroy_data(args_t *args) { + if (args->filter) filter_destroy(args->filter); bcf_sr_destroy(args->files); int i; for (i=0; ivcf_rbuf.m; i++) @@ -287,9 +305,16 @@ static bcf1_t **next_vcf_line(args_t *args) int i = rbuf_shift(&args->vcf_rbuf); return &args->vcf_buf[i]; } - else if ( bcf_sr_next_line(args->files) ) + while ( bcf_sr_next_line(args->files) ) + { + if ( args->filter ) + { + int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL); + if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1; + if ( !is_ok ) continue; + } return &args->files->readers[0].buffer[0]; - + } return NULL; } static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr) @@ -358,33 +383,36 @@ static void apply_variant(args_t *args, bcf1_t *rec) int i, ialt = 1; if ( args->isample >= 0 ) { + bcf_unpack(rec, BCF_UN_FMT); bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); if ( !fmt ) return; + + if ( fmt->type!=BCF_BT_INT8 ) + error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); + uint8_t *ptr = fmt->p + fmt->size*args->isample; + if ( args->haplotype ) { if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); + ialt = ptr[args->haplotype-1]; if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); } else if ( args->output_iupac ) { - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); + ialt = ptr[0]; if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); int jalt; if ( fmt->n>1 ) { - ptr = fmt->p + fmt->size*args->isample + 1; - jalt = bcf_dec_int1(ptr, fmt->type, &ignore); + jalt = ptr[1]; if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt; else jalt = bcf_gt_allele(jalt); } else jalt = ialt; - if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? 
{ char ial = rec->d.allele[ialt][0]; @@ -394,13 +422,40 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else { + int is_hom = 1; for (i=0; in; i++) { - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; - ialt = bcf_gt_allele(ialt); - if ( ialt ) break; + if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes + if ( ptr[i]==bcf_int32_vector_end ) break; + ialt = bcf_gt_allele(ptr[i]); + if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; } + } + if ( !is_hom ) + { + int prev_len = 0, jalt; + for (i=0; in; i++) + { + if ( ptr[i]==bcf_int32_vector_end ) break; + jalt = bcf_gt_allele(ptr[i]); + if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + if ( args->allele & (PICK_LONG|PICK_SHORT) ) + { + int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); + if ( i==0 ) ialt = jalt, prev_len = len; + else if ( len == prev_len ) + { + if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len; + else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len; + } + else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len; + else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len; + } + else + { + if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt; + else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt; + } + } } } if ( !ialt ) return; // ref allele @@ -623,12 +678,21 @@ static void usage(args_t *args) fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, " -c, --chain write a chain file for liftover\n"); + fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n"); - fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(stderr, " the codes are case-insensitive:\n"); + fprintf(stderr, " 1: first allele from GT\n"); + fprintf(stderr, " 2: second allele\n"); + fprintf(stderr, " R: REF allele in het genotypes\n"); + fprintf(stderr, " A: ALT allele\n"); + fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); fprintf(stderr, " -m, --mask replace regions with N\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -c, --chain write a chain file for liftover\n"); fprintf(stderr, " -s, --sample apply variants of the given sample\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); @@ -645,8 +709,10 @@ int main_consensus(int argc, char *argv[]) static struct option loptions[] = { + {"exclude",required_argument,NULL,'e'}, + {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, - {"iupac-codes",0,0,'i'}, + {"iupac-codes",0,0,'I'}, {"haplotype",1,0,'H'}, {"output",1,0,'o'}, {"fasta-ref",1,0,'f'}, @@ -655,19 +721,32 @@ int main_consensus(int argc, char *argv[]) {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0) { switch (c) { case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; - case 'i': args->output_iupac = 1; break; + case 'I': args->output_iupac = 1; break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; case 'm': args->mask_fname = optarg; break; case 'c': args->chain_fname = optarg; break; case 'H': - args->haplotype = optarg[0] - '0'; - if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); + if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF; + else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT; + else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF; + else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF; + else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF; + else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; + else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; + else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; + else + { + args->haplotype = optarg[0] - '0'; + if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); + } break; default: usage(args); break; } diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 86e855e..5250b4f 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014 Genome Research Ltd. + Copyright (c) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -41,6 +41,16 @@ #include "regidx.h" #include "bcftools.h" #include "rbuf.h" +#include "filter.h" + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + +#define PICK_REF 1 +#define PICK_ALT 2 +#define PICK_LONG 4 +#define PICK_SHORT 8 typedef struct { @@ -77,12 +87,16 @@ typedef struct chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences // Note that the chain is re-initialised for each chromosome/seq_region + filter_t *filter; + char *filter_str; + int filter_logic; // include or exclude sites which match the filters? 
One of FLT_INCLUDE/FLT_EXCLUDE + bcf_srs_t *files; bcf_hdr_t *hdr; FILE *fp_out; FILE *fp_chain; char **argv; - int argc, output_iupac, haplotype, isample; + int argc, output_iupac, haplotype, allele, isample; char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname; } args_t; @@ -197,7 +211,7 @@ static void init_data(args_t *args) args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample); if ( args->isample<0 ) error("No such sample: %s\n", args->sample); } - if ( args->haplotype && args->isample<0 ) + if ( (args->haplotype || args->allele) && args->isample<0 ) { if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); args->isample = 0; @@ -222,10 +236,14 @@ static void init_data(args_t *args) if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } else args->fp_out = pysam_stdout; + if ( args->isample<0 ) fprintf(pysam_stderr,"Note: the --sample option not given, applying all records\n"); + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); } static void destroy_data(args_t *args) { + if (args->filter) filter_destroy(args->filter); bcf_sr_destroy(args->files); int i; for (i=0; ivcf_rbuf.m; i++) @@ -289,9 +307,16 @@ static bcf1_t **next_vcf_line(args_t *args) int i = rbuf_shift(&args->vcf_rbuf); return &args->vcf_buf[i]; } - else if ( bcf_sr_next_line(args->files) ) + while ( bcf_sr_next_line(args->files) ) + { + if ( args->filter ) + { + int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL); + if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1; + if ( !is_ok ) continue; + } return &args->files->readers[0].buffer[0]; - + } return NULL; } static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr) @@ -360,33 +385,36 @@ static void apply_variant(args_t *args, bcf1_t *rec) int i, ialt = 1; if ( args->isample >= 0 ) { + bcf_unpack(rec, BCF_UN_FMT); bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); if ( !fmt ) return; + + if ( fmt->type!=BCF_BT_INT8 ) + error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); + uint8_t *ptr = fmt->p + fmt->size*args->isample; + if ( args->haplotype ) { if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); + ialt = ptr[args->haplotype-1]; if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); } else if ( args->output_iupac ) { - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); + ialt = ptr[0]; if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; ialt = bcf_gt_allele(ialt); int jalt; if ( fmt->n>1 ) { - ptr = fmt->p + fmt->size*args->isample + 1; - jalt = bcf_dec_int1(ptr, fmt->type, &ignore); + jalt = ptr[1]; if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt; else jalt = bcf_gt_allele(jalt); } else jalt = ialt; - if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this 
a het snp? { char ial = rec->d.allele[ialt][0]; @@ -396,13 +424,40 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else { + int is_hom = 1; for (i=0; in; i++) { - uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i; - ialt = bcf_dec_int1(ptr, fmt->type, &ignore); - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return; - ialt = bcf_gt_allele(ialt); - if ( ialt ) break; + if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes + if ( ptr[i]==bcf_int32_vector_end ) break; + ialt = bcf_gt_allele(ptr[i]); + if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; } + } + if ( !is_hom ) + { + int prev_len = 0, jalt; + for (i=0; in; i++) + { + if ( ptr[i]==bcf_int32_vector_end ) break; + jalt = bcf_gt_allele(ptr[i]); + if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + if ( args->allele & (PICK_LONG|PICK_SHORT) ) + { + int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); + if ( i==0 ) ialt = jalt, prev_len = len; + else if ( len == prev_len ) + { + if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len; + else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len; + } + else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len; + else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len; + } + else + { + if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt; + else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt; + } + } } } if ( !ialt ) return; // ref allele @@ -625,12 +680,21 @@ static void usage(args_t *args) fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -c, --chain write a chain file for liftover\n"); + fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysam_stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n"); - fprintf(pysam_stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(pysam_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(pysam_stderr, " the codes are case-insensitive:\n"); + fprintf(pysam_stderr, " 1: first allele from GT\n"); + fprintf(pysam_stderr, " 2: second allele\n"); + fprintf(pysam_stderr, " R: REF allele in het genotypes\n"); + fprintf(pysam_stderr, " A: ALT allele\n"); + fprintf(pysam_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(pysam_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(pysam_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); fprintf(pysam_stderr, " -m, --mask replace regions with N\n"); fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -c, --chain write a chain file for liftover\n"); fprintf(pysam_stderr, " -s, --sample apply variants of the given sample\n"); fprintf(pysam_stderr, "Examples:\n"); fprintf(pysam_stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); @@ -647,8 +711,10 @@ int main_consensus(int argc, char *argv[]) static struct option loptions[] = { + {"exclude",required_argument,NULL,'e'}, + {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, - {"iupac-codes",0,0,'i'}, + {"iupac-codes",0,0,'I'}, {"haplotype",1,0,'H'}, {"output",1,0,'o'}, {"fasta-ref",1,0,'f'}, @@ -657,19 +723,32 @@ int main_consensus(int argc, char *argv[]) {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0) { switch (c) { case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; - case 'i': args->output_iupac = 1; break; + case 'I': args->output_iupac = 1; break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; case 'm': args->mask_fname = optarg; break; case 'c': args->chain_fname = optarg; break; case 'H': - args->haplotype = optarg[0] - '0'; - if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); + if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF; + else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT; + else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF; + else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF; + else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF; + else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; + else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; + else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; + else + { + args->haplotype = optarg[0] - '0'; + if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); + } break; default: usage(args); break; } diff --git a/bcftools/csq.c b/bcftools/csq.c index b1db103..94ac442 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -164,17 +164,6 @@ #define N_SPLICE_REGION_EXON 3 #define N_SPLICE_REGION_INTRON 8 -// Ensembl ID format, e.g. -// ENST00000423372 for human .. ENST%011d -// ENSMUST00000120394 for mouse .. ENSMUST%011d -char ENSID_BUF[32], *ENSID_FMT = NULL; -static inline char *ENSID(uint32_t id) -{ - sprintf(ENSID_BUF,ENSID_FMT,id); - return ENSID_BUF; -} - - #define N_REF_PAD 10 // number of bases to avoid boundary effects #define STRAND_REV 0 @@ -509,7 +498,6 @@ hap_t; temporary list of all exons, CDS, UTRs */ KHASH_MAP_INIT_INT(int2tscript, tscript_t*) -KHASH_MAP_INIT_INT(int2int, int) KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) typedef struct { @@ -522,25 +510,41 @@ typedef struct uint32_t iseq:29; } ftr_t; +/* + Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) + to integer id. To keep the memory requirements low, the original version + relied on IDs in the form of a string prefix and a numerical id. 
However, + it turns out that this assumption is not valid for some ensembl GFFs, see + for example Zea_mays.AGPv4.36.gff3.gz + */ +typedef struct +{ + void *str2id; // khash_str2int + int nstr, mstr; + char **str; // numeric id to string +} +id_tbl_t; typedef struct { // all exons, CDS, UTRs ftr_t *ftr; int nftr, mftr; - // mapping from transcript ensembl id to gene id + // mapping from gene id to gf_gene_t kh_int2gene_t *gid2gene; // mapping from transcript id to tscript, for quick CDS anchoring kh_int2tscript_t *id2tr; // sequences - void *seq2int; + void *seq2int; // str2int hash char **seq; int nseq, mseq; // ignored biotypes void *ignored_biotypes; + + id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx } aux_t; @@ -590,6 +594,7 @@ typedef struct _args_t int nrm_tr, mrm_tr; csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs int ncsq_buf, mcsq_buf; + id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx faidx_t *fai; kstring_t str, str2; @@ -694,33 +699,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); return se+1; } -static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss) +static void gff_id_init(id_tbl_t *tbl) { - ss = strstr(ss,needle); - if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); - ss += strlen(needle); - while ( *ss && !isdigit(*ss) ) ss++; - if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); - char *se; - uint32_t id = strtol(ss, &se, 10); - if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); - if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice - return id; + memset(tbl, 0, sizeof(*tbl)); + tbl->str2id = khash_str2int_init(); +} +static void gff_id_destroy(id_tbl_t *tbl) +{ + khash_str2int_destroy_free(tbl->str2id); + free(tbl->str); } -static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss) +static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss) { - ss = strstr(ss,needle); + ss = strstr(ss,needle); // e.g. 
"ID=transcript:" if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); ss += strlen(needle); + char *se = ss; - while ( *se && !isdigit(*se) ) se++; - kstring_t str = {0,0,0}; - kputsn(ss,se-ss,&str); - ss = se; - while ( *se && isdigit(*se) ) se++; - ksprintf(&str,"%%0%dd",(int)(se-ss)); - ENSID_FMT = str.s; + while ( *se && *se!=';' && !isspace(*se) ) se++; + char tmp = *se; + *se = 0; + + int id; + if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 ) + { + id = tbl->nstr++; + hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); + tbl->str[id] = strdup(ss); + int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id); + } + *se = tmp; + + return id; } static inline int gff_parse_type(char *line) { @@ -880,10 +890,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) } // create a mapping from transcript_id to gene_id - uint32_t trid = gff_parse_id(line, "ID=transcript:", ss); - uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss); - - if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species + uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss); + uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss); tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); tr->id = trid; @@ -910,7 +918,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha aux_t *aux = &args->init; // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss); + uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss); gf_gene_t *gene = gene_init(aux, gene_id); assert( !gene->name ); // the gene_id should be unique @@ -918,13 +926,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha // substring search for "Name=OR4F5" ss = strstr(chr_end+2,"Name="); - if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line); - ss += 5; - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - gene->name = (char*) malloc(se-ss+1); - memcpy(gene->name,ss,se-ss); - gene->name[se-ss] = 0; + if ( ss ) + { + ss += 5; + char *se = ss; + while ( *se && *se!=';' && !isspace(*se) ) se++; + gene->name = (char*) malloc(se-ss+1); + memcpy(gene->name,ss,se-ss); + gene->name[se-ss] = 0; + } + else + gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead } int gff_parse(args_t *args, char *line, ftr_t *ftr) { @@ -999,7 +1011,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) ss += 2; // substring search for "Parent=transcript:ENST00000437963" - ftr->trid = gff_parse_id(line, "Parent=transcript:", ss); + ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss); ftr->iseq = feature_set_seq(args, chr_beg,chr_end); return 0; } @@ -1104,7 +1116,7 @@ void tscript_init_cds(args_t *args) { int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; if ( phase!=len%3) - error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); assert( phase == len%3 ); len += tr->cds[i]->len; } @@ -1132,7 +1144,7 @@ void tscript_init_cds(args_t *args) { int phase = tr->cds[i]->phase ? 
3 - tr->cds[i]->phase : 0; if ( phase!=len%3) - error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); len += tr->cds[i]->len; } } @@ -1205,6 +1217,8 @@ void init_gff(args_t *args) aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); aux->ignored_biotypes = khash_str2int_init(); + gff_id_init(&aux->gene_ids); + gff_id_init(&args->tscript_ids); // parse gff kstring_t str = {0,0,0}; @@ -1252,7 +1266,7 @@ void init_gff(args_t *args) else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); else - error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type)); + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); } tscript_init_cds(args); @@ -1270,6 +1284,7 @@ void init_gff(args_t *args) // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); kh_destroy(int2tscript,aux->id2tr); free(aux->seq); + gff_id_destroy(&aux->gene_ids); if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) { @@ -1409,7 +1424,7 @@ void destroy_data(args_t *args) free(args->gt_arr); free(args->str.s); free(args->str2.s); - free(ENSID_FMT); + gff_id_destroy(&args->tscript_ids); } /* @@ -2491,7 +2506,7 @@ exit_duplicate: #define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen) #define node2rpos(i) (hap->stack[i].node->rec->pos) -void kput_vcsq(vcsq_t *csq, kstring_t *str) +void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) { // Remove start/stop from incomplete CDS, but only if there is another // consequence as something must be reported @@ -2520,7 +2535,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str) if ( csq->gene ) kputs(csq->gene , str); kputc_('|', str); - if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid)); + if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); kputc_('|', str); kputs(gf_type2gff_string(csq->biotype), str); @@ -2889,7 +2904,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) fprintf(args->out,"-"); args->str.l = 0; - kput_vcsq(&csq->type, &args->str); + kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) @@ -2913,7 +2928,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih fprintf(args->out,"-"); args->str.l = 0; - kput_vcsq(&csq->type, &args->str); + kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } } @@ -3057,11 +3072,11 @@ void vbuf_flush(args_t *args) } args->str.l = 0; - kput_vcsq(&vrec->vcsq[0], &args->str); + kput_vcsq(args, &vrec->vcsq[0], &args->str); for (j=1; jnvcsq; j++) { kputc_(',', &args->str); - kput_vcsq(&vrec->vcsq[j], &args->str); + kput_vcsq(args, &vrec->vcsq[j], &args->str); } bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s); if ( args->hdr_nsmpl ) @@ -3665,7 +3680,7 @@ void process(args_t *args, bcf1_t **rec_ptr) 
return; } -const char *usage(void) +static const char *usage(void) { return "\n" diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index b79a030..4a7810c 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -166,17 +166,6 @@ #define N_SPLICE_REGION_EXON 3 #define N_SPLICE_REGION_INTRON 8 -// Ensembl ID format, e.g. -// ENST00000423372 for human .. ENST%011d -// ENSMUST00000120394 for mouse .. ENSMUST%011d -char ENSID_BUF[32], *ENSID_FMT = NULL; -static inline char *ENSID(uint32_t id) -{ - sprintf(ENSID_BUF,ENSID_FMT,id); - return ENSID_BUF; -} - - #define N_REF_PAD 10 // number of bases to avoid boundary effects #define STRAND_REV 0 @@ -511,7 +500,6 @@ hap_t; temporary list of all exons, CDS, UTRs */ KHASH_MAP_INIT_INT(int2tscript, tscript_t*) -KHASH_MAP_INIT_INT(int2int, int) KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) typedef struct { @@ -524,25 +512,41 @@ typedef struct uint32_t iseq:29; } ftr_t; +/* + Mapping from GFF ID string (such as ENST00000450305 or Zm00001d027230_P001) + to integer id. To keep the memory requirements low, the original version + relied on IDs in the form of a string prefix and a numerical id. However, + it turns out that this assumption is not valid for some ensembl GFFs, see + for example Zea_mays.AGPv4.36.gff3.gz + */ +typedef struct +{ + void *str2id; // khash_str2int + int nstr, mstr; + char **str; // numeric id to string +} +id_tbl_t; typedef struct { // all exons, CDS, UTRs ftr_t *ftr; int nftr, mftr; - // mapping from transcript ensembl id to gene id + // mapping from gene id to gf_gene_t kh_int2gene_t *gid2gene; // mapping from transcript id to tscript, for quick CDS anchoring kh_int2tscript_t *id2tr; // sequences - void *seq2int; + void *seq2int; // str2int hash char **seq; int nseq, mseq; // ignored biotypes void *ignored_biotypes; + + id_tbl_t gene_ids; // temporary table for mapping between gene id (eg. Zm00001d027245) and a numeric idx } aux_t; @@ -592,6 +596,7 @@ typedef struct _args_t int nrm_tr, mrm_tr; csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs int ncsq_buf, mcsq_buf; + id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx faidx_t *fai; kstring_t str, str2; @@ -696,33 +701,38 @@ static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); return se+1; } -static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss) +static void gff_id_init(id_tbl_t *tbl) { - ss = strstr(ss,needle); - if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); - ss += strlen(needle); - while ( *ss && !isdigit(*ss) ) ss++; - if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); - char *se; - uint32_t id = strtol(ss, &se, 10); - if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line); - if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line); - assert( id <= 0xffffff ); // see gf_gene_t.id. 
Ensembl IDs are never that big in practice - return id; + memset(tbl, 0, sizeof(*tbl)); + tbl->str2id = khash_str2int_init(); +} +static void gff_id_destroy(id_tbl_t *tbl) +{ + khash_str2int_destroy_free(tbl->str2id); + free(tbl->str); } -static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss) +static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss) { - ss = strstr(ss,needle); + ss = strstr(ss,needle); // e.g. "ID=transcript:" if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); ss += strlen(needle); + char *se = ss; - while ( *se && !isdigit(*se) ) se++; - kstring_t str = {0,0,0}; - kputsn(ss,se-ss,&str); - ss = se; - while ( *se && isdigit(*se) ) se++; - ksprintf(&str,"%%0%dd",(int)(se-ss)); - ENSID_FMT = str.s; + while ( *se && *se!=';' && !isspace(*se) ) se++; + char tmp = *se; + *se = 0; + + int id; + if ( khash_str2int_get(tbl->str2id, ss, &id) < 0 ) + { + id = tbl->nstr++; + hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); + tbl->str[id] = strdup(ss); + int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id); + } + *se = tmp; + + return id; } static inline int gff_parse_type(char *line) { @@ -882,10 +892,8 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) } // create a mapping from transcript_id to gene_id - uint32_t trid = gff_parse_id(line, "ID=transcript:", ss); - uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss); - - if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species + uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss); + uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss); tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); tr->id = trid; @@ -912,7 +920,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha aux_t *aux = &args->init; // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss); + uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss); gf_gene_t *gene = gene_init(aux, gene_id); assert( !gene->name ); // the gene_id should be unique @@ -920,13 +928,17 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha // substring search for "Name=OR4F5" ss = strstr(chr_end+2,"Name="); - if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line); - ss += 5; - char *se = ss; - while ( *se && *se!=';' && !isspace(*se) ) se++; - gene->name = (char*) malloc(se-ss+1); - memcpy(gene->name,ss,se-ss); - gene->name[se-ss] = 0; + if ( ss ) + { + ss += 5; + char *se = ss; + while ( *se && *se!=';' && !isspace(*se) ) se++; + gene->name = (char*) malloc(se-ss+1); + memcpy(gene->name,ss,se-ss); + gene->name[se-ss] = 0; + } + else + gene->name = strdup(aux->gene_ids.str[gene_id]); // Name= field is not present, use the gene ID instead } int gff_parse(args_t *args, char *line, ftr_t *ftr) { @@ -1001,7 +1013,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) ss += 2; // substring search for "Parent=transcript:ENST00000437963" - ftr->trid = gff_parse_id(line, "Parent=transcript:", ss); + ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss); ftr->iseq = feature_set_seq(args, chr_beg,chr_end); return 0; } @@ -1106,7 +1118,7 @@ void tscript_init_cds(args_t *args) { int phase = tr->cds[i]->phase ? 
3 - tr->cds[i]->phase : 0; if ( phase!=len%3) - error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); assert( phase == len%3 ); len += tr->cds[i]->len; } @@ -1134,7 +1146,7 @@ void tscript_init_cds(args_t *args) { int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; if ( phase!=len%3) - error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len); + error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); len += tr->cds[i]->len; } } @@ -1207,6 +1219,8 @@ void init_gff(args_t *args) aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL); aux->ignored_biotypes = khash_str2int_init(); + gff_id_init(&aux->gene_ids); + gff_id_init(&args->tscript_ids); // parse gff kstring_t str = {0,0,0}; @@ -1254,7 +1268,7 @@ void init_gff(args_t *args) else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr); else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr); else - error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type)); + error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,args->tscript_ids.str[ftr->trid],gf_type2gff_string(ftr->type)); } tscript_init_cds(args); @@ -1272,6 +1286,7 @@ void init_gff(args_t *args) // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene); kh_destroy(int2tscript,aux->id2tr); free(aux->seq); + gff_id_destroy(&aux->gene_ids); if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) { @@ -1411,7 +1426,7 @@ void destroy_data(args_t *args) free(args->gt_arr); free(args->str.s); free(args->str2.s); - free(ENSID_FMT); + gff_id_destroy(&args->tscript_ids); } /* @@ -2493,7 +2508,7 @@ exit_duplicate: #define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen) #define node2rpos(i) (hap->stack[i].node->rec->pos) -void kput_vcsq(vcsq_t *csq, kstring_t *str) +void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str) { // Remove start/stop from incomplete CDS, but only if there is another // consequence as something must be reported @@ -2522,7 +2537,7 @@ void kput_vcsq(vcsq_t *csq, kstring_t *str) if ( csq->gene ) kputs(csq->gene , str); kputc_('|', str); - if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid)); + if ( csq->type & CSQ_PRN_TSCRIPT ) kputs(args->tscript_ids.str[csq->trid], str); kputc_('|', str); kputs(gf_type2gff_string(csq->biotype), str); @@ -2891,7 +2906,7 @@ static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap) fprintf(args->out,"-"); args->str.l = 0; - kput_vcsq(&csq->type, &args->str); + kput_vcsq(args, &csq->type, &args->str); fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node) @@ -2915,7 +2930,7 @@ static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ih fprintf(args->out,"-"); args->str.l = 0; - kput_vcsq(&csq->type, &args->str); + kput_vcsq(args, &csq->type, &args->str); 
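The csq.c changes above drop the old `ENSID_FMT` trick, which assumed every gene/transcript ID is a fixed string prefix followed by a number, and replace it with the small `id_tbl_t` string-to-integer table (`gff_id_init`/`gff_id_parse`/`gff_id_destroy`), so GFFs with IDs such as Zm00001d027230_P001 are handled as well. The following standalone sketch illustrates the same idea using only the C standard library; the real code relies on bcftools' `khash_str2int` hash and `hts_expand`, and the `toy_*` names here are invented purely for the illustration::

    /* Minimal sketch of the string<->integer ID table idea from the patch.
     * Assumption: a linear scan stands in for the khash_str2int hash used
     * in bcftools; toy_tbl_t and toy_id_parse are invented names. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct
    {
        char **str;         /* numeric id -> original ID string */
        int nstr, mstr;     /* used / allocated */
    } toy_tbl_t;

    /* Return the numeric id of `id`, inserting it on first use. */
    static int toy_id_parse(toy_tbl_t *tbl, const char *id)
    {
        int i;
        for (i = 0; i < tbl->nstr; i++)
            if ( !strcmp(tbl->str[i], id) ) return i;   /* already known */
        if ( tbl->nstr == tbl->mstr )                   /* grow the array */
        {
            tbl->mstr = tbl->mstr ? 2*tbl->mstr : 8;
            tbl->str  = (char**) realloc(tbl->str, tbl->mstr*sizeof(*tbl->str));
        }
        tbl->str[tbl->nstr] = strdup(id);
        return tbl->nstr++;
    }

    int main(void)
    {
        int i;
        toy_tbl_t tbl = {0,0,0};
        int a = toy_id_parse(&tbl, "ENST00000450305");
        int b = toy_id_parse(&tbl, "Zm00001d027230_P001");
        int c = toy_id_parse(&tbl, "ENST00000450305");  /* same ID as `a` */
        printf("%d %d %d -> %s\n", a, b, c, tbl.str[c]); /* 0 1 0 -> ENST00000450305 */
        for (i = 0; i < tbl.nstr; i++) free(tbl.str[i]);
        free(tbl.str);
        return 0;
    }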
fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s); } } @@ -3059,11 +3074,11 @@ void vbuf_flush(args_t *args) } args->str.l = 0; - kput_vcsq(&vrec->vcsq[0], &args->str); + kput_vcsq(args, &vrec->vcsq[0], &args->str); for (j=1; jnvcsq; j++) { kputc_(',', &args->str); - kput_vcsq(&vrec->vcsq[j], &args->str); + kput_vcsq(args, &vrec->vcsq[j], &args->str); } bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s); if ( args->hdr_nsmpl ) @@ -3667,7 +3682,7 @@ void process(args_t *args, bcf1_t **rec_ptr) return; } -const char *usage(void) +static const char *usage(void) { return "\n" diff --git a/bcftools/filter.c b/bcftools/filter.c index 78ff1f1..3dc91a7 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -67,21 +67,23 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types - int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*]) + int idx; // 0-based index to VCF vectors, -1: not a vector, + // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) + int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited void (*setter)(filter_t *, bcf1_t *, struct _token_t *); int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison // modified on filter evaluation at each VCF line - double *values; // In case str_value is set, values[0] is one sample's string length - char *str_value; // and values[0]*nsamples gives the total length; + double *values; + kstring_t str_value; int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples int nsamples; // number of samples - int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars - // for strings, total length of str_value + int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l + int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1 } token_t; @@ -93,6 +95,7 @@ struct _filter_t token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack int32_t *tmpi; float *tmpf; + kstring_t tmps; int max_unpack, mtmpi, mtmpf, nsamples; }; @@ -169,6 +172,7 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } + int square_brackets = 0; while ( tmp[0] ) { if ( tmp[0]=='"' ) break; @@ -183,11 +187,12 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='(' ) break; if ( tmp[0]==')' ) break; if ( tmp[0]=='+' ) break; - // hacky: so that [*] is not split, the tokenizer does not recognise square brackets [] - if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break; - if ( tmp[0]=='-' ) break; + if ( tmp[0]=='*' && !square_brackets ) break; + if ( tmp[0]=='-' && !square_brackets ) break; if ( tmp[0]=='/' ) break; if ( tmp[0]=='~' ) break; + if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; } + if ( tmp[0]=='[' ) square_brackets++; tmp++; } if ( tmp > *str ) @@ -270,12 +275,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) else if ( line->d.info[i].type==BCF_BT_CHAR ) { int n = line->d.info[i].len; 
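The `token_t` changes above generalise the single `idx` subscript to an `idxs` inclusion list, so that a filtering expression can select several fields of a vector tag at once (a comma list, a range, an open-ended range, or `[*]`); the list is built by `parse_tag_idx()` further below, with each element set to 0 to exclude, 1 to include, and a negative last element marking an open-ended range. The fragment below is only a sketch of how such a list is applied to a vector of n values, mirroring the selection loop used in the patched setters; `apply_idxs()` and the hard-coded example arrays are invented for this illustration::

    #include <stdio.h>

    /* Apply an idxs[] inclusion list (1 = keep, 0 = skip, negative last
     * element = open-ended range) to n values, the way the patched
     * filters_set_* setters do.  Invented helper, for illustration only. */
    static int apply_idxs(const int *idxs, int nidxs, const double *val, int n,
                          double *out)
    {
        int i, j = 0;
        int end = idxs[nidxs-1] < 0 ? n - 1 : nidxs - 1;    /* open-ended? */
        if ( end >= n ) end = n - 1;
        for (i = 0; i <= end; i++)
            if ( i >= nidxs || idxs[i] ) out[j++] = val[i];
        return j;                                           /* number kept */
    }

    int main(void)
    {
        double val[] = {10,20,30,40,50}, out[5];
        int tag_0_2[]  = {1,0,1};       /* e.g. TAG[0,2]: keep fields 0 and 2 */
        int tag_1_on[] = {0,-1};        /* open-ended:    keep fields 1,2,... */
        int n = apply_idxs(tag_0_2, 3, val, 5, out);
        printf("kept %d, first=%g\n", n, out[0]);           /* kept 2, first=10 */
        n = apply_idxs(tag_1_on, 2, val, 5, out);
        printf("kept %d, first=%g\n", n, out[0]);           /* kept 4, first=20 */
        return 0;
    }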
- int m = (int)tok->values[0]; - hts_expand(char,n+1,m,tok->str_value); - memcpy(tok->str_value,line->d.info[i].vptr,n); - tok->str_value[n] = 0; - tok->values[0] = m; - tok->nvalues = n; + if ( n >= tok->str_value.m ) + { + tok->str_value.m = n + 1; + tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m); + if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m); + } + memcpy(tok->str_value.s, line->d.info[i].vptr, n); + tok->str_value.s[n] = 0; + tok->nvalues = tok->str_value.l = n; } else if ( line->d.info[i].type==BCF_BT_FLOAT ) { @@ -285,10 +293,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[0] = line->d.info[i].v1.f; tok->nvalues = 1; } - tok->str_value = NULL; + tok->str_value.l = 0; } else { + tok->str_value.l = 0; if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0; else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0; else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0; @@ -297,7 +306,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[0] = line->d.info[i].v1.i; tok->nvalues = 1; } - tok->str_value = NULL; } } static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) @@ -346,8 +354,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin return ret ? 0 : 1; } - if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1; - return strcmp(btok->str_value,line->d.id) ? 1 : 0; + if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1; + return strcmp(btok->str_value.s,line->d.id) ? 1 : 0; } /** @@ -409,13 +417,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i; tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); if ( tok->nvalues<=0 ) tok->nvalues = 0; else { hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) tok->values[i] = flt->tmpi[i]; + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1; + if ( end >= tok->nvalues ) end = tok->nvalues - 1; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i]; + tok->nvalues = j; } } else @@ -435,15 +446,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i; tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf); if ( tok->nvalues<=0 ) tok->nvalues = 0; else { hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) - if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]); - else tok->values[i] = flt->tmpf[i]; + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? 
tok->nvalues - 1 : tok->nidxs - 1; + if ( end >= tok->nvalues ) end = tok->nvalues - 1; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) + { + if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]); + else tok->values[j] = flt->tmpf[i]; + j++; + } + tok->nvalues = j; } } else @@ -461,33 +478,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) { - int m = (int)tok->values[0]; - int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m); - if ( n<0 ) { tok->nvalues = 0; return; } - tok->values[0] = m; // allocated length + int32_t m = tok->str_value.m; + int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m); + tok->str_value.m = m; + if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; } if ( tok->idx>=0 ) { // get ith field (i=tok->idx) int i = 0; - char *ss = tok->str_value, *se = tok->str_value + n; + char *ss = tok->str_value.s, *se = tok->str_value.s + n; while ( ssidx ) { if ( *ss==',' ) i++; ss++; } - if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; } + if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; } se = ss; - while ( se-tok->str_valuestr_value ) *se = 0; + while ( se - tok->str_value.s < n && *se!=',' ) se++; + if ( ss==tok->str_value.s ) *se = 0; else { - memmove(tok->str_value,ss,se-ss); - tok->str_value[se-ss] = 0; + memmove(tok->str_value.s, ss, se-ss); + tok->str_value.s[se-ss] = 0; } - tok->nvalues = se-ss; + tok->str_value.l = se - ss; } - else if ( tok->idx==-2 ) tok->nvalues = n; + else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*] + tok->str_value.l = n; + else if ( tok->idx==-2 ) + { + flt->tmps.l = 0; + ks_resize(&flt->tmps, n); + int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; + if ( end >= n ) end = n - 1; + char *beg = tok->str_value.s, *dst = flt->tmps.s; + for (i=0; i<=end; i++) + { + char *end = beg; + while ( *end && *end!=',' ) end++; + + if ( i>=tok->nidxs || tok->idxs[i] ) + { + memcpy(dst, beg, end - beg); + dst += end - beg; + dst[0] = ','; + dst++; + } + + beg = end+1; + } + dst[0] = 0; + tok->str_value.l = dst - flt->tmps.s; + + #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } + SWAP(char *, flt->tmps.s, tok->str_value.s); + SWAP(size_t, flt->tmps.m, tok->str_value.m); + } + tok->nvalues = tok->str_value.l; } static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok) @@ -503,127 +551,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) { int i; if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) - tok->nvalues = 0; - else { + tok->nvalues = tok->nsamples = 0; + return; + } + if ( tok->idx >= -1 ) // scalar or vector index + { + hts_expand(double,flt->nsamples,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? 
tok->idx : 0; int is_missing = 1; - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) + int32_t *ptr = flt->tmpi; + for (i=0; in_sample; i++) { - if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end ) + if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) bcf_double_set_missing(tok->values[i]); else { - tok->values[i] = flt->tmpi[i]; + tok->values[i] = ptr[idx]; is_missing = 0; } + ptr += nvals; } if ( is_missing ) tok->nvalues = 0; - else if ( tok->idx >= 0 ) + else tok->nvalues = line->n_sample; + tok->nsamples = tok->nvalues; + return; + } + if ( tok->idx == -2 ) + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; + int is_missing = 1; + int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; + if ( end >= nvals ) end = nvals - 1; + int32_t *ptr = flt->tmpi; + for (i=0; in_sample; i++) { - int nsmpl = bcf_hdr_nsamples(flt->hdr); - int nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nvalues = 0; // the index is too big - else - { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nvalues = nsmpl; - } + for (k=0; k<=end; k++) + if ( k>=tok->nidxs || tok->idxs[k] ) + { + if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end ) + bcf_double_set_missing(tok->values[j]); + else + { + tok->values[j] = ptr[k]; + is_missing = 0; + } + j++; + } + ptr += nvals; + } + if ( is_missing ) tok->nvalues = tok->nsamples = 0; + else + { + tok->nsamples = line->n_sample; + tok->nvalues = j; } + return; } - tok->nsamples = tok->nvalues; } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { int i; - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 ) + if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) { - tok->nvalues = tok->nsamples = 0; // missing values + tok->nvalues = tok->nsamples = 0; + return; } - else + if ( tok->idx >= -1 ) // scalar or vector index { + hts_expand(double,flt->nsamples,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; int is_missing = 1; - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) + float *ptr = flt->tmpf; + for (i=0; in_sample; i++) { - if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) ) + if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) ) bcf_double_set_missing(tok->values[i]); else { - tok->values[i] = flt->tmpf[i]; + tok->values[i] = ptr[idx]; is_missing = 0; } + ptr += nvals; } if ( is_missing ) tok->nvalues = 0; - else if ( tok->idx >= 0 ) + else tok->nvalues = line->n_sample; + tok->nsamples = tok->nvalues; + return; + } + if ( tok->idx == -2 ) + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; + int is_missing = 1; + int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? 
nvals - 1 : tok->nidxs - 1; + if ( end >= nvals ) end = nvals - 1; + float *ptr = flt->tmpf; + for (i=0; in_sample; i++) { - int nsmpl = bcf_hdr_nsamples(flt->hdr); - int nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nvalues = 0; // the index is too big - else - { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nvalues = nsmpl; - } + for (k=0; k<=end; k++) + if ( k>=tok->nidxs || tok->idxs[k] ) + { + if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) ) + bcf_double_set_missing(tok->values[j]); + else + { + tok->values[j] = ptr[k]; + is_missing = 0; + } + j++; + } + ptr += nvals; + } + if ( is_missing ) tok->nvalues = tok->nsamples = 0; + else + { + tok->nsamples = line->n_sample; + tok->nvalues = j; } + return; } - tok->nsamples = tok->nvalues; } static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) { - int ndim = tok->nsamples * (int)tok->values[0]; - int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim); + tok->str_value.l = tok->nvalues = 0; + if ( !line->n_sample ) return; - int nsmpl = bcf_hdr_nsamples(flt->hdr); - ndim /= nsmpl; - tok->values[0] = ndim; + int ndim = tok->str_value.m; + int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); + tok->str_value.m = ndim; - if ( ret<=0 ) - { - tok->nvalues = 0; - return; - } + if ( nstr<=0 ) return; - if ( tok->idx < 0 ) // scalar + if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*] { - tok->nvalues = tok->nsamples = nsmpl; + tok->nsamples = line->n_sample; + tok->nstr1 = ndim / line->n_sample; + tok->nvalues = tok->str_value.l = nstr; return; } - // vector + int nstr1 = nstr / line->n_sample; + + // vector, one or multiple indices int i; - for (i=0; in_sample; i++) { - char *ss = tok->str_value + i*ndim; - int is = 0, ivec = 0; - while ( ivecidx && isidx || is==ndim || !ss[is] ) + char *dst = tok->str_value.s + i*nstr1, *str = dst; + int nval = 0, ibeg = 0; + while ( ibeg < nstr1 ) { - ss[0] = '.'; - ss[1] = 0; - continue; + int iend = ibeg + 1; + while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++; + + int keep = 0; + if ( tok->idx >=0 ) + keep = tok->idx==nval ? 1 : 0; + else if ( nval < tok->nidxs ) + keep = tok->idxs[nval] ? 1 : 0; + else if ( tok->idxs[tok->nidxs-1] < 0 ) + keep = 1; + + if ( keep ) + { + if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1); + dst += iend - ibeg + 1; + if ( tok->idx>=0 ) break; + } + if ( !str[iend] ) break; + ibeg = iend + 1; + nval++; } - int ie = is; - while ( ienvalues = tok->str_value.l = nstr; + tok->nstr1 = nstr1; + tok->nsamples = line->n_sample; +} +static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) +{ + bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); + if ( !fmt ) { - tok->nvalues = 0; + tok->nvalues = tok->str_value.l = 0; return; } - tok->nvalues = ret; + + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 
3 : 4; + if ( tok->str_value.m <= nvals*nsmpl ) + { + tok->str_value.m = nvals*nsmpl + 1; + tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m); + } + +#define BRANCH_INT(type_t,vector_end) \ + { \ + for (i=0; in_sample; i++) \ + { \ + type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \ + int is_het = 0, has_ref = 0, missing = 0; \ + for (j=0; jn; j++) \ + { \ + if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \ + if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \ + int ial = ptr[j]; \ + if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \ + if ( j>0 ) \ + { \ + int jal = ptr[j-1]; \ + if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \ + } \ + } \ + char *dst = &tok->str_value.s[nvals*i]; \ + if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ + else if ( type==3 ) \ + { \ + if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \ + else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \ + else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \ + } \ + else \ + { \ + if ( j==1 ) \ + { \ + if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \ + else dst[0]='a', dst[1]=0; /* a, haploid */ \ + } \ + else if ( !is_het ) \ + { \ + if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \ + else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \ + } \ + else \ + { \ + if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \ + else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \ + } \ + } \ + } \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; + default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; + } +#undef BRANCH_INT tok->nsamples = nsmpl; + tok->nvalues = tok->str_value.l = nvals*nsmpl; + tok->str_value.s[tok->str_value.l] = 0; + tok->nstr1 = nvals; } +static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } +static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } + static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); @@ -636,67 +823,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to kstring_t str; gt_length_too_big: - str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0; + tok->str_value.l = 0; for (i=0; istr_value.l; - bcf_format_gt(fmt, i, &str); - kputc_(0,&str); - if ( str.l - plen > blen ) + bcf_format_gt(fmt, i, &tok->str_value); + kputc_(0, &tok->str_value); + if ( tok->str_value.l - plen > blen ) { // too many alternate alleles or ploidy is too large, the genotype does not fit // three characters ("0/0" vs "10/10"). 
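The `_filters_set_genotype()` helper added above lets filtering expressions test genotype classes rather than literal allele strings: with `type==3` each sample is rendered as "hom", "het" or "hap", with `type==2` as "rr", "ra", "aa" or "aA" (or "r"/"a" for haploid calls), and missing genotypes as "."; as can be seen further below, `filter_init()` maps keywords such as HOM, het or ar in the expression onto the matching setter. The snippet below is a standalone sketch of that classification for alleles that have already been decoded from the raw FORMAT/GT encoding (the real code works on the BCF byte array via the BRANCH_INT macro); `classify_gt()` and the use of -1 for a missing allele are conventions invented for this illustration::

    #include <stdio.h>

    /* Sketch: classify a decoded genotype the way _filters_set_genotype()
     * does.  Alleles are plain indices (0 = REF, >0 = ALT, -1 = missing);
     * type 2 and 3 mirror the two setters in the patch. */
    static const char *classify_gt(const int *al, int ploidy, int type)
    {
        int j, is_het = 0, has_ref = 0, missing = 0;
        for (j = 0; j < ploidy; j++)
        {
            if ( al[j] < 0 ) { missing = 1; break; }        /* missing allele */
            if ( al[j] == 0 ) has_ref = 1;
            if ( j > 0 && al[j] != al[j-1] ) is_het = 1;
        }
        if ( !ploidy || missing ) return ".";
        if ( type == 3 )
            return ploidy == 1 ? "hap" : is_het ? "het" : "hom";
        if ( ploidy == 1 ) return has_ref ? "r" : "a";      /* haploid */
        if ( !is_het )     return has_ref ? "rr" : "aa";
        return has_ref ? "ra" : "aA";                       /* heterozygous */
    }

    int main(void)
    {
        int g01[] = {0,1}, g11[] = {1,1}, g12[] = {1,2}, g0[] = {0};
        printf("%s %s %s %s\n", classify_gt(g01,2,3), classify_gt(g11,2,2),
                                classify_gt(g12,2,2), classify_gt(g0,1,3));
        /* prints: het aa aA hap */
        return 0;
    }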
- tok->str_value = str.s; blen *= 2; goto gt_length_too_big; } - plen = str.l - plen; - while ( plenstr_value.l - plen; + while ( plen < blen ) { - kputc_(0, &str); + kputc_(0, &tok->str_value); plen++; } } - tok->nvalues = str.l; tok->nsamples = nsmpl; - tok->values[0] = blen; - tok->str_value = str.s; + tok->nvalues = tok->str_value.l; + tok->nstr1 = blen; } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) { - kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; - kputs(line->d.allele[0], &str); - tok->nvalues = str.l; - tok->values[0] = str.m; - tok->str_value = str.s; + tok->str_value.l = 0; + kputs(line->d.allele[0], &tok->str_value); + tok->nvalues = tok->str_value.l; } static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok) { - kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; + tok->str_value.l = 0; if ( tok->idx>=0 ) { - if ( line->n_allele >= tok->idx ) - kputs(line->d.allele[tok->idx], &str); + if ( line->n_allele > tok->idx + 1 ) + kputs(line->d.allele[tok->idx + 1], &tok->str_value); else - kputc('.', &str); + kputc('.', &tok->str_value); + tok->idx = 0; + } + else if ( tok->idx==-2 ) + { + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1; + if ( end >= line->n_allele - 1 ) end = line->n_allele - 2; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) + { + if ( tok->str_value.l ) kputc(',', &tok->str_value); + kputs(line->d.allele[i+1], &tok->str_value); + } } else if ( line->n_allele>1 ) { - kputs(line->d.allele[1], &str); + kputs(line->d.allele[1], &tok->str_value); int i; for (i=2; in_allele; i++) { - kputc(',', &str); - kputs(line->d.allele[i], &str); + kputc(',', &tok->str_value); + kputs(line->d.allele[i], &tok->str_value); } } else if ( line->n_allele==1 ) - kputc('.', &str); - tok->nvalues = str.l; - tok->values[0] = str.m; - tok->str_value = str.s; + kputc('.', &tok->str_value); + tok->nvalues = tok->str_value.l; } static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -857,11 +1050,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok) static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { tok->is_str = 0; - if ( !tok->nvalues ) return; + if ( !tok->str_value.l ) return; if ( tok->idx==-2 ) { int i = 0; - char *ss = tok->str_value; + char *ss = tok->str_value.s; while ( *ss ) { char *se = ss; @@ -881,9 +1074,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) } else { - tok->values[0] = strlen(tok->str_value); + tok->values[0] = strlen(tok->str_value.s); tok->nvalues = 1; } + tok->str_value.l = 0; } #define VECTOR_ARITHMETICS(atok,btok,AOP) \ { \ @@ -1077,7 +1271,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ else \ { \ - if ( (atok)->nsamples && (btok)->nsamples ) \ + if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \ + { \ + /* any field can match: [*] */ \ + for (i=0; i<(atok)->nvalues; i++) \ + { \ + for (j=0; j<(btok)->nvalues; j++) \ + if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ + } \ + } \ + else if ( (atok)->nsamples && (btok)->nsamples ) \ { \ for (i=0; i<(atok)->nsamples; i++) \ { \ @@ -1111,15 +1314,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) (atok)->nsamples = (btok)->nsamples; \ if ( !has_values ) (atok)->nvalues = 0; \ } \ - else 
if ( (atok)->idx==-2 || (btok)->idx==-2 ) \ - { \ - /* any field can match: [*] */ \ - for (i=0; i<(atok)->nvalues; i++) \ - { \ - for (j=0; j<(btok)->nvalues; j++) \ - if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ - } \ - } \ else \ { \ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ @@ -1130,18 +1324,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) } static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE { - if ( !atok->nvalues ) { return 0; } - if ( !btok->nvalues ) { atok->nvalues = 0; return 0; } + if ( !atok->str_value.l ) { return 0; } + if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; } int i, pass_site = 0; if ( atok->nsamples && atok->nsamples==btok->nsamples ) { for (i=0; insamples; i++) { - char *astr = atok->str_value + i*(int)atok->values[0]; - char *bstr = btok->str_value + i*(int)btok->values[0]; - char *aend = astr + (int)atok->values[0], *a = astr; + char *astr = atok->str_value.s + i*atok->nstr1; + char *bstr = btok->str_value.s + i*btok->nstr1; + char *aend = astr + atok->str_value.l, *a = astr; while ( avalues[0], *b = bstr; + char *bend = bstr + btok->str_value.l, *b = bstr; while ( bpass_samples[i] = 0; else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0; @@ -1161,8 +1355,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log token_t *xtok, *ytok; // xtok is scalar, ytok array if ( btok->idx==-2 ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues; - char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr; + char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l; + char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr; while ( y<=yend ) { if ( y==yend || *y==',' ) @@ -1178,7 +1372,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log } } else - pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1; + pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1; if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1; } else @@ -1186,19 +1380,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log token_t *xtok, *ytok; if ( !atok->nsamples ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value; - char *xend = xstr + (int)xtok->values[0], *x = xstr; + char *xstr = xtok->str_value.s; + char *xend = xstr + xtok->str_value.l, *x = xstr; while ( xnsamples; i++) { - char *ystr = ytok->str_value + i*(int)ytok->values[0]; - char *yend = ystr + (int)ytok->values[0], *y = ystr; - while ( ypass_samples[i] = 0; - else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0; - if ( logic!=TOK_EQ ) - atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; + char *ystr = ytok->str_value.s + i*ytok->nstr1; + char *ybeg = ystr, *yend = ystr + ytok->nstr1; + int pass = 0; + while ( ybeg < yend ) + { + char *y = ybeg; + while ( ypass_samples[i] = pass; + pass_site |= pass; } if ( !atok->nsamples ) atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? 
not sure if atok->nvalues should be set @@ -1212,18 +1413,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate) { for (i=0; insamples; i++) { - char *ptr = atok->str_value + i*(int)atok->values[0]; + char *ptr = atok->str_value.s + i*atok->nstr1; atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; pass_site |= atok->pass_samples[i]; } return pass_site; } - pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1; + pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1; if ( negate ) pass_site = pass_site ? 0 : 1; return pass_site; } +static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +{ + // TAG[*] .. any field + if ( !strncmp("*]", tag_idx, 3) ) + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + return; + } + + // TAG[integer] .. one field + char *end, *beg = tag_idx; + tok->idx = strtol(tag_idx, &end, 10); + if ( tok->idx >= 0 && *end==']' ) return; + + + // TAG[0,1] or TAG[0-2] or [1-] etc + int i, ibeg = -1; + while ( *beg && *beg!=']' ) + { + int idx = strtol(beg, &end, 10); + if ( end[0]==',' ) beg = end + 1; + else if ( end[0]==']' ) beg = end; + else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; } + else error("Could not parse the index: %s[%s\n", tag, tag_idx+1); + if ( idx >= tok->nidxs ) + { + tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1)); + memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1)); + tok->nidxs = idx + 1; + } + if ( ibeg>=0 ) + { + for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1; + ibeg = -1; + } + tok->idxs[idx] = 1; + } + if ( ibeg >=0 ) + { + if ( ibeg >= tok->nidxs ) + { + tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1)); + memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1)); + tok->nidxs = ibeg + 1; + } + tok->idxs[ibeg] = -1; + } + tok->idx = -2; +} static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; @@ -1361,17 +1614,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int i; for (i=0; iidx = -2; // tag[*] .. any field - else - { - char *end; - tok->idx = strtol(tmp.s+is_array, &end, 10); - if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array); - } - } + if ( is_array ) + parse_tag_idx(tmp.s, tmp.s+is_array, tok); } tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s); if ( is_fmt==-1 ) @@ -1425,7 +1669,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } - if(!is_array) tok->idx = -2; + if (!is_array) + { + tok->idx = -2; + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + } } } filter->max_unpack |= BCF_UN_INFO; @@ -1518,6 +1768,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) } } +static void str_to_lower(char *str) +{ + while ( *str ) { *str = tolower(*str); str++; } +} + // Parse filter expression and convert to reverse polish notation. 
Dijkstra's shunting-yard algorithm filter_t *filter_init(bcf_hdr_t *hdr, const char *str) @@ -1538,8 +1793,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); - //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len); - //int i; for (i=0; istr); + int ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1; + else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + + // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc + if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra + else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA + else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa + else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a + else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r + continue; + } if ( !strcmp(out[i].tag,"FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); @@ -1728,9 +2005,10 @@ void filter_destroy(filter_t *filter) int i; for (i=0; infilters; i++) { - //if ( filter->filters[i].key ) free(filter->filters[i].key); - free(filter->filters[i].str_value); + if ( filter->filters[i].key ) free(filter->filters[i].key); + free(filter->filters[i].str_value.s); free(filter->filters[i].tag); + free(filter->filters[i].idxs); free(filter->filters[i].values); free(filter->filters[i].pass_samples); if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash); @@ -1745,6 +2023,7 @@ void filter_destroy(filter_t *filter) free(filter->str); free(filter->tmpi); free(filter->tmpf); + free(filter->tmps.s); free(filter); } @@ -1765,16 +2044,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) filter->filters[i].setter(filter, line, &filter->filters[i]); else if ( filter->filters[i].key ) // string constant { - filter->filters[i].str_value = filter->filters[i].key; - filter->filters[i].values[0] = filter->filters[i].values[0]; - filter->filters[i].nvalues = strlen(filter->filters[i].key); + filter->filters[i].str_value.l = 0; + kputs(filter->filters[i].key, &filter->filters[i].str_value); + filter->filters[i].nvalues = filter->filters[i].str_value.l; } else // numeric constant { filter->filters[i].values[0] = 
filter->filters[i].threshold; filter->filters[i].nvalues = 1; } - filter->flt_stack[nstack++] = &filter->filters[i]; continue; } diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 25200c4..81f8734 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -69,21 +69,23 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types - int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*]) + int idx; // 0-based index to VCF vectors, -1: not a vector, + // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) + int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited void (*setter)(filter_t *, bcf1_t *, struct _token_t *); int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison // modified on filter evaluation at each VCF line - double *values; // In case str_value is set, values[0] is one sample's string length - char *str_value; // and values[0]*nsamples gives the total length; + double *values; + kstring_t str_value; int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples int nsamples; // number of samples - int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars - // for strings, total length of str_value + int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l + int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1 } token_t; @@ -95,6 +97,7 @@ struct _filter_t token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack int32_t *tmpi; float *tmpf; + kstring_t tmps; int max_unpack, mtmpi, mtmpf, nsamples; }; @@ -171,6 +174,7 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } + int square_brackets = 0; while ( tmp[0] ) { if ( tmp[0]=='"' ) break; @@ -185,11 +189,12 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='(' ) break; if ( tmp[0]==')' ) break; if ( tmp[0]=='+' ) break; - // hacky: so that [*] is not split, the tokenizer does not recognise square brackets [] - if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break; - if ( tmp[0]=='-' ) break; + if ( tmp[0]=='*' && !square_brackets ) break; + if ( tmp[0]=='-' && !square_brackets ) break; if ( tmp[0]=='/' ) break; if ( tmp[0]=='~' ) break; + if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; } + if ( tmp[0]=='[' ) square_brackets++; tmp++; } if ( tmp > *str ) @@ -272,12 +277,15 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) else if ( line->d.info[i].type==BCF_BT_CHAR ) { int n = line->d.info[i].len; - int m = (int)tok->values[0]; - hts_expand(char,n+1,m,tok->str_value); - memcpy(tok->str_value,line->d.info[i].vptr,n); - tok->str_value[n] = 0; - tok->values[0] = m; - tok->nvalues = n; + if ( n >= tok->str_value.m ) + { + tok->str_value.m = n + 1; + tok->str_value.s = (char*) realloc(tok->str_value.s, tok->str_value.m); + if ( !tok->str_value.s ) error("Failed to alloc %d bytes\n", (int)tok->str_value.m); + } + memcpy(tok->str_value.s, 
line->d.info[i].vptr, n); + tok->str_value.s[n] = 0; + tok->nvalues = tok->str_value.l = n; } else if ( line->d.info[i].type==BCF_BT_FLOAT ) { @@ -287,10 +295,11 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[0] = line->d.info[i].v1.f; tok->nvalues = 1; } - tok->str_value = NULL; + tok->str_value.l = 0; } else { + tok->str_value.l = 0; if ( line->d.info[i].type==BCF_BT_INT8 && line->d.info[i].v1.i==bcf_int8_missing ) tok->nvalues = 0; else if ( line->d.info[i].type==BCF_BT_INT16 && line->d.info[i].v1.i==bcf_int16_missing ) tok->nvalues = 0; else if ( line->d.info[i].type==BCF_BT_INT32 && line->d.info[i].v1.i==bcf_int32_missing ) tok->nvalues = 0; @@ -299,7 +308,6 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[0] = line->d.info[i].v1.i; tok->nvalues = 1; } - tok->str_value = NULL; } } static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) @@ -348,8 +356,8 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin return ret ? 0 : 1; } - if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1; - return strcmp(btok->str_value,line->d.id) ? 1 : 0; + if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1; + return strcmp(btok->str_value.s,line->d.id) ? 1 : 0; } /** @@ -411,13 +419,16 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i; tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); if ( tok->nvalues<=0 ) tok->nvalues = 0; else { hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) tok->values[i] = flt->tmpi[i]; + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? tok->nvalues - 1 : tok->nidxs - 1; + if ( end >= tok->nvalues ) end = tok->nvalues - 1; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) tok->values[j++] = flt->tmpi[i]; + tok->nvalues = j; } } else @@ -437,15 +448,21 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { - int i; tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf); if ( tok->nvalues<=0 ) tok->nvalues = 0; else { hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) - if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]); - else tok->values[i] = flt->tmpf[i]; + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? 
tok->nvalues - 1 : tok->nidxs - 1; + if ( end >= tok->nvalues ) end = tok->nvalues - 1; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) + { + if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[j]); + else tok->values[j] = flt->tmpf[i]; + j++; + } + tok->nvalues = j; } } else @@ -463,33 +480,64 @@ static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) { - int m = (int)tok->values[0]; - int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m); - if ( n<0 ) { tok->nvalues = 0; return; } - tok->values[0] = m; // allocated length + int32_t m = tok->str_value.m; + int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value.s,&m); + tok->str_value.m = m; + if ( n<0 ) { tok->nvalues = tok->str_value.l = 0; return; } if ( tok->idx>=0 ) { // get ith field (i=tok->idx) int i = 0; - char *ss = tok->str_value, *se = tok->str_value + n; + char *ss = tok->str_value.s, *se = tok->str_value.s + n; while ( ssidx ) { if ( *ss==',' ) i++; ss++; } - if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; } + if ( ss==se || i!=tok->idx ) { tok->nvalues = tok->str_value.l = 0; return; } se = ss; - while ( se-tok->str_valuestr_value ) *se = 0; + while ( se - tok->str_value.s < n && *se!=',' ) se++; + if ( ss==tok->str_value.s ) *se = 0; else { - memmove(tok->str_value,ss,se-ss); - tok->str_value[se-ss] = 0; + memmove(tok->str_value.s, ss, se-ss); + tok->str_value.s[se-ss] = 0; } - tok->nvalues = se-ss; + tok->str_value.l = se - ss; } - else if ( tok->idx==-2 ) tok->nvalues = n; + else if ( tok->idx==-2 && tok->idxs[0]==-1 ) // keep all values, TAG[*] + tok->str_value.l = n; + else if ( tok->idx==-2 ) + { + flt->tmps.l = 0; + ks_resize(&flt->tmps, n); + int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; + if ( end >= n ) end = n - 1; + char *beg = tok->str_value.s, *dst = flt->tmps.s; + for (i=0; i<=end; i++) + { + char *end = beg; + while ( *end && *end!=',' ) end++; + + if ( i>=tok->nidxs || tok->idxs[i] ) + { + memcpy(dst, beg, end - beg); + dst += end - beg; + dst[0] = ','; + dst++; + } + + beg = end+1; + } + dst[0] = 0; + tok->str_value.l = dst - flt->tmps.s; + + #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } + SWAP(char *, flt->tmps.s, tok->str_value.s); + SWAP(size_t, flt->tmps.m, tok->str_value.m); + } + tok->nvalues = tok->str_value.l; } static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok) @@ -505,127 +553,266 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) { int i; if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) - tok->nvalues = 0; - else { + tok->nvalues = tok->nsamples = 0; + return; + } + if ( tok->idx >= -1 ) // scalar or vector index + { + hts_expand(double,flt->nsamples,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? 
tok->idx : 0; int is_missing = 1; - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) + int32_t *ptr = flt->tmpi; + for (i=0; in_sample; i++) { - if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end ) + if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) bcf_double_set_missing(tok->values[i]); else { - tok->values[i] = flt->tmpi[i]; + tok->values[i] = ptr[idx]; is_missing = 0; } + ptr += nvals; } if ( is_missing ) tok->nvalues = 0; - else if ( tok->idx >= 0 ) + else tok->nvalues = line->n_sample; + tok->nsamples = tok->nvalues; + return; + } + if ( tok->idx == -2 ) + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; + int is_missing = 1; + int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; + if ( end >= nvals ) end = nvals - 1; + int32_t *ptr = flt->tmpi; + for (i=0; in_sample; i++) { - int nsmpl = bcf_hdr_nsamples(flt->hdr); - int nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nvalues = 0; // the index is too big - else - { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nvalues = nsmpl; - } + for (k=0; k<=end; k++) + if ( k>=tok->nidxs || tok->idxs[k] ) + { + if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end ) + bcf_double_set_missing(tok->values[j]); + else + { + tok->values[j] = ptr[k]; + is_missing = 0; + } + j++; + } + ptr += nvals; + } + if ( is_missing ) tok->nvalues = tok->nsamples = 0; + else + { + tok->nsamples = line->n_sample; + tok->nvalues = j; } + return; } - tok->nsamples = tok->nvalues; } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { int i; - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 ) + if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) { - tok->nvalues = tok->nsamples = 0; // missing values + tok->nvalues = tok->nsamples = 0; + return; } - else + if ( tok->idx >= -1 ) // scalar or vector index { + hts_expand(double,flt->nsamples,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; int is_missing = 1; - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - for (i=0; invalues; i++) + float *ptr = flt->tmpf; + for (i=0; in_sample; i++) { - if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) ) + if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) ) bcf_double_set_missing(tok->values[i]); else { - tok->values[i] = flt->tmpf[i]; + tok->values[i] = ptr[idx]; is_missing = 0; } + ptr += nvals; } if ( is_missing ) tok->nvalues = 0; - else if ( tok->idx >= 0 ) + else tok->nvalues = line->n_sample; + tok->nsamples = tok->nvalues; + return; + } + if ( tok->idx == -2 ) + { + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int nvals = tok->nvalues / line->n_sample; + int idx = tok->idx >= 0 ? tok->idx : 0; + int is_missing = 1; + int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? 
nvals - 1 : tok->nidxs - 1; + if ( end >= nvals ) end = nvals - 1; + float *ptr = flt->tmpf; + for (i=0; in_sample; i++) { - int nsmpl = bcf_hdr_nsamples(flt->hdr); - int nvals = tok->nvalues / nsmpl; - if ( tok->idx >= nvals ) - tok->nvalues = 0; // the index is too big - else - { - for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; - tok->nvalues = nsmpl; - } + for (k=0; k<=end; k++) + if ( k>=tok->nidxs || tok->idxs[k] ) + { + if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) ) + bcf_double_set_missing(tok->values[j]); + else + { + tok->values[j] = ptr[k]; + is_missing = 0; + } + j++; + } + ptr += nvals; + } + if ( is_missing ) tok->nvalues = tok->nsamples = 0; + else + { + tok->nsamples = line->n_sample; + tok->nvalues = j; } + return; } - tok->nsamples = tok->nvalues; } static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) { - int ndim = tok->nsamples * (int)tok->values[0]; - int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim); + tok->str_value.l = tok->nvalues = 0; + if ( !line->n_sample ) return; - int nsmpl = bcf_hdr_nsamples(flt->hdr); - ndim /= nsmpl; - tok->values[0] = ndim; + int ndim = tok->str_value.m; + int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); + tok->str_value.m = ndim; - if ( ret<=0 ) - { - tok->nvalues = 0; - return; - } + if ( nstr<=0 ) return; - if ( tok->idx < 0 ) // scalar + if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*] { - tok->nvalues = tok->nsamples = nsmpl; + tok->nsamples = line->n_sample; + tok->nstr1 = ndim / line->n_sample; + tok->nvalues = tok->str_value.l = nstr; return; } - // vector + int nstr1 = nstr / line->n_sample; + + // vector, one or multiple indices int i; - for (i=0; in_sample; i++) { - char *ss = tok->str_value + i*ndim; - int is = 0, ivec = 0; - while ( ivecidx && isidx || is==ndim || !ss[is] ) + char *dst = tok->str_value.s + i*nstr1, *str = dst; + int nval = 0, ibeg = 0; + while ( ibeg < nstr1 ) { - ss[0] = '.'; - ss[1] = 0; - continue; + int iend = ibeg + 1; + while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++; + + int keep = 0; + if ( tok->idx >=0 ) + keep = tok->idx==nval ? 1 : 0; + else if ( nval < tok->nidxs ) + keep = tok->idxs[nval] ? 1 : 0; + else if ( tok->idxs[tok->nidxs-1] < 0 ) + keep = 1; + + if ( keep ) + { + if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1); + dst += iend - ibeg + 1; + if ( tok->idx>=0 ) break; + } + if ( !str[iend] ) break; + ibeg = iend + 1; + nval++; } - int ie = is; - while ( ienvalues = tok->str_value.l = nstr; + tok->nstr1 = nstr1; + tok->nsamples = line->n_sample; +} +static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) +{ + bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); + if ( !fmt ) { - tok->nvalues = 0; + tok->nvalues = tok->str_value.l = 0; return; } - tok->nvalues = ret; + + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 
3 : 4; + if ( tok->str_value.m <= nvals*nsmpl ) + { + tok->str_value.m = nvals*nsmpl + 1; + tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m); + } + +#define BRANCH_INT(type_t,vector_end) \ + { \ + for (i=0; in_sample; i++) \ + { \ + type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \ + int is_het = 0, has_ref = 0, missing = 0; \ + for (j=0; jn; j++) \ + { \ + if ( ptr[j]==vector_end ) break; /* smaller ploidy */ \ + if ( bcf_gt_is_missing(ptr[j]) ) { missing=1; break; } /* missing allele */ \ + int ial = ptr[j]; \ + if ( bcf_gt_allele(ial)==0 ) has_ref = 1; \ + if ( j>0 ) \ + { \ + int jal = ptr[j-1]; \ + if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \ + } \ + } \ + char *dst = &tok->str_value.s[nvals*i]; \ + if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ + else if ( type==3 ) \ + { \ + if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \ + else if ( !is_het ) dst[0]='h', dst[1]='o', dst[2]='m', dst[3] = 0; /* hom */ \ + else dst[0]='h', dst[1]='e', dst[2]='t', dst[3] = 0; /* het */ \ + } \ + else \ + { \ + if ( j==1 ) \ + { \ + if ( has_ref ) dst[0]='r', dst[1]=0; /* r, haploid */ \ + else dst[0]='a', dst[1]=0; /* a, haploid */ \ + } \ + else if ( !is_het ) \ + { \ + if ( has_ref ) dst[0]='r', dst[1]='r', dst[2] = 0; /* rr */ \ + else dst[0]='a', dst[1]='a', dst[2] = 0; /* aa */ \ + } \ + else \ + { \ + if ( has_ref ) dst[0]='r', dst[1]='a', dst[2] = 0; /* ra */ \ + else dst[0]='a', dst[1]='A', dst[2] = 0; /* aA */ \ + } \ + } \ + } \ + } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; + default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; + } +#undef BRANCH_INT tok->nsamples = nsmpl; + tok->nvalues = tok->str_value.l = nvals*nsmpl; + tok->str_value.s[tok->str_value.l] = 0; + tok->nstr1 = nvals; } +static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } +static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } + static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); @@ -638,67 +825,73 @@ static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *to kstring_t str; gt_length_too_big: - str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0; + tok->str_value.l = 0; for (i=0; istr_value.l; - bcf_format_gt(fmt, i, &str); - kputc_(0,&str); - if ( str.l - plen > blen ) + bcf_format_gt(fmt, i, &tok->str_value); + kputc_(0, &tok->str_value); + if ( tok->str_value.l - plen > blen ) { // too many alternate alleles or ploidy is too large, the genotype does not fit // three characters ("0/0" vs "10/10"). 
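                // How the retry below works: every sample is given a fixed-width,
                // NUL-padded slot of blen bytes inside tok->str_value.s.  When the
                // genotype just formatted does not fit the current slot width
                // (e.g. "10/10" vs "0/0"), blen is doubled and the per-sample loop
                // is restarted from the gt_length_too_big label; the final slot
                // width is stored in tok->nstr1, so sample i can later be read at
                // offset i*tok->nstr1.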
- tok->str_value = str.s; blen *= 2; goto gt_length_too_big; } - plen = str.l - plen; - while ( plenstr_value.l - plen; + while ( plen < blen ) { - kputc_(0, &str); + kputc_(0, &tok->str_value); plen++; } } - tok->nvalues = str.l; tok->nsamples = nsmpl; - tok->values[0] = blen; - tok->str_value = str.s; + tok->nvalues = tok->str_value.l; + tok->nstr1 = blen; } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) { - kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; - kputs(line->d.allele[0], &str); - tok->nvalues = str.l; - tok->values[0] = str.m; - tok->str_value = str.s; + tok->str_value.l = 0; + kputs(line->d.allele[0], &tok->str_value); + tok->nvalues = tok->str_value.l; } static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok) { - kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; + tok->str_value.l = 0; if ( tok->idx>=0 ) { - if ( line->n_allele >= tok->idx ) - kputs(line->d.allele[tok->idx], &str); + if ( line->n_allele > tok->idx + 1 ) + kputs(line->d.allele[tok->idx + 1], &tok->str_value); else - kputc('.', &str); + kputc('.', &tok->str_value); + tok->idx = 0; + } + else if ( tok->idx==-2 ) + { + int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1; + if ( end >= line->n_allele - 1 ) end = line->n_allele - 2; + for (i=0; i<=end; i++) + if ( i>=tok->nidxs || tok->idxs[i] ) + { + if ( tok->str_value.l ) kputc(',', &tok->str_value); + kputs(line->d.allele[i+1], &tok->str_value); + } } else if ( line->n_allele>1 ) { - kputs(line->d.allele[1], &str); + kputs(line->d.allele[1], &tok->str_value); int i; for (i=2; in_allele; i++) { - kputc(',', &str); - kputs(line->d.allele[i], &str); + kputc(',', &tok->str_value); + kputs(line->d.allele[i], &tok->str_value); } } else if ( line->n_allele==1 ) - kputc('.', &str); - tok->nvalues = str.l; - tok->values[0] = str.m; - tok->str_value = str.s; + kputc('.', &tok->str_value); + tok->nvalues = tok->str_value.l; } static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -859,11 +1052,11 @@ static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok) static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { tok->is_str = 0; - if ( !tok->nvalues ) return; + if ( !tok->str_value.l ) return; if ( tok->idx==-2 ) { int i = 0; - char *ss = tok->str_value; + char *ss = tok->str_value.s; while ( *ss ) { char *se = ss; @@ -883,9 +1076,10 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) } else { - tok->values[0] = strlen(tok->str_value); + tok->values[0] = strlen(tok->str_value.s); tok->nvalues = 1; } + tok->str_value.l = 0; } #define VECTOR_ARITHMETICS(atok,btok,AOP) \ { \ @@ -1079,7 +1273,16 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ else \ { \ - if ( (atok)->nsamples && (btok)->nsamples ) \ + if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \ + { \ + /* any field can match: [*] */ \ + for (i=0; i<(atok)->nvalues; i++) \ + { \ + for (j=0; j<(btok)->nvalues; j++) \ + if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ + } \ + } \ + else if ( (atok)->nsamples && (btok)->nsamples ) \ { \ for (i=0; i<(atok)->nsamples; i++) \ { \ @@ -1113,15 +1316,6 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) (atok)->nsamples = (btok)->nsamples; \ if ( !has_values ) (atok)->nvalues = 0; \ } \ - else 
if ( (atok)->idx==-2 || (btok)->idx==-2 ) \ - { \ - /* any field can match: [*] */ \ - for (i=0; i<(atok)->nvalues; i++) \ - { \ - for (j=0; j<(btok)->nvalues; j++) \ - if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ - } \ - } \ else \ { \ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ @@ -1132,18 +1326,18 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) } static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE { - if ( !atok->nvalues ) { return 0; } - if ( !btok->nvalues ) { atok->nvalues = 0; return 0; } + if ( !atok->str_value.l ) { return 0; } + if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; } int i, pass_site = 0; if ( atok->nsamples && atok->nsamples==btok->nsamples ) { for (i=0; insamples; i++) { - char *astr = atok->str_value + i*(int)atok->values[0]; - char *bstr = btok->str_value + i*(int)btok->values[0]; - char *aend = astr + (int)atok->values[0], *a = astr; + char *astr = atok->str_value.s + i*atok->nstr1; + char *bstr = btok->str_value.s + i*btok->nstr1; + char *aend = astr + atok->str_value.l, *a = astr; while ( avalues[0], *b = bstr; + char *bend = bstr + btok->str_value.l, *b = bstr; while ( bpass_samples[i] = 0; else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0; @@ -1163,8 +1357,8 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log token_t *xtok, *ytok; // xtok is scalar, ytok array if ( btok->idx==-2 ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues; - char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr; + char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l; + char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr; while ( y<=yend ) { if ( y==yend || *y==',' ) @@ -1180,7 +1374,7 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log } } else - pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1; + pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1; if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1; } else @@ -1188,19 +1382,26 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log token_t *xtok, *ytok; if ( !atok->nsamples ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value; - char *xend = xstr + (int)xtok->values[0], *x = xstr; + char *xstr = xtok->str_value.s; + char *xend = xstr + xtok->str_value.l, *x = xstr; while ( xnsamples; i++) { - char *ystr = ytok->str_value + i*(int)ytok->values[0]; - char *yend = ystr + (int)ytok->values[0], *y = ystr; - while ( ypass_samples[i] = 0; - else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0; - if ( logic!=TOK_EQ ) - atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; + char *ystr = ytok->str_value.s + i*ytok->nstr1; + char *ybeg = ystr, *yend = ystr + ytok->nstr1; + int pass = 0; + while ( ybeg < yend ) + { + char *y = ybeg; + while ( ypass_samples[i] = pass; + pass_site |= pass; } if ( !atok->nsamples ) atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? 
not sure if atok->nvalues should be set @@ -1214,18 +1415,70 @@ static int regex_vector_strings(token_t *atok, token_t *btok, int negate) { for (i=0; insamples; i++) { - char *ptr = atok->str_value + i*(int)atok->values[0]; + char *ptr = atok->str_value.s + i*atok->nstr1; atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; pass_site |= atok->pass_samples[i]; } return pass_site; } - pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1; + pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1; if ( negate ) pass_site = pass_site ? 0 : 1; return pass_site; } +static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +{ + // TAG[*] .. any field + if ( !strncmp("*]", tag_idx, 3) ) + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + return; + } + + // TAG[integer] .. one field + char *end, *beg = tag_idx; + tok->idx = strtol(tag_idx, &end, 10); + if ( tok->idx >= 0 && *end==']' ) return; + + + // TAG[0,1] or TAG[0-2] or [1-] etc + int i, ibeg = -1; + while ( *beg && *beg!=']' ) + { + int idx = strtol(beg, &end, 10); + if ( end[0]==',' ) beg = end + 1; + else if ( end[0]==']' ) beg = end; + else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; } + else error("Could not parse the index: %s[%s\n", tag, tag_idx+1); + if ( idx >= tok->nidxs ) + { + tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1)); + memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1)); + tok->nidxs = idx + 1; + } + if ( ibeg>=0 ) + { + for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1; + ibeg = -1; + } + tok->idxs[idx] = 1; + } + if ( ibeg >=0 ) + { + if ( ibeg >= tok->nidxs ) + { + tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1)); + memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1)); + tok->nidxs = ibeg + 1; + } + tok->idxs[ibeg] = -1; + } + tok->idx = -2; +} static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; @@ -1363,17 +1616,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int i; for (i=0; iidx = -2; // tag[*] .. any field - else - { - char *end; - tok->idx = strtol(tmp.s+is_array, &end, 10); - if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array); - } - } + if ( is_array ) + parse_tag_idx(tmp.s, tmp.s+is_array, tok); } tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s); if ( is_fmt==-1 ) @@ -1427,7 +1671,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } - if(!is_array) tok->idx = -2; + if (!is_array) + { + tok->idx = -2; + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + } } } filter->max_unpack |= BCF_UN_INFO; @@ -1520,6 +1770,11 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) } } +static void str_to_lower(char *str) +{ + while ( *str ) { *str = tolower(*str); str++; } +} + // Parse filter expression and convert to reverse polish notation. 
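/*
 * Worked examples of the index encoding produced by parse_tag_idx() above,
 * assuming tok->idx, tok->idxs and tok->nidxs start out zeroed:
 *
 *   TAG[1]    ->  idx = 1, no idxs array (single-field fast path)
 *   TAG[*]    ->  idx = -2, idxs = {-1},     nidxs = 1
 *   TAG[0-2]  ->  idx = -2, idxs = {1,1,1},  nidxs = 3
 *   TAG[0,2]  ->  idx = -2, idxs = {1,0,1},  nidxs = 3
 *   TAG[1-]   ->  idx = -2, idxs = {0,-1},   nidxs = 2
 *
 * The setters keep field k when idxs[k] is non-zero; a negative last entry
 * widens the scan to the whole vector, i.e. "from this index to the end",
 * which is also how TAG[*] selects every field.
 */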
Dijkstra's shunting-yard algorithm filter_t *filter_init(bcf_hdr_t *hdr, const char *str) @@ -1540,8 +1795,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); - //fprintf(pysam_stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len); - //int i; for (i=0; istr); + int ival; + if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1; + else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1; + else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = i + 1; + else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1; + else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + + // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc + if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra + else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA + else if ( !strcasecmp(out[ival].key,"aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='a'; } // aa + else if ( !strcasecmp(out[ival].key,"a") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]=0; } // a + else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r + continue; + } if ( !strcmp(out[i].tag,"FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); @@ -1730,9 +2007,10 @@ void filter_destroy(filter_t *filter) int i; for (i=0; infilters; i++) { - //if ( filter->filters[i].key ) free(filter->filters[i].key); - free(filter->filters[i].str_value); + if ( filter->filters[i].key ) free(filter->filters[i].key); + free(filter->filters[i].str_value.s); free(filter->filters[i].tag); + free(filter->filters[i].idxs); free(filter->filters[i].values); free(filter->filters[i].pass_samples); if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash); @@ -1747,6 +2025,7 @@ void filter_destroy(filter_t *filter) free(filter->str); free(filter->tmpi); free(filter->tmpf); + free(filter->tmps.s); free(filter); } @@ -1767,16 +2046,15 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) filter->filters[i].setter(filter, line, &filter->filters[i]); else if ( filter->filters[i].key ) // string constant { - filter->filters[i].str_value = filter->filters[i].key; - filter->filters[i].values[0] = filter->filters[i].values[0]; - filter->filters[i].nvalues = strlen(filter->filters[i].key); + filter->filters[i].str_value.l = 0; + kputs(filter->filters[i].key, &filter->filters[i].str_value); + filter->filters[i].nvalues = filter->filters[i].str_value.l; } else // numeric constant { filter->filters[i].values[0] = 
filter->filters[i].threshold; filter->filters[i].nvalues = 1; } - filter->flt_stack[nstack++] = &filter->filters[i]; continue; } diff --git a/bcftools/kheap.h b/bcftools/kheap.h index ac2f9f9..cb5dda4 100644 --- a/bcftools/kheap.h +++ b/bcftools/kheap.h @@ -57,6 +57,8 @@ // "data_t". heap_t *heap = khp_init(mh); + // When inserting a new element, the heap stores a copy of the memory + // area pointed to by the third argument. for (int i=0; i<3; i++) khp_insert(mh, heap, &data[i]); @@ -130,7 +132,8 @@ { \ heap->mdat = heap->ndat; \ kroundup32(heap->mdat); \ - heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \ + heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \ + memset(heap->dat + heap->ndat, 0, (heap->mdat - heap->ndat)*sizeof(kheap_t)); \ } \ int i = heap->ndat - 1; \ while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \ diff --git a/bcftools/main.c b/bcftools/main.c index 4e3e0e5..03fa6a7 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -57,6 +57,7 @@ int main_plugin(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); +int main_sort(int argc, char *argv[]); typedef struct { @@ -126,6 +127,10 @@ static cmd_t cmds[] = .alias = "reheader", .help = "modify VCF/BCF header, change sample names" }, + { .func = main_sort, + .alias = "sort", + .help = "sort VCF/BCF file" + }, { .func = main_vcfview, .alias = "view", .help = "VCF/BCF conversion, view, subset and filter VCF/BCF files" diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index f148252..9d81ba1 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -59,6 +59,7 @@ int main_plugin(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); +int main_sort(int argc, char *argv[]); typedef struct { @@ -128,6 +129,10 @@ static cmd_t cmds[] = .alias = "reheader", .help = "modify VCF/BCF header, change sample names" }, + { .func = main_sort, + .alias = "sort", + .help = "sort VCF/BCF file" + }, { .func = main_vcfview, .alias = "view", .help = "VCF/BCF conversion, view, subset and filter VCF/BCF files" diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index ac37dd4..9b6c6eb 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -909,7 +909,7 @@ int bam_mpileup(int argc, char *argv[]) {"ignore-RG", no_argument, NULL, 5}, {"ignore-rg", no_argument, NULL, 5}, {"gvcf", required_argument, NULL, 'g'}, - {"non-reference", no_argument, NULL, 7}, + {"no-reference", no_argument, NULL, 7}, {"no-version", no_argument, NULL, 8}, {"threads",required_argument,NULL,9}, {"illumina1.3+", no_argument, NULL, '6'}, @@ -1099,11 +1099,8 @@ int bam_mpileup(int argc, char *argv[]) free(mplp.files); free(mplp.reg_fname); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); - if (mplp.bed) - { - regidx_destroy(mplp.bed); - regitr_destroy(mplp.bed_itr); - } + if (mplp.bed) regidx_destroy(mplp.bed); + if (mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); return ret; diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 6ef6838..94286e9 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -911,7 +911,7 @@ int bam_mpileup(int argc, char *argv[]) {"ignore-RG", no_argument, NULL, 5}, {"ignore-rg", no_argument, NULL, 5}, {"gvcf", required_argument, NULL, 'g'}, - {"non-reference", no_argument, NULL, 7}, + 
{"no-reference", no_argument, NULL, 7}, {"no-version", no_argument, NULL, 8}, {"threads",required_argument,NULL,9}, {"illumina1.3+", no_argument, NULL, '6'}, @@ -1101,11 +1101,8 @@ int bam_mpileup(int argc, char *argv[]) free(mplp.files); free(mplp.reg_fname); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); - if (mplp.bed) - { - regidx_destroy(mplp.bed); - regitr_destroy(mplp.bed_itr); - } + if (mplp.bed) regidx_destroy(mplp.bed); + if (mplp.bed_itr) regitr_destroy(mplp.bed_itr); if (mplp.reg) regidx_destroy(mplp.reg); bam_smpl_destroy(mplp.bsmpl); return ret; diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index ffe71c4..11c55bd 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -212,8 +212,14 @@ static double *init_iprobs(int ndim, double same_prob) static void init_sample_files(sample_t *smpl, char *dir) { smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name); + if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name); + smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name); + if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name); + smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name); + if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name); + fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n"); fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n"); fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n"); diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index 1075ef1..86ba48f 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -214,8 +214,14 @@ static double *init_iprobs(int ndim, double same_prob) static void init_sample_files(sample_t *smpl, char *dir) { smpl->dat_fh = open_file(&smpl->dat_fname,"w","%s/dat.%s.tab",dir,smpl->name); + if ( !smpl->dat_fh ) error("Error opening file: %s/dat.%s.tab\n",dir,smpl->name); + smpl->cn_fh = open_file(&smpl->cn_fname,"w","%s/cn.%s.tab",dir,smpl->name); + if ( !smpl->cn_fh ) error("Error opening file: %s/cn.%s.tab\n",dir,smpl->name); + smpl->summary_fh = open_file(&smpl->summary_fname,"w","%s/summary.%s.tab",dir,smpl->name); + if ( !smpl->summary_fh ) error("Error opening file: %s/summary.%s.tab\n",dir,smpl->name); + fprintf(smpl->dat_fh,"# [1]Chromosome\t[2]Position\t[3]BAF\t[4]LRR\n"); fprintf(smpl->cn_fh,"# [1]Chromosome\t[2]Position\t[3]CN\t[4]P(CN0)\t[5]P(CN1)\t[6]P(CN2)\t[7]P(CN3)\n"); fprintf(smpl->summary_fh,"# RG, Regions [2]Chromosome\t[3]Start\t[4]End\t[5]Copy Number state\t[6]Quality\t[7]nSites\t[8]nHETs\n"); diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index 8f596d4..1e28ad8 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -1316,12 +1316,13 @@ static void gvcf_to_vcf(args_t *args) } // check if alleles compatible with being a gVCF record + // ALT must be one of ., <*>, , + // check for INFO/END is below int i, gallele = -1; if (line->n_allele==1) gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present) - else + else if ( line->d.allele[1][0]=='<' ) { - if ( line->d.allele[1][0]!='<' ) continue; for (i=1; in_allele; i++) { if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF diff --git a/bcftools/vcfconvert.c.pysam.c 
b/bcftools/vcfconvert.c.pysam.c index 53df3d9..d1b15ba 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -1318,12 +1318,13 @@ static void gvcf_to_vcf(args_t *args) } // check if alleles compatible with being a gVCF record + // ALT must be one of ., <*>, , + // check for INFO/END is below int i, gallele = -1; if (line->n_allele==1) gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present) - else + else if ( line->d.allele[1][0]=='<' ) { - if ( line->d.allele[1][0]!='<' ) continue; for (i=1; in_allele; i++) { if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index aa60fb2..807fedd 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */ #define __STDC_FORMAT_MACROS #include #include +#include #include "bcftools.h" #define BCF_LIDX_SHIFT 14 @@ -208,6 +209,12 @@ int main_vcfindex(int argc, char *argv[]) return 1; } } + + // check for truncated files, allow only with -f + BGZF *fp = bgzf_open(fname, "r"); + if ( !fp ) error("index: failed to open %s\n", fname); + if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname); + if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname); } int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads); diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index ff960b9..157fc8e 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. */ #define __STDC_FORMAT_MACROS #include #include +#include #include "bcftools.h" #define BCF_LIDX_SHIFT 14 @@ -210,6 +211,12 @@ int main_vcfindex(int argc, char *argv[]) return 1; } } + + // check for truncated files, allow only with -f + BGZF *fp = bgzf_open(fname, "r"); + if ( !fp ) error("index: failed to open %s\n", fname); + if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname); + if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname); } int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads); diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 9eb3a7c..3e0e1e5 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -82,13 +82,13 @@ void mkdir_p(const char *fmt, ...) while (*p) { while (*p && *p!='/') p++; - if ( *p ) - { - *p = 0; - mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - *p = '/'; - p++; - } + if ( !*p ) break; + char ctmp = *p; + *p = 0; + int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno)); + *p = ctmp; + while ( *p && *p=='/' ) p++; } free(tmp); free(path); diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index e3890d5..15ef22d 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -84,13 +84,13 @@ void mkdir_p(const char *fmt, ...) 
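The truncation check added to vcfindex.c above relies on the 28-byte EOF block that every complete BGZF (bgzip) file ends with. A minimal, self-contained sketch of the same pattern follows; the helper name bgzf_is_intact is invented for illustration, while bgzf_open, bgzf_check_EOF and bgzf_close are the htslib calls the patch itself uses:

    #include <htslib/bgzf.h>

    /* Sketch: return 1 if fname ends with the BGZF EOF block, 0 if it looks
     * truncated, -1 if it cannot be opened (bgzf_is_intact is a made-up name). */
    static int bgzf_is_intact(const char *fname)
    {
        BGZF *fp = bgzf_open(fname, "r");
        if ( !fp ) return -1;
        int intact = bgzf_check_EOF(fp)==1 ? 1 : 0;   /* 1 = EOF block present */
        if ( bgzf_close(fp)!=0 ) return -1;
        return intact;
    }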
while (*p) { while (*p && *p!='/') p++; - if ( *p ) - { - *p = 0; - mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - *p = '/'; - p++; - } + if ( !*p ) break; + char ctmp = *p; + *p = 0; + int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno)); + *p = ctmp; + while ( *p && *p=='/' ) p++; } free(tmp); free(path); diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index e9ed5ad..31f5dad 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -662,7 +662,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) } // new allele map[i] = *nb; - if ( b[*nb] ) free(b[*nb]); b[*nb] = const_ai ? strdup(ai) : ai; (*nb)++; } @@ -1668,6 +1667,11 @@ void gvcf_set_alleles(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; + for (i=0; inals; i++) + { + free(maux->als[i]); + maux->als[i] = NULL; + } maux->nals = 0; for (i=0; inreaders; i++) @@ -2025,9 +2029,15 @@ int can_merge(args_t *args) maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; + int i,j,k, ntodo = 0; + + for (i=0; inals; i++) + { + free(maux->als[i]); + maux->als[i] = NULL; + } maux->var_types = maux->nals = 0; - int i,j,k, ntodo = 0; for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index a162905..f12e0a6 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -664,7 +664,6 @@ char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb) } // new allele map[i] = *nb; - if ( b[*nb] ) free(b[*nb]); b[*nb] = const_ai ? strdup(ai) : ai; (*nb)++; } @@ -1670,6 +1669,11 @@ void gvcf_set_alleles(args_t *args) bcf_srs_t *files = args->files; maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; + for (i=0; inals; i++) + { + free(maux->als[i]); + maux->als[i] = NULL; + } maux->nals = 0; for (i=0; inreaders; i++) @@ -2027,9 +2031,15 @@ int can_merge(args_t *args) maux_t *maux = args->maux; gvcf_aux_t *gaux = maux->gvcf; char *id = NULL, ref = 'N'; + int i,j,k, ntodo = 0; + + for (i=0; inals; i++) + { + free(maux->als[i]); + maux->als[i] = NULL; + } maux->var_types = maux->nals = 0; - int i,j,k, ntodo = 0; for (i=0; inreaders; i++) { buffer_t *buf = &maux->buf[i]; diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 86c20ab..bc51018 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1514,6 +1514,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) { bcf1_t *line; int i, k; + int prev_rid = -1, prev_pos = -1, prev_type = 0; for (i=0; irbuf); @@ -1534,6 +1535,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n) continue; } } + else if ( args->rmdup ) + { + int line_type = bcf_get_variant_types(args->lines[k]); + if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) + { + if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; + if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + } + else + { + prev_rid = args->lines[k]->rid; + prev_pos = args->lines[k]->pos; + prev_type = 0; + } + prev_type |= line_type; + } bcf_write1(file, args->hdr, args->lines[k]); } if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 
a54180d..9308e6b 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -1516,6 +1516,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) { bcf1_t *line; int i, k; + int prev_rid = -1, prev_pos = -1, prev_type = 0; for (i=0; irbuf); @@ -1536,6 +1537,23 @@ static void flush_buffer(args_t *args, htsFile *file, int n) continue; } } + else if ( args->rmdup ) + { + int line_type = bcf_get_variant_types(args->lines[k]); + if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) + { + if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; + if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + } + else + { + prev_rid = args->lines[k]->rid; + prev_pos = args->lines[k]->pos; + prev_type = 0; + } + prev_type |= line_type; + } bcf_write1(file, args->hdr, args->lines[k]); } if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index ab4c100..04554f8 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -32,6 +32,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include "bcftools.h" #include "filter.h" @@ -151,10 +152,26 @@ static void query_vcf(args_t *args) static void list_columns(args_t *args) { + void *has_sample = NULL; + if ( args->sample_list ) + { + has_sample = khash_str2int_init(); + int i, nsmpl; + char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); + for (i=0; ifiles->readers[0]; for (i=0; iheader); i++) + { + if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; printf("%s\n", reader->header->samples[i]); + } + + if ( has_sample ) + khash_str2int_destroy_free(has_sample); } static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc) diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 10f56f1..8fd7cf0 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -34,6 +34,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include "bcftools.h" #include "filter.h" @@ -153,10 +154,26 @@ static void query_vcf(args_t *args) static void list_columns(args_t *args) { + void *has_sample = NULL; + if ( args->sample_list ) + { + has_sample = khash_str2int_init(); + int i, nsmpl; + char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); + for (i=0; ifiles->readers[0]; for (i=0; iheader); i++) + { + if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; fprintf(pysam_stdout, "%s\n", reader->header->samples[i]); + } + + if ( has_sample ) + khash_str2int_destroy_free(has_sample); } static char **copy_header(bcf_hdr_t *hdr, char **src, int nsrc) diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c new file mode 100644 index 0000000..e41b628 --- /dev/null +++ b/bcftools/vcfsort.c @@ -0,0 +1,306 @@ +/* vcfsort.c -- sort subcommand + + Copyright (C) 2017 Genome Research Ltd. 
+ + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kheap.h" +#include "bcftools.h" + +typedef struct +{ + char *fname; + htsFile *fh; + bcf1_t *rec; +} +blk_t; + +typedef struct _args_t +{ + bcf_hdr_t *hdr; + char **argv, *fname, *output_fname, *tmp_dir; + int argc, output_type; + size_t max_mem, mem; + bcf1_t **buf; + size_t nbuf, mbuf, nblk; + blk_t *blk; +} +args_t; + +int cmp_bcf_pos(const void *aptr, const void *bptr) +{ + bcf1_t *a = *((bcf1_t**)aptr); + bcf1_t *b = *((bcf1_t**)bptr); + if ( a->rid < b->rid ) return -1; + if ( a->rid > b->rid ) return 1; + if ( a->pos < b->pos ) return -1; + if ( a->pos > b->pos ) return 1; + return 0; +} + +void buf_flush(args_t *args) +{ + if ( !args->nbuf ) return; + + qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos); + + args->nblk++; + args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk); + blk_t *blk = args->blk + args->nblk - 1; + + kstring_t str = {0,0,0}; + ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); + blk->fname = str.s; + + htsFile *fh = hts_open(blk->fname, "wbu"); + if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); + bcf_hdr_write(fh, args->hdr); + + int i; + for (i=0; inbuf; i++) + { + bcf_write(fh, args->hdr, args->buf[i]); + bcf_destroy(args->buf[i]); + } + hts_close(fh); + + args->nbuf = 0; + args->mem = 0; +} + +void buf_push(args_t *args, bcf1_t *rec) +{ + int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*); + if ( args->mem + delta > args->max_mem ) buf_flush(args); + args->nbuf++; + args->mem += delta; + hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); + args->buf[args->nbuf-1] = rec; +} + +void sort_blocks(args_t *args) +{ + htsFile *in = hts_open(args->fname, "r"); + if ( !in ) error("Could not read %s\n", args->fname); + args->hdr = bcf_hdr_read(in); + + while ( 1 ) + { + bcf1_t *rec = bcf_init(); + int ret = bcf_read1(in, args->hdr, rec); + if ( ret < -1 ) error("Error encountered while parsing the input\n"); + if ( ret == -1 ) + { + bcf_destroy(rec); + break; + } + buf_push(args, rec); + } + buf_flush(args); + free(args->buf); + + if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); +} + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) +{ + blk_t *a = *aptr; + blk_t *b = *bptr; + if ( a->rec->rid < b->rec->rid ) return 1; + if ( a->rec->rid > 
b->rec->rid ) return 0; + if ( a->rec->pos < b->rec->pos ) return 1; + return 0; +} +KHEAP_INIT(blk, blk_t*, blk_is_smaller) + +void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) +{ + if ( !blk->fh ) return; + int ret = bcf_read(blk->fh, hdr, blk->rec); + if ( ret < -1 ) error("Error reading %s\n", blk->fname); + if ( ret == -1 ) + { + if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); + blk->fh = 0; + return; + } + khp_insert(blk, bhp, &blk); +} + +void merge_blocks(args_t *args) +{ + fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk); + + khp_blk_t *bhp = khp_init(blk); + + int i; + for (i=0; inblk; i++) + { + blk_t *blk = args->blk + i; + blk->fh = hts_open(blk->fname, "r"); + if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno)); + bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); + bcf_hdr_destroy(hdr); + blk->rec = bcf_init(); + blk_read(bhp, args->hdr, blk); + } + + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + bcf_hdr_write(out, args->hdr); + while ( bhp->ndat ) + { + blk_t *blk = bhp->dat[0]; + bcf_write(out, args->hdr, blk->rec); + khp_delete(blk, bhp); + blk_read(bhp, args->hdr, blk); + } + if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); + + fprintf(stderr,"Cleaning\n"); + for (i=0; inblk; i++) + { + blk_t *blk = args->blk + i; + unlink(blk->fname); + free(blk->fname); + bcf_destroy(blk->rec); + } + rmdir(args->tmp_dir); + free(args->blk); + khp_destroy(blk, bhp); + fprintf(stderr,"Done\n"); +} + +static void usage(args_t *args) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "About: Sort VCF/BCF file.\n"); + fprintf(stderr, "Usage: bcftools sort [OPTIONS] \n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(stderr, " -o, --output-file output file name [stdout]\n"); + fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); + fprintf(stderr, "\n"); + exit(1); +} + +size_t parse_mem_string(char *str) +{ + char *tmp; + double mem = strtod(str, &tmp); + if ( tmp==str ) error("Could not parse: --max-mem %s\n", str); + if ( !strcasecmp("k",tmp) ) mem *= 1000; + else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000; + else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000; + return mem; +} + +void mkdir_p(const char *fmt, ...); +void init(args_t *args) +{ + if ( !args->tmp_dir ) + { + args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); + char *tmp_dir = mkdtemp(args->tmp_dir); + if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); + } + else + { + args->tmp_dir = strdup(args->tmp_dir); + mkdir_p(args->tmp_dir); + } + fprintf(stderr,"Writing to %s\n", args->tmp_dir); +} +void destroy(args_t *args) +{ + bcf_hdr_destroy(args->hdr); + free(args->tmp_dir); + free(args); +} + +int main_sort(int argc, char *argv[]) +{ + int c; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->max_mem = 768*1000*1000; + args->output_fname = "-"; + + static struct option loptions[] = + { + {"max-mem",required_argument,NULL,'m'}, + {"temp-dir",required_argument,NULL,'T'}, + {"output-type",required_argument,NULL,'O'}, + {"output-file",required_argument,NULL,'o'}, + {"help",no_argument,NULL,'h'}, + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, 
"m:T:O:o:h?",loptions,NULL)) >= 0) + { + switch (c) + { + case 'm': args->max_mem = parse_mem_string(optarg); break; + case 'T': args->tmp_dir = optarg; break; + case 'o': args->output_fname = optarg; break; + case 'O': + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: error("The output type \"%s\" not recognised\n", optarg); + }; + break; + case 'h': usage(args); + case '?': usage(args); + default: error("Unknown argument: %s\n", optarg); + } + } + + if ( optind>=argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin + else usage(args); + } + else args->fname = argv[optind]; + + init(args); + sort_blocks(args); + merge_blocks(args); + destroy(args); + + return 0; +} diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c new file mode 100644 index 0000000..a07cd92 --- /dev/null +++ b/bcftools/vcfsort.c.pysam.c @@ -0,0 +1,308 @@ +#include "pysam.h" + +/* vcfsort.c -- sort subcommand + + Copyright (C) 2017 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kheap.h" +#include "bcftools.h" + +typedef struct +{ + char *fname; + htsFile *fh; + bcf1_t *rec; +} +blk_t; + +typedef struct _args_t +{ + bcf_hdr_t *hdr; + char **argv, *fname, *output_fname, *tmp_dir; + int argc, output_type; + size_t max_mem, mem; + bcf1_t **buf; + size_t nbuf, mbuf, nblk; + blk_t *blk; +} +args_t; + +int cmp_bcf_pos(const void *aptr, const void *bptr) +{ + bcf1_t *a = *((bcf1_t**)aptr); + bcf1_t *b = *((bcf1_t**)bptr); + if ( a->rid < b->rid ) return -1; + if ( a->rid > b->rid ) return 1; + if ( a->pos < b->pos ) return -1; + if ( a->pos > b->pos ) return 1; + return 0; +} + +void buf_flush(args_t *args) +{ + if ( !args->nbuf ) return; + + qsort(args->buf, args->nbuf, sizeof(*args->buf), cmp_bcf_pos); + + args->nblk++; + args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk); + blk_t *blk = args->blk + args->nblk - 1; + + kstring_t str = {0,0,0}; + ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); + blk->fname = str.s; + + htsFile *fh = hts_open(blk->fname, "wbu"); + if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); + bcf_hdr_write(fh, args->hdr); + + int i; + for (i=0; inbuf; i++) + { + bcf_write(fh, args->hdr, args->buf[i]); + bcf_destroy(args->buf[i]); + } + hts_close(fh); + + args->nbuf = 0; + args->mem = 0; +} + +void buf_push(args_t *args, bcf1_t *rec) +{ + int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*); + if ( args->mem + delta > args->max_mem ) buf_flush(args); + args->nbuf++; + args->mem += delta; + hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf); + args->buf[args->nbuf-1] = rec; +} + +void sort_blocks(args_t *args) +{ + htsFile *in = hts_open(args->fname, "r"); + if ( !in ) error("Could not read %s\n", args->fname); + args->hdr = bcf_hdr_read(in); + + while ( 1 ) + { + bcf1_t *rec = bcf_init(); + int ret = bcf_read1(in, args->hdr, rec); + if ( ret < -1 ) error("Error encountered while parsing the input\n"); + if ( ret == -1 ) + { + bcf_destroy(rec); + break; + } + buf_push(args, rec); + } + buf_flush(args); + free(args->buf); + + if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); +} + +static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) +{ + blk_t *a = *aptr; + blk_t *b = *bptr; + if ( a->rec->rid < b->rec->rid ) return 1; + if ( a->rec->rid > b->rec->rid ) return 0; + if ( a->rec->pos < b->rec->pos ) return 1; + return 0; +} +KHEAP_INIT(blk, blk_t*, blk_is_smaller) + +void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) +{ + if ( !blk->fh ) return; + int ret = bcf_read(blk->fh, hdr, blk->rec); + if ( ret < -1 ) error("Error reading %s\n", blk->fname); + if ( ret == -1 ) + { + if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); + blk->fh = 0; + return; + } + khp_insert(blk, bhp, &blk); +} + +void merge_blocks(args_t *args) +{ + fprintf(pysam_stderr,"Merging %d temporary files\n", (int)args->nblk); + + khp_blk_t *bhp = khp_init(blk); + + int i; + for (i=0; inblk; i++) + { + blk_t *blk = args->blk + i; + blk->fh = hts_open(blk->fname, "r"); + if ( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno)); + bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); + bcf_hdr_destroy(hdr); + blk->rec = bcf_init(); + blk_read(bhp, args->hdr, blk); + } + + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + bcf_hdr_write(out, args->hdr); + while ( bhp->ndat ) + { + 
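            // Second phase of the external sort: sort_blocks()/buf_flush() above
            // wrote each in-memory batch (bounded by --max-mem) to its own
            // position-sorted temporary BCF in tmp_dir; this loop k-way merges
            // those blocks.  bhp is a min-heap ordered by blk_is_smaller()
            // (rid first, then pos), so dat[0] is always the block whose current
            // record sorts first: write it, pop it, and let blk_read() push the
            // block back with its next record until every block is drained.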
blk_t *blk = bhp->dat[0]; + bcf_write(out, args->hdr, blk->rec); + khp_delete(blk, bhp); + blk_read(bhp, args->hdr, blk); + } + if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); + + fprintf(pysam_stderr,"Cleaning\n"); + for (i=0; inblk; i++) + { + blk_t *blk = args->blk + i; + unlink(blk->fname); + free(blk->fname); + bcf_destroy(blk->rec); + } + rmdir(args->tmp_dir); + free(args->blk); + khp_destroy(blk, bhp); + fprintf(pysam_stderr,"Done\n"); +} + +static void usage(args_t *args) +{ + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "About: Sort VCF/BCF file.\n"); + fprintf(pysam_stderr, "Usage: bcftools sort [OPTIONS] \n"); + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Options:\n"); + fprintf(pysam_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); + fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(pysam_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); + fprintf(pysam_stderr, "\n"); + exit(1); +} + +size_t parse_mem_string(char *str) +{ + char *tmp; + double mem = strtod(str, &tmp); + if ( tmp==str ) error("Could not parse: --max-mem %s\n", str); + if ( !strcasecmp("k",tmp) ) mem *= 1000; + else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000; + else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000; + return mem; +} + +void mkdir_p(const char *fmt, ...); +void init(args_t *args) +{ + if ( !args->tmp_dir ) + { + args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); + char *tmp_dir = mkdtemp(args->tmp_dir); + if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); + } + else + { + args->tmp_dir = strdup(args->tmp_dir); + mkdir_p(args->tmp_dir); + } + fprintf(pysam_stderr,"Writing to %s\n", args->tmp_dir); +} +void destroy(args_t *args) +{ + bcf_hdr_destroy(args->hdr); + free(args->tmp_dir); + free(args); +} + +int main_sort(int argc, char *argv[]) +{ + int c; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->max_mem = 768*1000*1000; + args->output_fname = "-"; + + static struct option loptions[] = + { + {"max-mem",required_argument,NULL,'m'}, + {"temp-dir",required_argument,NULL,'T'}, + {"output-type",required_argument,NULL,'O'}, + {"output-file",required_argument,NULL,'o'}, + {"help",no_argument,NULL,'h'}, + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0) + { + switch (c) + { + case 'm': args->max_mem = parse_mem_string(optarg); break; + case 'T': args->tmp_dir = optarg; break; + case 'o': args->output_fname = optarg; break; + case 'O': + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: error("The output type \"%s\" not recognised\n", optarg); + }; + break; + case 'h': usage(args); + case '?': usage(args); + default: error("Unknown argument: %s\n", optarg); + } + } + + if ( optind>=argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin + else usage(args); + } + else args->fname = argv[optind]; + + init(args); + sort_blocks(args); + merge_blocks(args); + destroy(args); + + return 0; +} diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 4041a5a..3b73173 100644 --- 
a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -87,6 +87,7 @@ typedef struct int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; + int *smpl_hapRef, *smpl_hapAlt; int *smpl_indel_hets, *smpl_indel_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame unsigned long int *smpl_dp; @@ -472,6 +473,8 @@ static void init_stats(args_t *args) stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); @@ -548,17 +551,19 @@ static void destroy_stats(args_t *args) #endif free(stats->insertions); free(stats->deletions); - if (stats->smpl_hets) free(stats->smpl_hets); - if (stats->smpl_homAA) free(stats->smpl_homAA); - if (stats->smpl_homRR) free(stats->smpl_homRR); - if (stats->smpl_indel_homs) free(stats->smpl_indel_homs); - if (stats->smpl_indel_hets) free(stats->smpl_indel_hets); - if (stats->smpl_ts) free(stats->smpl_ts); - if (stats->smpl_tv) free(stats->smpl_tv); - if (stats->smpl_indels) free(stats->smpl_indels); - if (stats->smpl_dp) free(stats->smpl_dp); - if (stats->smpl_ndp) free(stats->smpl_ndp); - if (stats->smpl_sngl) free(stats->smpl_sngl); + free(stats->smpl_hets); + free(stats->smpl_homAA); + free(stats->smpl_homRR); + free(stats->smpl_hapRef); + free(stats->smpl_hapAlt); + free(stats->smpl_indel_homs); + free(stats->smpl_indel_hets); + free(stats->smpl_ts); + free(stats->smpl_tv); + free(stats->smpl_indels); + free(stats->smpl_dp); + free(stats->smpl_ndp); + free(stats->smpl_sngl); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) @@ -861,6 +866,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int assert( ialn_allele ); stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; } + if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++; + if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++; continue; } if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; } @@ -873,7 +880,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case GT_HOM_AA: nalt_tot++; break; } #endif - if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP + int var_type = 0; + if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); + if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); + if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. 
as SNP { if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; @@ -889,7 +899,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int stats->smpl_tv[is]++; } } - if ( line_type&VCF_INDEL ) + if ( var_type&VCF_INDEL ) { if ( gt != GT_HOM_RR ) { @@ -1068,7 +1078,7 @@ static void do_vcf_stats(args_t *args) if ( line->n_allele>2 ) { stats->n_mals++; - if ( line_type == VCF_SNP ) stats->n_snp_mals++; + if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T } if ( files->n_smpl ) @@ -1125,7 +1135,22 @@ static void print_header(args_t *args) static void print_stats(args_t *args) { int i, j,k, id; - printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); + printf("# SN, Summary numbers:\n"); + printf("# number of records .. number of data rows in the VCF\n"); + printf("# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n"); + printf("# number of SNPs .. number of rows with a SNP\n"); + printf("# number of MNPs .. number of rows with a MNP, such as CC>TT\n"); + printf("# number of indels .. number of rows with an indel\n"); + printf("# number of others .. number of rows with other type, for example a symbolic allele or\n"); + printf("# a complex substitution, such as ACT>TCGA\n"); + printf("# number of multiallelic sites .. number of rows with multiple alternate alleles\n"); + printf("# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n"); + printf("# \n"); + printf("# Note that rows containing multiple types will be counted multiple times, in each\n"); + printf("# counter. For example, a row with a SNP and an indel increments both the SNP and\n"); + printf("# the indel counter.\n"); + printf("# \n"); + printf("# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); for (id=0; idnstats; id++) @@ -1470,16 +1495,18 @@ static void print_stats(args_t *args) if ( args->files->n_smpl ) { - printf("# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n"); + printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); + printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" + "\t[12]nHapRef\t[13]nHapAlt\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? 
stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i], + printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], - stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]); + stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]); } } diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index a5e5a9f..57adbc0 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -89,6 +89,7 @@ typedef struct int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; + int *smpl_hapRef, *smpl_hapAlt; int *smpl_indel_hets, *smpl_indel_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame unsigned long int *smpl_dp; @@ -474,6 +475,8 @@ static void init_stats(args_t *args) stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); @@ -550,17 +553,19 @@ static void destroy_stats(args_t *args) #endif free(stats->insertions); free(stats->deletions); - if (stats->smpl_hets) free(stats->smpl_hets); - if (stats->smpl_homAA) free(stats->smpl_homAA); - if (stats->smpl_homRR) free(stats->smpl_homRR); - if (stats->smpl_indel_homs) free(stats->smpl_indel_homs); - if (stats->smpl_indel_hets) free(stats->smpl_indel_hets); - if (stats->smpl_ts) free(stats->smpl_ts); - if (stats->smpl_tv) free(stats->smpl_tv); - if (stats->smpl_indels) free(stats->smpl_indels); - if (stats->smpl_dp) free(stats->smpl_dp); - if (stats->smpl_ndp) free(stats->smpl_ndp); - if (stats->smpl_sngl) free(stats->smpl_sngl); + free(stats->smpl_hets); + free(stats->smpl_homAA); + free(stats->smpl_homRR); + free(stats->smpl_hapRef); + free(stats->smpl_hapAlt); + free(stats->smpl_indel_homs); + free(stats->smpl_indel_hets); + free(stats->smpl_ts); + free(stats->smpl_tv); + free(stats->smpl_indels); + free(stats->smpl_dp); + free(stats->smpl_ndp); + free(stats->smpl_sngl); idist_destroy(&stats->dp); idist_destroy(&stats->dp_sites); for (j=0; jnusr; j++) @@ -863,6 +868,8 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int assert( ialn_allele ); stats->smpl_frm_shifts[is*3 + args->tmp_frm[ial]]++; } + if ( gt == GT_HAPL_R ) stats->smpl_hapRef[is]++; + if ( gt == GT_HAPL_A ) stats->smpl_hapAlt[is]++; continue; } if ( gt != GT_HOM_RR ) { n_nref++; i_nref = is; } @@ -875,7 +882,10 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case GT_HOM_AA: nalt_tot++; break; } #endif - if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP + int var_type = 0; + if ( ial>0 ) var_type |= bcf_get_variant_type(line,ial); + if ( jal>0 ) var_type |= bcf_get_variant_type(line,jal); + if ( var_type&VCF_SNP || var_type==VCF_REF ) // count ALT=. 
as SNP { if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; @@ -891,7 +901,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int stats->smpl_tv[is]++; } } - if ( line_type&VCF_INDEL ) + if ( var_type&VCF_INDEL ) { if ( gt != GT_HOM_RR ) { @@ -1070,7 +1080,7 @@ static void do_vcf_stats(args_t *args) if ( line->n_allele>2 ) { stats->n_mals++; - if ( line_type == VCF_SNP ) stats->n_snp_mals++; + if ( line_type == VCF_SNP ) stats->n_snp_mals++; // note: this will be fooled by C>C,T } if ( files->n_smpl ) @@ -1127,7 +1137,22 @@ static void print_header(args_t *args) static void print_stats(args_t *args) { int i, j,k, id; - fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n"); + fprintf(pysam_stdout, "# SN, Summary numbers:\n"); + fprintf(pysam_stdout, "# number of records .. number of data rows in the VCF\n"); + fprintf(pysam_stdout, "# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n"); + fprintf(pysam_stdout, "# number of SNPs .. number of rows with a SNP\n"); + fprintf(pysam_stdout, "# number of MNPs .. number of rows with a MNP, such as CC>TT\n"); + fprintf(pysam_stdout, "# number of indels .. number of rows with an indel\n"); + fprintf(pysam_stdout, "# number of others .. number of rows with other type, for example a symbolic allele or\n"); + fprintf(pysam_stdout, "# a complex substitution, such as ACT>TCGA\n"); + fprintf(pysam_stdout, "# number of multiallelic sites .. number of rows with multiple alternate alleles\n"); + fprintf(pysam_stdout, "# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n"); + fprintf(pysam_stdout, "# \n"); + fprintf(pysam_stdout, "# Note that rows containing multiple types will be counted multiple times, in each\n"); + fprintf(pysam_stdout, "# counter. For example, a row with a SNP and an indel increments both the SNP and\n"); + fprintf(pysam_stdout, "# the indel counter.\n"); + fprintf(pysam_stdout, "# \n"); + fprintf(pysam_stdout, "# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); for (id=0; idnstats; id++) @@ -1472,16 +1497,18 @@ static void print_stats(args_t *args) if ( args->files->n_smpl ) { - fprintf(pysam_stdout, "# PSC, Per-sample counts\n# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons\n"); + fprintf(pysam_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); + fprintf(pysam_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" + "\t[12]nHapRef\t[13]nHapAlt\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? 
stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\n", id,args->files->samples[i], + fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], - stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i]); + stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]); } } diff --git a/bcftools/version.h b/bcftools/version.h index 11ee02d..eb2074c 100644 --- a/bcftools/version.h +++ b/bcftools/version.h @@ -1 +1 @@ -#define BCFTOOLS_VERSION "1.5" +#define BCFTOOLS_VERSION "1.6" diff --git a/doc/installation.rst b/doc/installation.rst index e404701..535f4bc 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -69,6 +69,9 @@ Note that the location of the file :file:`libhts.so` needs to be known to the linker once you run pysam, for example by setting the environment-varirable `LD_LIBRARY_PATH`. +Note that generally the pysam and htslib version need to be +compatible. See the release notes for more information. + Installation from repository ============================ @@ -83,3 +86,23 @@ To install from repository, type:: python setup.py install For compilation options, see the section on Pypi installation above. + +Requirements +============ + +Depending on the installation method, requirements for building pysam differ. + +When installing through conda_, dependencies will be resolved by the +package manager. The pip_ installation and installation from source +require a C compiler and its standard libraries as well as all +requirements for building htslib. Htslib requirements are listed in +the htslib/INSTALL file. + +Installing from the repository will require cython_ to be installed. + + + + + + + diff --git a/doc/release.rst b/doc/release.rst index 18af4ad..81cd274 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,17 @@ Release notes ============= +Release 0.13.0 +=============== + +This release wraps htslib/samtools/bcftools versions 1.6.0 and +contains a series of bugfixes. + +* [#544] reading header from remote TabixFiles now works. +* [#531] add missing tag types H and A. A python float will now be + added as 'f' type instead of 'd' type. 
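
The new typing rules can be seen directly from ``set_tag`` (a minimal sketch; the tag
names and the bare ``AlignedSegment`` below are only illustrative)::

    import pysam

    read = pysam.AlignedSegment()
    read.query_name = "r1"

    # a python float is now stored with the SAM 'f' type (was 'd')
    read.set_tag("XF", 3.14)

    # integers are packed into the smallest fitting BAM integer type
    read.set_tag("XC", 200)        # unsigned 8-bit, 'C'
    read.set_tag("XS", -30000)     # signed 16-bit, 's'

    # lists/tuples/arrays become 'B' arrays; the element type is chosen
    # from the value range (here unsigned 32-bit, 'I')
    read.set_tag("XB", [1, 300, 70000])

    # the 'H' (hex string) and 'A' (single character) types can be
    # requested explicitly via value_type
    read.set_tag("XH", "1AE301", value_type="H")
    read.set_tag("XA", "Y", value_type="A")

Values set without an explicit ``value_type`` are deduced from the python type and
value range, as implemented in ``get_tag_typecode`` and ``get_btag_typecode`` below.
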
+ + Release 0.12.0.1 ================ diff --git a/import.py b/import.py index c50f623..80e6d4b 100644 --- a/import.py +++ b/import.py @@ -12,6 +12,7 @@ # For samtools, type: # rm -rf samtools # python import.py samtools download/samtools +# git checkout -- samtools/version.h # # Manually, then: # modify config.h to set compatibility flags @@ -19,6 +20,7 @@ # For bcftools, type: # rm -rf bedtools # python import.py bedtools download/bedtools +# git checkout -- bcftools/version.h # rm -rf bedtools/test bedtools/plugins import fnmatch diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 67967c4..4b3b4dd 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -64,6 +64,9 @@ from cpython.version cimport PY_MAJOR_VERSION from cpython cimport PyBytes_FromStringAndSize from libc.string cimport strchr from cpython cimport array as c_array +from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ + INT8_MAX, INT16_MAX, INT32_MAX, \ + UINT8_MAX, UINT16_MAX, UINT32_MAX from pysam.libcutils cimport force_bytes, force_str, \ charptr_to_str, charptr_to_bytes @@ -74,13 +77,15 @@ from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, cdef char * htslib_types = 'cCsSiIf' cdef char * parray_types = 'bBhHiIf' +cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 + # translation tables # cigar code to character and vice versa cdef char* CODE2CIGAR= "MIDNSHP=XB" cdef int NCIGAR_CODES = 10 -if PY_MAJOR_VERSION >= 3: +if IS_PYTHON3: CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) else: CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) @@ -142,18 +147,33 @@ cdef convert_binary_tag(uint8_t * tag): return byte_size, nvalues, c_values -cdef inline uint8_t get_value_code(value, value_type=None): - '''guess type code for a *value*. If *value_type* is None, - the type code will be inferred based on the Python type of - *value*''' - cdef uint8_t typecode - cdef char * _char_type +cdef inline uint8_t get_tag_typecode(value, value_type=None): + """guess type code for a *value*. If *value_type* is None, the type + code will be inferred based on the Python type of *value* + """ + # 0 is unknown typecode + cdef char typecode = 0 + if value_type is None: if isinstance(value, int): - typecode = 'i' + if value < 0: + if value >= INT8_MIN: + typecode = 'c' + elif value >= INT16_MIN: + typecode = 's' + elif value >= INT32_MIN: + typecode = 'i' + # unsigned ints + else: + if value <= UINT8_MAX: + typecode = 'C' + elif value <= UINT16_MAX: + typecode = 'S' + elif value <= UINT32_MAX: + typecode = 'I' elif isinstance(value, float): - typecode = 'd' + typecode = 'f' elif isinstance(value, str): typecode = 'Z' elif isinstance(value, bytes): @@ -162,93 +182,98 @@ cdef inline uint8_t get_value_code(value, value_type=None): isinstance(value, list) or \ isinstance(value, tuple): typecode = 'B' - else: - return 0 else: - if value_type not in 'Zidf': - return 0 - value_type = force_bytes(value_type) - _char_type = value_type - typecode = (_char_type)[0] + if value_type in 'aAsSIcCZidfH': + typecode = force_bytes(value_type)[0] return typecode -cdef inline bytes getTypecode(value, maximum_value=None): +cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): '''returns the value typecode of a value. - If max is specified, the approprite type is - returned for a range where value is the minimum. + If max is specified, the appropriate type is returned for a range + where value is the minimum. 
+ + Note that this method returns types from the extended BAM alphabet + of types that includes tags that are not part of the SAM + specification. ''' - if maximum_value is None: - maximum_value = value - cdef bytes valuetype + cdef uint8_t typecode t = type(value) if t is float: - valuetype = b'f' + typecode = 'f' elif t is int: + if max_value is None: + max_value = value + if min_value is None: + min_value = value # signed ints - if value < 0: - if value >= -128 and maximum_value < 128: - valuetype = b'c' - elif value >= -32768 and maximum_value < 32768: - valuetype = b's' - elif value < -2147483648 or maximum_value >= 2147483648: + if min_value < 0: + if min_value >= INT8_MIN and max_value <= INT8_MAX: + typecode = 'c' + elif min_value >= INT16_MIN and max_value <= INT16_MAX: + typecode = 's' + elif min_value >= INT32_MIN or max_value <= INT32_MAX: + typecode = 'i' + else: raise ValueError( "at least one signed integer out of range of " "BAM/SAM specification") - else: - valuetype = b'i' # unsigned ints else: - if maximum_value < 256: - valuetype = b'C' - elif maximum_value < 65536: - valuetype = b'S' - elif maximum_value >= 4294967296: + if max_value <= UINT8_MAX: + typecode = 'C' + elif max_value <= UINT16_MAX: + typecode = 'S' + elif max_value <= UINT32_MAX: + typecode = 'I' + else: raise ValueError( "at least one integer out of range of BAM/SAM specification") - else: - valuetype = b'I' else: # Note: hex strings (H) are not supported yet if t is not bytes: value = value.encode('ascii') if len(value) == 1: - valuetype = b'A' + typecode = 'A' else: - valuetype = b'Z' + typecode = 'Z' + + return typecode + - return valuetype +# mapping python array.array and htslib typecodes to struct typecodes +DATATYPE2FORMAT = { + ord('c'): ('b', 1), + ord('C'): ('B', 1), + ord('s'): ('h', 2), + ord('S'): ('H', 2), + ord('i'): ('i', 4), + ord('I'): ('I', 4), + ord('f'): ('f', 4), + ord('d'): ('d', 8), + ord('A'): ('c', 1), + ord('a'): ('c', 1)} -cdef inline packTags(tags): +cdef inline pack_tags(tags): """pack a list of tags. Each tag is a tuple of (tag, tuple). Values are packed into the most space efficient data structure possible unless the tag contains a third field with the typecode. - Returns a format string and the associated list of arguments - to be used in a call to struct.pack_into. + Returns a format string and the associated list of arguments to be + used in a call to struct.pack_into. """ fmts, args = ["<"], [] - cdef char array_typecode - - datatype2format = { - b'c': ('b', 1), - b'C': ('B', 1), - b's': ('h', 2), - b'S': ('H', 2), - b'i': ('i', 4), - b'I': ('I', 4), - b'f': ('f', 4), - b'A': ('c', 1)} - + # htslib typecode + cdef uint8_t typecode for tag in tags: if len(tag) == 2: @@ -259,68 +284,76 @@ cdef inline packTags(tags): else: raise ValueError("malformatted tag: %s" % str(tag)) + if valuetype is None: + typecode = 0 + else: + # only first character in valuecode matters + if IS_PYTHON3: + typecode = force_bytes(valuetype)[0] + else: + typecode = ord(valuetype[0]) + pytag = force_bytes(pytag) - valuetype = force_bytes(valuetype) - t = type(value) + pytype = type(value) - if t is tuple or t is list: + if pytype is tuple or pytype is list: # binary tags from tuples or lists - if valuetype is None: + if not typecode: # automatically determine value type - first value # determines type. If there is a mix of types, the # result is undefined. 
- valuetype = getTypecode(min(value), max(value)) + typecode = get_btag_typecode(min(value), + min_value=min(value), + max_value=max(value)) - if valuetype not in datatype2format: - raise ValueError("invalid value type '%s'" % valuetype) + if typecode not in DATATYPE2FORMAT: + raise ValueError("invalid value type '{}'".format(chr(typecode))) - datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0]) + datafmt = "2sBBI%i%s" % (len(value), DATATYPE2FORMAT[typecode][0]) args.extend([pytag[:2], - b"B", - valuetype, + ord("B"), + typecode, len(value)] + list(value)) elif isinstance(value, array.array): - valuetype = value.typecode - if valuetype not in datatype2format: - valuetype = None # binary tags from arrays - if valuetype is None: - array_typecode = map_typecode_python_to_htslib(ord(value.typecode)) - - if array_typecode == 0: - raise ValueError("unsupported type code '{}'" - .format(value.typecode)) + if typecode == 0: + typecode = map_typecode_python_to_htslib(ord(value.typecode)) - valuetype = force_bytes(chr(array_typecode)) - - if valuetype not in datatype2format: - raise ValueError("invalid value type '%s' (%s)" % - (valuetype, type(valuetype))) + if typecode == 0: + raise ValueError("unsupported type code '{}'".format(value.typecode)) + if typecode not in DATATYPE2FORMAT: + raise ValueError("invalid value type '{}' ({})".format(chr(typecode), array.typecode)) + # use array.tostring() to retrieve byte representation and # save as bytes - datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1]) + datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1]) args.extend([pytag[:2], - b"B", - valuetype, + ord("B"), + typecode, len(value), force_bytes(value.tostring())]) else: - if valuetype is None: - valuetype = getTypecode(value) - - if valuetype in b"AZ": + if typecode == 0: + typecode = get_tag_typecode(value) + if typecode == 0: + raise ValueError("could not deduce typecode for value {}".format(value)) + + if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H': value = force_bytes(value) - if valuetype == b"Z": - datafmt = "2sc%is" % (len(value)+1) - else: - datafmt = "2sc%s" % datatype2format[valuetype][0] + if typecode == "a": + typecode = 'A' + if typecode == 'Z' or typecode == 'H': + datafmt = "2sB%is" % (len(value)+1) + else: + datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0] + args.extend([pytag[:2], - valuetype, + typecode, value]) fmts.append(datafmt) @@ -545,6 +578,31 @@ cdef inline uint32_t get_alignment_length(bam1_t * src): l += cigar_p[k] >> BAM_CIGAR_SHIFT return l +cdef inline uint32_t get_md_reference_length(char * md_tag): + cdef int l = 0 + cdef int md_idx = 0 + cdef int nmatches = 0 + + while md_tag[md_idx] != 0: + if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: + nmatches *= 10 + nmatches += md_tag[md_idx] - 48 + md_idx += 1 + continue + else: + l += nmatches + nmatches = 0 + if md_tag[md_idx] == '^': + md_idx += 1 + while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: + md_idx += 1 + l += 1 + else: + md_idx += 1 + l += 1 + + l += nmatches + return l # TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ... cdef inline bytes build_alignment_sequence(bam1_t * src): @@ -634,6 +692,21 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): cdef int md_idx = 0 s_idx = 0 + # Check if MD tag is valid by matching CIGAR length to MD tag defined length + # Insertions would be in addition to what is described by MD, so we calculate + # the number of insertions seperately. 
+ insertions = 0 + + while s[s_idx] != 0: + if s[s_idx] >= 'a': + insertions += 1 + s_idx += 1 + s_idx = 0 + + cdef uint32_t md_len = get_md_reference_length(md_tag) + if md_len + insertions > max_len: + raise AssertionError("Invalid MD tag: MD length {} mismatch with CIGAR length {}".format(md_len, max_len)) + while md_tag[md_idx] != 0: # c is numerical if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: @@ -1918,24 +1991,56 @@ cdef class AlignedSegment: section. *value_type* describes the type of *value* that is to entered - into the alignment record.. It can be set explicitly to one - of the valid one-letter type codes. If unset, an appropriate - type will be chosen automatically. + into the alignment record. It can be set explicitly to one of + the valid one-letter type codes. If unset, an appropriate type + will be chosen automatically based on the python type of + *value*. An existing value of the same *tag* will be overwritten unless - replace is set to False. This is usually not recommened as a + *replace* is set to False. This is usually not recommened as a tag may only appear once in the optional alignment section. If *value* is None, the tag will be deleted. + + This method accepts valid SAM specification value types, which + are:: + + A: printable char + i: signed int + f: float + Z: printable string + H: Byte array in hex format + B: Integer or numeric array + + Additionally, it will accept the integer BAM types ('cCsSI') + + For htslib compatibility, 'a' is synonymous with 'A' and the + method accepts a 'd' type code for a double precision float. + + When deducing the type code by the python type of *value*, the + following mapping is applied:: + + i: python int + f: python float + Z: python str or bytes + B: python array.array, list or tuple + + Note that a single character string will be output as 'Z' and + not 'A' as the former is the more general type. """ cdef int value_size + cdef uint8_t tc cdef uint8_t * value_ptr cdef uint8_t *existing_ptr - cdef uint8_t typecode cdef float float_value cdef double double_value - cdef int32_t int_value + cdef int32_t int32_t_value + cdef uint32_t uint32_t_value + cdef int16_t int16_t_value + cdef uint16_t uint16_t_value + cdef int8_t int8_t_value + cdef uint8_t uint8_t_value cdef bam1_t * src = self._delegate cdef char * _value_type cdef c_array.array array_value @@ -1954,19 +2059,51 @@ cdef class AlignedSegment: if value is None: return - typecode = get_value_code(value, value_type) + cdef uint8_t typecode = get_tag_typecode(value, value_type) if typecode == 0: - raise ValueError("can't guess type or invalid type code specified") + raise ValueError("can't guess type or invalid type code specified: {} {}".format( + value, value_type)) - # Not Endian-safe, but then again neither is samtools! 
+ # sam_format1 for typecasting if typecode == 'Z': value = force_bytes(value) value_ptr = value value_size = len(value)+1 + elif typecode == 'H': + # Note that hex tags are stored the very same + # way as Z string.s + value = force_bytes(value) + value_ptr = value + value_size = len(value)+1 + elif typecode == 'A' or typecode == 'a': + value = force_bytes(value) + value_ptr = value + value_size = sizeof(char) + typecode = 'A' elif typecode == 'i': - int_value = value - value_ptr = &int_value + int32_t_value = value + value_ptr = &int32_t_value value_size = sizeof(int32_t) + elif typecode == 'I': + uint32_t_value = value + value_ptr = &uint32_t_value + value_size = sizeof(uint32_t) + elif typecode == 's': + int16_t_value = value + value_ptr = &int16_t_value + value_size = sizeof(int16_t) + elif typecode == 'S': + uint16_t_value = value + value_ptr = &uint16_t_value + value_size = sizeof(uint16_t) + elif typecode == 'c': + int8_t_value = value + value_ptr = &int8_t_value + value_size = sizeof(int8_t) + elif typecode == 'C': + uint8_t_value = value + value_ptr = &uint8_t_value + value_size = sizeof(uint8_t) elif typecode == 'd': double_value = value value_ptr = &double_value @@ -1978,13 +2115,10 @@ cdef class AlignedSegment: elif typecode == 'B': # the following goes through python, needs to be cleaned up # pack array using struct - if value_type is None: - fmt, args = packTags([(tag, value)]) - else: - fmt, args = packTags([(tag, value, value_type)]) + fmt, args = pack_tags([(tag, value, value_type)]) # remove tag and type code as set by bam_aux_append - # first four chars of format (<2sc) + # first four chars of format (<2sB) fmt = '<' + fmt[4:] # first two values to pack args = args[2:] @@ -2000,7 +2134,7 @@ cdef class AlignedSegment: buffer.raw) return else: - raise ValueError('unsupported value_type in set_option') + raise ValueError('unsupported value_type {} in set_option'.format(typecode)) bam_aux_append(src, tag, @@ -2027,6 +2161,10 @@ cdef class AlignedSegment: This method is the fastest way to access the optional alignment section if only few tags need to be retrieved. + Possible value types are "AcCsSiIfZHB" (see BAM format + specification) as well as additional value type 'd' as + implemented in htslib. 
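
A small retrieval sketch matching the types above (illustrative only; ``with_value_type``
is assumed to be the existing keyword for returning the stored type code together with
the value)::

    import pysam

    read = pysam.AlignedSegment()
    read.set_tag("XC", 200)                     # stored as unsigned 8-bit ('C')
    read.set_tag("XA", "Y", value_type="A")
    read.set_tag("XH", "1AE301", value_type="H")

    # all BAM integer widths ('cCsSiI') are returned as python ints
    assert read.get_tag("XC") == 200

    # 'A'/'a' values come back as a one-character string,
    # 'Z' and 'H' values as ordinary strings
    assert read.get_tag("XA") == "Y"
    assert read.get_tag("XH") == "1AE301"

    # the stored type code can be retrieved alongside the value
    value, value_type = read.get_tag("XC", with_value_type=True)  # 200, 'C'
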
+ Parameters ---------- @@ -2061,19 +2199,20 @@ cdef class AlignedSegment: else: auxtype = chr(v[0]) - if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S': - value = bam_aux2i(v) - elif auxtype == 'i' or auxtype == 'I': - value = bam_aux2i(v) + if auxtype in "iIcCsS": + value = bam_aux2i(v) elif auxtype == 'f' or auxtype == 'F': - value = bam_aux2f(v) + value = bam_aux2f(v) elif auxtype == 'd' or auxtype == 'D': - value = bam_aux2f(v) - elif auxtype == 'A': + value = bam_aux2f(v) + elif auxtype == 'A' or auxtype == 'a': + # force A to a + v[0] = 'A' # there might a more efficient way # to convert a char into a string value = '%c' % bam_aux2A(v) - elif auxtype == 'Z': + elif auxtype == 'Z' or auxtype == 'H': + # Z and H are treated equally as strings in htslib value = charptr_to_str(bam_aux2Z(v)) elif auxtype[0] == 'B': bytesize, nvalues, values = convert_binary_tag(v + 1) @@ -2141,7 +2280,7 @@ cdef class AlignedSegment: elif auxtype == 'd': value = bam_aux2f(s) s += 8 - elif auxtype == 'A': + elif auxtype in ('A', 'a'): value = "%c" % bam_aux2A(s) s += 1 elif auxtype in ('Z', 'H'): @@ -2166,7 +2305,7 @@ cdef class AlignedSegment: return result def set_tags(self, tags): - """sets the fields in the optional alignmest section with + """sets the fields in the optional alignment section with a list of (tag, value) tuples. The :term:`value type` of the values is determined from the @@ -2188,7 +2327,7 @@ cdef class AlignedSegment: # convert and pack the data if tags is not None and len(tags) > 0: - fmt, args = packTags(tags) + fmt, args = pack_tags(tags) new_size = struct.calcsize(fmt) buffer = ctypes.create_string_buffer(new_size) struct.pack_into(fmt, @@ -2196,6 +2335,7 @@ cdef class AlignedSegment: 0, *args) + # delete the old data and allocate new space. # If total_size == 0, the aux field will be # empty diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index d59e704..fb2bd0c 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -4,7 +4,7 @@ from libc.stdlib cimport malloc, calloc, realloc, free from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup from libc.stdio cimport FILE, printf -from pysam.libcfaidx cimport faidx_t, Fastafile +from pysam.libcfaidx cimport faidx_t, FastaFile from pysam.libcalignedsegment cimport AlignedSegment from pysam.libchtslib cimport * @@ -121,7 +121,7 @@ cdef class IteratorColumn: cdef bam_plp_t pileup_iter cdef __iterdata iterdata cdef AlignmentFile samfile - cdef Fastafile fastafile + cdef FastaFile fastafile cdef stepper cdef int max_depth diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index cea312c..1599dfa 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -2149,7 +2149,7 @@ cdef class IteratorColumn: def __get__(self): return self.iterdata.seq_len - def addReference(self, Fastafile fastafile): + def addReference(self, FastaFile fastafile): ''' add reference sequences in `fastafile` to iterator.''' self.fastafile = fastafile diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index f8b0e38..23e5832 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -53,6 +53,7 @@ # DEALINGS IN THE SOFTWARE. 
# ############################################################################### +import binascii import os import sys @@ -71,9 +72,9 @@ cimport pysam.libctabixproxies as ctabixproxies from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ - tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ + tbx_index_build2, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ - tbx_destroy, hisremote, region_list, \ + tbx_destroy, hisremote, region_list, hts_getline, \ TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC from pysam.libcutils cimport force_bytes, force_str, charptr_to_str @@ -472,9 +473,9 @@ cdef class TabixFile: # without region or reference - iterate from start with nogil: itr = tbx_itr_queryi(fileobj.index, - HTS_IDX_START, - 0, - 0) + HTS_IDX_START, + 0, + 0) else: s = force_bytes(region, encoding=fileobj.encoding) cstr = s @@ -528,18 +529,42 @@ cdef class TabixFile: .. note:: The header is returned as an iterator presenting lines without the newline character. - - .. note:: - The header is only available for local files. For remote - files an Attribute Error is raised. - ''' def __get__(self): - if self.is_remote: - raise AttributeError( - "the header is not available for remote files") - return GZIteratorHead(self.filename) + + cdef char *cfilename = self.filename + + cdef kstring_t buffer + buffer.l = buffer.m = 0 + buffer.s = NULL + + cdef htsFile * fp = NULL + cdef int KS_SEP_LINE = 2 + cdef tbx_t * tbx = NULL + lines = [] + with nogil: + fp = hts_open(cfilename, 'r') + + if fp == NULL: + raise OSError("could not open {} for reading header".format(self.filename)) + + with nogil: + tbx = tbx_index_load(cfilename) + + if tbx == NULL: + raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) + + while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0: + if not buffer.l or buffer.s[0] != tbx.conf.meta_char: + break + lines.append(force_str(buffer.s, self.encoding)) + + with nogil: + hts_close(fp) + free(buffer.s) + + return lines property contigs: '''list of chromosome names''' @@ -843,16 +868,25 @@ def tabix_compress(filename_in, raise IOError("error %i when closing file %s" % (r, filename_in)) -def tabix_index(filename, +def is_gzip_file(filename): + gzip_magic_hex = b'1f8b' + fd = os.open(filename, os.O_RDONLY) + header = os.read(fd, 2) + return header == binascii.a2b_hex(gzip_magic_hex) + + +def tabix_index(filename, force=False, - seq_col=None, - start_col=None, + seq_col=None, + start_col=None, end_col=None, preset=None, meta_char="#", int line_skip=0, zerobased=False, int min_shift=-1, + index=None, + keep_original=False, ): '''index tab-separated *filename* using tabix. @@ -876,20 +910,22 @@ def tabix_index(filename, Lines beginning with *meta_char* and the first *line_skip* lines will be skipped. - - If *filename* does not end in ".gz", it will be automatically - compressed. The original file will be removed and only the - compressed file will be retained. - If *filename* ends in *gz*, the file is assumed to be already - compressed with bgzf. + If *filename* is not detected as a gzip file it will be automatically + compressed. The original file will be removed and only the compressed + file will be retained. 
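
A short usage sketch of the updated function (the file names are placeholders and the
semantics of the new ``index`` and ``keep_original`` keyword arguments are inferred from
their names and the changed signature, not stated authoritatively)::

    import pysam

    # compress annotations.bed (unless it is already bgzip-compressed),
    # keep the uncompressed original and write the index to a custom path
    fn = pysam.tabix_index("annotations.bed",
                           preset="bed",
                           force=True,
                           keep_original=True,
                           index="custom/annotations.bed.gz.tbi")
    # fn is the name of the compressed file, e.g. "annotations.bed.gz"
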
*min-shift* sets the minimal interval size to 1< #include diff --git a/samtools/bam_lpileup.c b/samtools/bam_lpileup.c index e20cc92..cc7a75b 100644 --- a/samtools/bam_lpileup.c +++ b/samtools/bam_lpileup.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "bam_plbuf.h" #include "bam_lpileup.h" +#include "samtools.h" #include #define TV_GAP 2 diff --git a/samtools/bam_lpileup.c.pysam.c b/samtools/bam_lpileup.c.pysam.c index 9f7f063..93fde4f 100644 --- a/samtools/bam_lpileup.c.pysam.c +++ b/samtools/bam_lpileup.c.pysam.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "bam_plbuf.h" #include "bam_lpileup.h" +#include "samtools.h" #include #define TV_GAP 2 diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c new file mode 100644 index 0000000..cf6a82a --- /dev/null +++ b/samtools/bam_markdup.c @@ -0,0 +1,844 @@ +/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone + through fixmates with the mate scoring option on. + + Copyright (C) 2017 Genome Research Ltd. + + Author: Andrew Whitwham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include "htslib/thread_pool.h" +#include "htslib/sam.h" +#include "sam_opts.h" +#include "samtools.h" +#include "htslib/khash.h" +#include "htslib/klist.h" + +typedef struct { + int32_t single; + int32_t this_ref; + int32_t this_coord; + int32_t other_ref; + int32_t other_coord; + int32_t leftmost; + int32_t orientation; +} key_data_t; + +typedef struct { + bam1_t *p; +} in_hash_t; + +typedef struct { + bam1_t *b; + int32_t pos; + key_data_t pair_key; + key_data_t single_key; +} read_queue_t; + + + +static khint32_t do_hash(unsigned char *key, khint32_t len); + +static khint_t hash_key(key_data_t key) { + int i = 0; + khint_t hash; + + if (key.single) { + unsigned char sig[12]; + + memcpy(sig + i, &key.this_ref, 4); i += 4; + memcpy(sig + i, &key.this_coord, 4); i += 4; + memcpy(sig + i, &key.orientation, 4); i += 4; + + hash = do_hash(sig, i); + } else { + unsigned char sig[24]; + + memcpy(sig + i, &key.this_ref, 4); i += 4; + memcpy(sig + i, &key.this_coord, 4); i += 4; + memcpy(sig + i, &key.other_ref, 4); i += 4; + memcpy(sig + i, &key.other_coord, 4); i += 4; + memcpy(sig + i, &key.leftmost, 4); i += 4; + memcpy(sig + i, &key.orientation, 4); i += 4; + + hash = do_hash(sig, i); + } + + return hash; +} + + +static int key_equal(key_data_t a, key_data_t b) { + int match = 1; + + if (a.this_coord != b.this_coord) + match = 0; + else if (a.orientation != b.orientation) + match = 0; + else if (a.this_ref != b.this_ref) + match = 0; + else if (a.single != b.single) + match = 0; + + if (!a.single) { + if (a.other_coord != b.other_coord) + match = 0; + else if (a.leftmost != b.leftmost) + match = 0; + else if (a.other_ref != b.other_ref) + match = 0; + } + + return match; +} + + +#define __free_queue_element(p) +#define O_FF 2 +#define O_RR 3 +#define O_FR 5 +#define O_RF 7 + +KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash +KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer + + +/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ + +static int32_t unclipped_other_start(int32_t op, char *cigar) { + char *c = cigar; + int32_t clipped = 0; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + if (*c == 'S' || *c == 'H') { // clips + clipped += num; + } else { + break; + } + + c++; + } + + return op - clipped + 1; +} + + +/* Calculate the current read's start based on the stored cigar string. 
*/ + +static int32_t unclipped_start(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int32_t clipped = 0; + uint32_t i; + + for (i = 0; i < b->core.n_cigar; i++) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return b->core.pos - clipped + 1; +} + + +/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ + +static int32_t unclipped_other_end(int32_t op, char *cigar) { + char *c = cigar; + int32_t refpos = 0; + int skip = 1; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + switch (*c) { + case 'M': + case 'D': + case 'N': + case '=': + case 'X': + refpos += num; + skip = 0; // ignore initial clips + break; + + case 'S': + case 'H': + if (!skip) { + refpos += num; + } + break; + } + + c++; + } + + return op + refpos; +} + + +/* Calculate the current read's end based on the stored cigar string. */ + +static int32_t unclipped_end(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int32_t end_pos, clipped = 0; + int32_t i; + + end_pos = bam_endpos(b); + + // now get the clipped end bases (if any) + // if we get to the beginning of the cigar string + // without hitting a non-clip then the results are meaningless + for (i = b->core.n_cigar - 1; i >= 0; i--) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return end_pos + clipped; +} + + +/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ + +static khint32_t do_hash(unsigned char *key, khint32_t len) { + khint32_t hash, i; + + for (hash = 0, i = 0; i < len; ++i) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + + +/* Get mate score from tag. */ + +static int64_t get_mate_score(bam1_t *b) { + uint8_t *data; + int64_t score; + + if ((data = bam_aux_get(b, "ms"))) { + score = bam_aux2i(data); + } else { + fprintf(stderr, "[markdup] error: no ms score tag.\n"); + return -1; + } + + return score; +} + + +/* Calc current score from quality. */ + +static int64_t calc_score(bam1_t *b) +{ + int64_t score = 0; + uint8_t *qual = bam_get_qual(b); + int i; + + for (i = 0; i < b->core.l_qseq; i++) { + if (qual[i] >= 15) score += qual[i]; + } + + return score; +} + + +/* Create a signature hash of the current read and its pair. + Uses the unclipped start (or end depending on orientation), + the reference id, orientation and whether the current + read is leftmost of the pair. 
*/ + +static int make_pair_key(key_data_t *key, bam1_t *bam) { + int32_t this_ref, this_coord, this_end; + int32_t other_ref, other_coord, other_end; + int32_t orientation, leftmost; + uint8_t *data; + char *cig; + + this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash + other_ref = bam->core.mtid + 1; + + this_coord = unclipped_start(bam); + this_end = unclipped_end(bam); + + if ((data = bam_aux_get(bam, "MC"))) { + cig = bam_aux2Z(data); + other_end = unclipped_other_end(bam->core.mpos, cig); + other_coord = unclipped_other_start(bam->core.mpos, cig); + } else { + fprintf(stderr, "[markdup] error: no MC tag.\n"); + return 1; + } + + // work out orientations + if (this_ref != other_ref) { + leftmost = this_ref < other_ref; + } else { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord <= other_coord; + } else { + leftmost = this_end <= other_end; + } + } else { + if (bam_is_rev(bam)) { + leftmost = this_end <= other_coord; + } else { + leftmost = this_coord <= other_end; + } + } + } + + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + other_coord = other_end; + + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } + } else { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } + } + } else { + if (!bam_is_rev(bam)) { + orientation = O_FR; + other_coord = other_end; + } else { + orientation = O_RF; + this_coord = this_end; + } + } + } else { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + this_coord = this_end; + + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } + } else { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } + } + } else { + if (!bam_is_rev(bam)) { + orientation = O_RF; + other_coord = other_end; + } else { + orientation = O_FR; + this_coord = this_end; + } + } + } + + if (!leftmost) + leftmost = 13; + else + leftmost = 11; + + key->single = 0; + key->this_ref = this_ref; + key->this_coord = this_coord; + key->other_ref = other_ref; + key->other_coord = other_coord; + key->leftmost = leftmost; + key->orientation = orientation; + + return 0; +} + + +/* Create a signature hash of single read (or read with an unmatched pair). + Uses unclipped start (or end depending on orientation), reference id, + and orientation. */ + +static void make_single_key(key_data_t *key, bam1_t *bam) { + int32_t this_ref, this_coord; + int32_t orientation; + + this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash + + if (bam_is_rev(bam)) { + this_coord = unclipped_end(bam); + orientation = O_RR; + } else { + this_coord = unclipped_start(bam); + orientation = O_FF; + } + + key->single = 1; + key->this_ref = this_ref; + key->this_coord = this_coord; + key->orientation = orientation; +} + + +/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. + Generally the highest quality scoring is chosen as the original and all others the duplicates. + The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). + While single reads are compared to only one read of a pair, the pair will chosen as the original. + The comparison is done on position and orientation, see above for details. 
*/ + +static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) { + bam_hdr_t *header; + khiter_t k; + khash_t(reads) *pair_hash = kh_init(reads); + khash_t(reads) *single_hash = kh_init(reads); + klist_t(read_queue) *read_buffer = kl_init(read_queue); + kliter_t(read_queue) *rq; + int32_t prev_tid, prev_coord; + read_queue_t *in_read; + int ret; + int reading, writing, excluded, duplicate, single, pair, single_dup, examined; + + if ((header = sam_hdr_read(in)) == NULL) { + fprintf(stderr, "[markdup] error reading header\n"); + return 1; + } + + // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. + // only really works on coordinate sorted files. + if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { + char *p, *q; + + p = strstr(header->text, "\tSO:queryname"); + q = strchr(header->text, '\n'); + + // looking for SO:queryname within @HD only + // (e.g. must ignore in a @CO comment line later in header) + if ((p != 0) && (p < q)) { + fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); + return 1; + } + } + + if (sam_hdr_write(out, header) < 0) { + fprintf(stderr, "[markdup] error writing header.\n"); + return 1; + } + + // used for coordinate order checks + prev_tid = prev_coord = 0; + + // get the buffer going + in_read = kl_pushp(read_queue, read_buffer); + + + if ((in_read->b = bam_init1()) == NULL) { + fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + return 1; + } + + reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0; + + while ((ret = sam_read1(in, header, in_read->b)) >= 0) { + + // do some basic coordinate order checks + if (in_read->b->core.tid >= 0) { // -1 for unmapped reads + if (in_read->b->core.tid < prev_tid || + ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { + fprintf(stderr, "[markdup] error: bad coordinate order.\n"); + return 1; + } + } + + prev_coord = in_read->pos = in_read->b->core.pos; + prev_tid = in_read->b->core.tid; + in_read->pair_key.single = 1; + in_read->single_key.single = 0; + + reading++; + + // read must not be secondary, supplementary, unmapped or failed QC + if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { + examined++; + + // look at the pairs first + if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { + int ret, mate_tmp; + key_data_t pair_key; + key_data_t single_key; + in_hash_t *bp; + + if (make_pair_key(&pair_key, in_read->b)) { + fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); + return 1; + } + + make_single_key(&single_key, in_read->b); + + pair++; + in_read->pos = single_key.this_coord; // cigar/orientation modified pos + + // put in singles hash for checking against non paired reads + k = kh_put(reads, single_hash, single_key, &ret); + + if (ret > 0) { // new + // add to single duplicate hash + bp = &kh_val(single_hash, k); + bp->p = in_read->b; + in_read->single_key = single_key; + } else if (ret == 0) { // exists + // look at singles only for duplication marking + bp = &kh_val(single_hash, k); + + if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { + bam1_t *dup = bp->p; + + // singleton will always be marked duplicate even if + // scores more than one read of the pair + + bp->p = in_read->b; + dup->core.flag |= BAM_FDUP; + single_dup++; + } + } else { + fprintf(stderr, "[markdup] 
error: single hashing failure.\n"); + return 1; + } + + // now do the pair + k = kh_put(reads, pair_hash, pair_key, &ret); + + if (ret > 0) { // new + // add to the pair hash + bp = &kh_val(pair_hash, k); + bp->p = in_read->b; + in_read->pair_key = pair_key; + } else if (ret == 0) { + int64_t old_score, new_score, tie_add = 0; + bam1_t *dup; + + bp = &kh_val(pair_hash, k); + + if ((mate_tmp = get_mate_score(bp->p)) == -1) { + fprintf(stderr, "[markdup] error: no ms score tag.\n"); + return 1; + } else { + old_score = calc_score(bp->p) + mate_tmp; + } + + if ((mate_tmp = get_mate_score(in_read->b)) == -1) { + fprintf(stderr, "[markdup] error: no ms score tag.\n"); + return 1; + } else { + new_score = calc_score(in_read->b) + mate_tmp; + } + + // choose the highest score as the original + // and add it to the pair hash, mark the other as duplicate + + if (new_score == old_score) { + if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { + tie_add = 1; + } else { + tie_add = -1; + } + } + + if (new_score + tie_add > old_score) { // swap reads + dup = bp->p; + bp->p = in_read->b; + } else { + dup = in_read->b; + } + + dup->core.flag |= BAM_FDUP; + + duplicate++; + } else { + fprintf(stderr, "[markdup] error: pair hashing failure.\n"); + return 1; + } + } else { // do the single (or effectively single) reads + int ret; + key_data_t single_key; + in_hash_t *bp; + + make_single_key(&single_key, in_read->b); + + single++; + in_read->pos = single_key.this_coord; // cigar/orientation modified pos + + k = kh_put(reads, single_hash, single_key, &ret); + + if (ret > 0) { // new + bp = &kh_val(single_hash, k); + bp->p = in_read->b; + in_read->single_key = single_key; + } else if (ret == 0) { // exists + bp = &kh_val(single_hash, k); + + if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { + // if matched against one of a pair just mark as duplicate + in_read->b->core.flag |= BAM_FDUP; + } else { + int64_t old_score, new_score; + bam1_t *dup; + + old_score = calc_score(bp->p); + new_score = calc_score(in_read->b); + + // choose the highest score as the original, add it + // to the single hash and mark the other as duplicate + if (new_score > old_score) { // swap reads + dup = bp->p; + bp->p = in_read->b; + } else { + dup = in_read->b; + } + + dup->core.flag |= BAM_FDUP; + } + + single_dup++; + } else { + fprintf(stderr, "[markdup] error: single hashing failure.\n"); + return 1; + } + } + } else { + excluded++; + } + + // loop through the stored reads and write out those we + // no longer need + rq = kl_begin(read_buffer); + while (rq != kl_end(read_buffer)) { + in_read = &kl_val(rq); + + /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads + should just be written as they cannot be matched as duplicates. 
*/ + if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { + break; + } + + if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(stderr, "[markdup] error: writing output failed.\n"); + return 1; + } + + writing++; + } + + // remove from hash + if (in_read->pair_key.single == 0) { + k = kh_get(reads, pair_hash, in_read->pair_key); + kh_del(reads, pair_hash, k); + } + + if (in_read->single_key.single == 1) { + k = kh_get(reads, single_hash, in_read->single_key); + kh_del(reads, single_hash, k); + } + + kl_shift(read_queue, read_buffer, NULL); + bam_destroy1(in_read->b); + rq = kl_begin(read_buffer); + } + + // set the next one up for reading + in_read = kl_pushp(read_queue, read_buffer); + + if ((in_read->b = bam_init1()) == NULL) { + fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + return 1; + } + } + + if (ret < -1) { + fprintf(stderr, "[markdup] error: truncated input file.\n"); + return 1; + } + + // write out the end of the list + rq = kl_begin(read_buffer); + while (rq != kl_end(read_buffer)) { + in_read = &kl_val(rq); + + if (bam_get_qname(in_read->b)) { // last entry will be blank + if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(stderr, "[markdup] error: writing final output failed.\n"); + return 1; + } + + writing++; + } + } + + kl_shift(read_queue, read_buffer, NULL); + bam_destroy1(in_read->b); + rq = kl_begin(read_buffer); + } + + if (do_stats) { + fprintf(stderr, "READ %d WRITTEN %d \n" + "EXCLUDED %d EXAMINED %d\n" + "PAIRED %d SINGLE %d\n" + "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" + "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, + duplicate, single_dup, single_dup + duplicate); + } + + kh_destroy(reads, pair_hash); + kh_destroy(reads, single_hash); + kl_destroy(read_queue, read_buffer); + bam_hdr_destroy(header); + + return 0; +} + + +static int markdup_usage(void) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools markdup \n\n"); + fprintf(stderr, "Option: \n"); + fprintf(stderr, " -r Remove duplicate reads\n"); + fprintf(stderr, " -l Max read length (default 300 bases)\n"); + fprintf(stderr, " -s Report stats.\n"); + + sam_global_opt_help(stderr, "-.O..@"); + + fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone" + " through fixmates with the mate scoring option on.\n"); + + return 1; +} + + +int bam_markdup(int argc, char **argv) { + int c, ret, remove_dups = 0, report_stats = 0; + int32_t max_length = 300; + samFile *in = NULL, *out = NULL; + char wmode[3] = {'w', 'b', 0}; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) { + switch (c) { + case 'r': remove_dups = 1; break; + case 'l': max_length = atoi(optarg); break; + case 's': report_stats = 1; break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': return markdup_usage(); + } + } + + if (optind + 2 > argc) + return markdup_usage(); + + in = sam_open_format(argv[optind], "r", &ga.in); + + if (!in) { + print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); + return 1; + } + + sam_open_mode(wmode + 1, argv[optind + 1], 
NULL); + out = sam_open_format(argv[optind + 1], wmode, &ga.out); + + if (!out) { + print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); + return 1; + } + + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(stderr, "[markdup] error creating thread pool\n"); + return 1; + } + + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } + + // actual stuff happens here + ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats); + + sam_close(in); + + if (sam_close(out) < 0) { + fprintf(stderr, "[markdup] error closing output file\n"); + ret = 1; + } + + if (p.pool) hts_tpool_destroy(p.pool); + + sam_global_args_free(&ga); + + return ret; +} diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c new file mode 100644 index 0000000..11b298c --- /dev/null +++ b/samtools/bam_markdup.c.pysam.c @@ -0,0 +1,846 @@ +#include "pysam.h" + +/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone + through fixmates with the mate scoring option on. + + Copyright (C) 2017 Genome Research Ltd. + + Author: Andrew Whitwham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include "htslib/thread_pool.h" +#include "htslib/sam.h" +#include "sam_opts.h" +#include "samtools.h" +#include "htslib/khash.h" +#include "htslib/klist.h" + +typedef struct { + int32_t single; + int32_t this_ref; + int32_t this_coord; + int32_t other_ref; + int32_t other_coord; + int32_t leftmost; + int32_t orientation; +} key_data_t; + +typedef struct { + bam1_t *p; +} in_hash_t; + +typedef struct { + bam1_t *b; + int32_t pos; + key_data_t pair_key; + key_data_t single_key; +} read_queue_t; + + + +static khint32_t do_hash(unsigned char *key, khint32_t len); + +static khint_t hash_key(key_data_t key) { + int i = 0; + khint_t hash; + + if (key.single) { + unsigned char sig[12]; + + memcpy(sig + i, &key.this_ref, 4); i += 4; + memcpy(sig + i, &key.this_coord, 4); i += 4; + memcpy(sig + i, &key.orientation, 4); i += 4; + + hash = do_hash(sig, i); + } else { + unsigned char sig[24]; + + memcpy(sig + i, &key.this_ref, 4); i += 4; + memcpy(sig + i, &key.this_coord, 4); i += 4; + memcpy(sig + i, &key.other_ref, 4); i += 4; + memcpy(sig + i, &key.other_coord, 4); i += 4; + memcpy(sig + i, &key.leftmost, 4); i += 4; + memcpy(sig + i, &key.orientation, 4); i += 4; + + hash = do_hash(sig, i); + } + + return hash; +} + + +static int key_equal(key_data_t a, key_data_t b) { + int match = 1; + + if (a.this_coord != b.this_coord) + match = 0; + else if (a.orientation != b.orientation) + match = 0; + else if (a.this_ref != b.this_ref) + match = 0; + else if (a.single != b.single) + match = 0; + + if (!a.single) { + if (a.other_coord != b.other_coord) + match = 0; + else if (a.leftmost != b.leftmost) + match = 0; + else if (a.other_ref != b.other_ref) + match = 0; + } + + return match; +} + + +#define __free_queue_element(p) +#define O_FF 2 +#define O_RR 3 +#define O_FR 5 +#define O_RF 7 + +KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash +KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer + + +/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ + +static int32_t unclipped_other_start(int32_t op, char *cigar) { + char *c = cigar; + int32_t clipped = 0; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + if (*c == 'S' || *c == 'H') { // clips + clipped += num; + } else { + break; + } + + c++; + } + + return op - clipped + 1; +} + + +/* Calculate the current read's start based on the stored cigar string. 
*/ + +static int32_t unclipped_start(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int32_t clipped = 0; + uint32_t i; + + for (i = 0; i < b->core.n_cigar; i++) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return b->core.pos - clipped + 1; +} + + +/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ + +static int32_t unclipped_other_end(int32_t op, char *cigar) { + char *c = cigar; + int32_t refpos = 0; + int skip = 1; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + switch (*c) { + case 'M': + case 'D': + case 'N': + case '=': + case 'X': + refpos += num; + skip = 0; // ignore initial clips + break; + + case 'S': + case 'H': + if (!skip) { + refpos += num; + } + break; + } + + c++; + } + + return op + refpos; +} + + +/* Calculate the current read's end based on the stored cigar string. */ + +static int32_t unclipped_end(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int32_t end_pos, clipped = 0; + int32_t i; + + end_pos = bam_endpos(b); + + // now get the clipped end bases (if any) + // if we get to the beginning of the cigar string + // without hitting a non-clip then the results are meaningless + for (i = b->core.n_cigar - 1; i >= 0; i--) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return end_pos + clipped; +} + + +/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ + +static khint32_t do_hash(unsigned char *key, khint32_t len) { + khint32_t hash, i; + + for (hash = 0, i = 0; i < len; ++i) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + + +/* Get mate score from tag. */ + +static int64_t get_mate_score(bam1_t *b) { + uint8_t *data; + int64_t score; + + if ((data = bam_aux_get(b, "ms"))) { + score = bam_aux2i(data); + } else { + fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + return -1; + } + + return score; +} + + +/* Calc current score from quality. */ + +static int64_t calc_score(bam1_t *b) +{ + int64_t score = 0; + uint8_t *qual = bam_get_qual(b); + int i; + + for (i = 0; i < b->core.l_qseq; i++) { + if (qual[i] >= 15) score += qual[i]; + } + + return score; +} + + +/* Create a signature hash of the current read and its pair. + Uses the unclipped start (or end depending on orientation), + the reference id, orientation and whether the current + read is leftmost of the pair. 
*/ + +static int make_pair_key(key_data_t *key, bam1_t *bam) { + int32_t this_ref, this_coord, this_end; + int32_t other_ref, other_coord, other_end; + int32_t orientation, leftmost; + uint8_t *data; + char *cig; + + this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash + other_ref = bam->core.mtid + 1; + + this_coord = unclipped_start(bam); + this_end = unclipped_end(bam); + + if ((data = bam_aux_get(bam, "MC"))) { + cig = bam_aux2Z(data); + other_end = unclipped_other_end(bam->core.mpos, cig); + other_coord = unclipped_other_start(bam->core.mpos, cig); + } else { + fprintf(pysam_stderr, "[markdup] error: no MC tag.\n"); + return 1; + } + + // work out orientations + if (this_ref != other_ref) { + leftmost = this_ref < other_ref; + } else { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord <= other_coord; + } else { + leftmost = this_end <= other_end; + } + } else { + if (bam_is_rev(bam)) { + leftmost = this_end <= other_coord; + } else { + leftmost = this_coord <= other_end; + } + } + } + + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + other_coord = other_end; + + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } + } else { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } + } + } else { + if (!bam_is_rev(bam)) { + orientation = O_FR; + other_coord = other_end; + } else { + orientation = O_RF; + this_coord = this_end; + } + } + } else { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + this_coord = this_end; + + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } + } else { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } + } + } else { + if (!bam_is_rev(bam)) { + orientation = O_RF; + other_coord = other_end; + } else { + orientation = O_FR; + this_coord = this_end; + } + } + } + + if (!leftmost) + leftmost = 13; + else + leftmost = 11; + + key->single = 0; + key->this_ref = this_ref; + key->this_coord = this_coord; + key->other_ref = other_ref; + key->other_coord = other_coord; + key->leftmost = leftmost; + key->orientation = orientation; + + return 0; +} + + +/* Create a signature hash of single read (or read with an unmatched pair). + Uses unclipped start (or end depending on orientation), reference id, + and orientation. */ + +static void make_single_key(key_data_t *key, bam1_t *bam) { + int32_t this_ref, this_coord; + int32_t orientation; + + this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash + + if (bam_is_rev(bam)) { + this_coord = unclipped_end(bam); + orientation = O_RR; + } else { + this_coord = unclipped_start(bam); + orientation = O_FF; + } + + key->single = 1; + key->this_ref = this_ref; + key->this_coord = this_coord; + key->orientation = orientation; +} + + +/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. + Generally the highest quality scoring is chosen as the original and all others the duplicates. + The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). + While single reads are compared to only one read of a pair, the pair will chosen as the original. + The comparison is done on position and orientation, see above for details. 
*/ + +static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) { + bam_hdr_t *header; + khiter_t k; + khash_t(reads) *pair_hash = kh_init(reads); + khash_t(reads) *single_hash = kh_init(reads); + klist_t(read_queue) *read_buffer = kl_init(read_queue); + kliter_t(read_queue) *rq; + int32_t prev_tid, prev_coord; + read_queue_t *in_read; + int ret; + int reading, writing, excluded, duplicate, single, pair, single_dup, examined; + + if ((header = sam_hdr_read(in)) == NULL) { + fprintf(pysam_stderr, "[markdup] error reading header\n"); + return 1; + } + + // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. + // only really works on coordinate sorted files. + if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { + char *p, *q; + + p = strstr(header->text, "\tSO:queryname"); + q = strchr(header->text, '\n'); + + // looking for SO:queryname within @HD only + // (e.g. must ignore in a @CO comment line later in header) + if ((p != 0) && (p < q)) { + fprintf(pysam_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); + return 1; + } + } + + if (sam_hdr_write(out, header) < 0) { + fprintf(pysam_stderr, "[markdup] error writing header.\n"); + return 1; + } + + // used for coordinate order checks + prev_tid = prev_coord = 0; + + // get the buffer going + in_read = kl_pushp(read_queue, read_buffer); + + + if ((in_read->b = bam_init1()) == NULL) { + fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + return 1; + } + + reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0; + + while ((ret = sam_read1(in, header, in_read->b)) >= 0) { + + // do some basic coordinate order checks + if (in_read->b->core.tid >= 0) { // -1 for unmapped reads + if (in_read->b->core.tid < prev_tid || + ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { + fprintf(pysam_stderr, "[markdup] error: bad coordinate order.\n"); + return 1; + } + } + + prev_coord = in_read->pos = in_read->b->core.pos; + prev_tid = in_read->b->core.tid; + in_read->pair_key.single = 1; + in_read->single_key.single = 0; + + reading++; + + // read must not be secondary, supplementary, unmapped or failed QC + if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { + examined++; + + // look at the pairs first + if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { + int ret, mate_tmp; + key_data_t pair_key; + key_data_t single_key; + in_hash_t *bp; + + if (make_pair_key(&pair_key, in_read->b)) { + fprintf(pysam_stderr, "[markdup] error: unable to assign pair hash key.\n"); + return 1; + } + + make_single_key(&single_key, in_read->b); + + pair++; + in_read->pos = single_key.this_coord; // cigar/orientation modified pos + + // put in singles hash for checking against non paired reads + k = kh_put(reads, single_hash, single_key, &ret); + + if (ret > 0) { // new + // add to single duplicate hash + bp = &kh_val(single_hash, k); + bp->p = in_read->b; + in_read->single_key = single_key; + } else if (ret == 0) { // exists + // look at singles only for duplication marking + bp = &kh_val(single_hash, k); + + if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { + bam1_t *dup = bp->p; + + // singleton will always be marked duplicate even if + // scores more than one read of the pair + + bp->p = in_read->b; + dup->core.flag |= BAM_FDUP; + single_dup++; + } + } 
else { + fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n"); + return 1; + } + + // now do the pair + k = kh_put(reads, pair_hash, pair_key, &ret); + + if (ret > 0) { // new + // add to the pair hash + bp = &kh_val(pair_hash, k); + bp->p = in_read->b; + in_read->pair_key = pair_key; + } else if (ret == 0) { + int64_t old_score, new_score, tie_add = 0; + bam1_t *dup; + + bp = &kh_val(pair_hash, k); + + if ((mate_tmp = get_mate_score(bp->p)) == -1) { + fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + return 1; + } else { + old_score = calc_score(bp->p) + mate_tmp; + } + + if ((mate_tmp = get_mate_score(in_read->b)) == -1) { + fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + return 1; + } else { + new_score = calc_score(in_read->b) + mate_tmp; + } + + // choose the highest score as the original + // and add it to the pair hash, mark the other as duplicate + + if (new_score == old_score) { + if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { + tie_add = 1; + } else { + tie_add = -1; + } + } + + if (new_score + tie_add > old_score) { // swap reads + dup = bp->p; + bp->p = in_read->b; + } else { + dup = in_read->b; + } + + dup->core.flag |= BAM_FDUP; + + duplicate++; + } else { + fprintf(pysam_stderr, "[markdup] error: pair hashing failure.\n"); + return 1; + } + } else { // do the single (or effectively single) reads + int ret; + key_data_t single_key; + in_hash_t *bp; + + make_single_key(&single_key, in_read->b); + + single++; + in_read->pos = single_key.this_coord; // cigar/orientation modified pos + + k = kh_put(reads, single_hash, single_key, &ret); + + if (ret > 0) { // new + bp = &kh_val(single_hash, k); + bp->p = in_read->b; + in_read->single_key = single_key; + } else if (ret == 0) { // exists + bp = &kh_val(single_hash, k); + + if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { + // if matched against one of a pair just mark as duplicate + in_read->b->core.flag |= BAM_FDUP; + } else { + int64_t old_score, new_score; + bam1_t *dup; + + old_score = calc_score(bp->p); + new_score = calc_score(in_read->b); + + // choose the highest score as the original, add it + // to the single hash and mark the other as duplicate + if (new_score > old_score) { // swap reads + dup = bp->p; + bp->p = in_read->b; + } else { + dup = in_read->b; + } + + dup->core.flag |= BAM_FDUP; + } + + single_dup++; + } else { + fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n"); + return 1; + } + } + } else { + excluded++; + } + + // loop through the stored reads and write out those we + // no longer need + rq = kl_begin(read_buffer); + while (rq != kl_end(read_buffer)) { + in_read = &kl_val(rq); + + /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads + should just be written as they cannot be matched as duplicates. 
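The window test applied just below can be read in isolation as follows (an illustrative sketch with hypothetical names and values; the real code uses the record's clip-adjusted position and the tid/pos of the read most recently taken from the input):

#include <stdio.h>

// A buffered read may be flushed once the reader has moved so far past it
// that no later record could still pair up with it as a duplicate.
static int can_flush(int buf_tid, long buf_pos,
                     int cur_tid, long cur_pos, long max_length) {
    if (buf_tid != cur_tid)
        return 1;                               // new reference: safe to write
    return buf_pos + max_length <= cur_pos;     // window has moved past it
}

int main(void) {
    // With a 300 base window, a read buffered at 10,000 is held until the
    // input position reaches 10,300 on the same reference.
    printf("%d\n", can_flush(0, 10000, 0, 10250, 300));   // 0: keep buffering
    printf("%d\n", can_flush(0, 10000, 0, 10300, 300));   // 1: flush
    return 0;
}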
*/ + if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { + break; + } + + if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(pysam_stderr, "[markdup] error: writing output failed.\n"); + return 1; + } + + writing++; + } + + // remove from hash + if (in_read->pair_key.single == 0) { + k = kh_get(reads, pair_hash, in_read->pair_key); + kh_del(reads, pair_hash, k); + } + + if (in_read->single_key.single == 1) { + k = kh_get(reads, single_hash, in_read->single_key); + kh_del(reads, single_hash, k); + } + + kl_shift(read_queue, read_buffer, NULL); + bam_destroy1(in_read->b); + rq = kl_begin(read_buffer); + } + + // set the next one up for reading + in_read = kl_pushp(read_queue, read_buffer); + + if ((in_read->b = bam_init1()) == NULL) { + fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + return 1; + } + } + + if (ret < -1) { + fprintf(pysam_stderr, "[markdup] error: truncated input file.\n"); + return 1; + } + + // write out the end of the list + rq = kl_begin(read_buffer); + while (rq != kl_end(read_buffer)) { + in_read = &kl_val(rq); + + if (bam_get_qname(in_read->b)) { // last entry will be blank + if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(pysam_stderr, "[markdup] error: writing final output failed.\n"); + return 1; + } + + writing++; + } + } + + kl_shift(read_queue, read_buffer, NULL); + bam_destroy1(in_read->b); + rq = kl_begin(read_buffer); + } + + if (do_stats) { + fprintf(pysam_stderr, "READ %d WRITTEN %d \n" + "EXCLUDED %d EXAMINED %d\n" + "PAIRED %d SINGLE %d\n" + "DUPLICATE PAIR %d DUPLICATE SINGLE %d\n" + "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, + duplicate, single_dup, single_dup + duplicate); + } + + kh_destroy(reads, pair_hash); + kh_destroy(reads, single_hash); + kl_destroy(read_queue, read_buffer); + bam_hdr_destroy(header); + + return 0; +} + + +static int markdup_usage(void) { + fprintf(pysam_stderr, "\n"); + fprintf(pysam_stderr, "Usage: samtools markdup <input.bam> <output.bam>\n\n"); + fprintf(pysam_stderr, "Option: \n"); + fprintf(pysam_stderr, " -r Remove duplicate reads\n"); + fprintf(pysam_stderr, " -l Max read length (default 300 bases)\n"); + fprintf(pysam_stderr, " -s Report stats.\n"); + + sam_global_opt_help(pysam_stderr, "-.O..@"); + + fprintf(pysam_stderr, "\nThe input file must be coordinate sorted and must have gone" + " through fixmates with the mate scoring option on.\n"); + + return 1; +} + + +int bam_markdup(int argc, char **argv) { + int c, ret, remove_dups = 0, report_stats = 0; + int32_t max_length = 300; + samFile *in = NULL, *out = NULL; + char wmode[3] = {'w', 'b', 0}; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool p = {NULL, 0}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) { + switch (c) { + case 'r': remove_dups = 1; break; + case 'l': max_length = atoi(optarg); break; + case 's': report_stats = 1; break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': return markdup_usage(); + } + } + + if (optind + 2 > argc) + return markdup_usage(); + + in = sam_open_format(argv[optind], "r", &ga.in); + + if (!in) { + print_error_errno("markdup", "failed to open \"%s\" for input",
argv[optind]); + return 1; + } + + sam_open_mode(wmode + 1, argv[optind + 1], NULL); + out = sam_open_format(argv[optind + 1], wmode, &ga.out); + + if (!out) { + print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); + return 1; + } + + if (ga.nthreads > 0) { + if (!(p.pool = hts_tpool_init(ga.nthreads))) { + fprintf(pysam_stderr, "[markdup] error creating thread pool\n"); + return 1; + } + + hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); + hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); + } + + // actual stuff happens here + ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats); + + sam_close(in); + + if (sam_close(out) < 0) { + fprintf(pysam_stderr, "[markdup] error closing output file\n"); + ret = 1; + } + + if (p.pool) hts_tpool_destroy(p.pool); + + sam_global_args_free(&ga); + + return ret; +} diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 75c2f51..1d6c55f 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -218,8 +218,39 @@ static int sync_mate(bam1_t* a, bam1_t* b) return 0; } + +static uint32_t calc_mate_score(bam1_t *b) +{ + uint32_t score = 0; + uint8_t *qual = bam_get_qual(b); + int i; + + for (i = 0; i < b->core.l_qseq; i++) { + if (qual[i] >= 15) score += qual[i]; + } + + return score; +} + + +static int add_mate_score(bam1_t *src, bam1_t *dest) +{ + uint8_t *data_ms; + uint32_t mate_score = calc_mate_score(src); + + if ((data_ms = bam_aux_get(dest, "ms")) != NULL) { + bam_aux_del(dest, data_ms); + } + + if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) { + return -1; + } + + return 0; +} + // currently, this function ONLY works if each read has one hit -static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) +static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) { bam_hdr_t *header; bam1_t *b[2] = { NULL, NULL }; @@ -295,6 +326,13 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop cur->core.flag &= ~BAM_FPROPER_PAIR; } + if (do_mate_scoring) { + if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) { + fprintf(stderr, "[bam_mating_core] ERROR: unable to add mate score.\n"); + goto fail; + } + } + // Write out result if ( !remove_reads ) { if (sam_write1(out, header, pre) < 0) goto write_fail; @@ -361,7 +399,8 @@ void usage(FILE* where) "Options:\n" " -r Remove unmapped reads and secondary alignments\n" " -p Disable FR proper pair check\n" -" -c Add template cigar ct tag\n"); +" -c Add template cigar ct tag\n" +" -m Add mate score tag\n"); sam_global_opt_help(where, "-.O..@"); @@ -376,7 +415,7 @@ int bam_mating(int argc, char *argv[]) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { @@ -386,11 +425,12 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; case 'c': add_ct = 1; break; + case 'm': mate_score = 1; break; default: if (parse_sam_global_opt(c, 
optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage(stderr); goto fail; @@ -419,7 +459,7 @@ int bam_mating(int argc, char *argv[]) } // run - res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); // cleanup sam_close(in); diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index a03de96..8857aeb 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -220,8 +220,39 @@ static int sync_mate(bam1_t* a, bam1_t* b) return 0; } + +static uint32_t calc_mate_score(bam1_t *b) +{ + uint32_t score = 0; + uint8_t *qual = bam_get_qual(b); + int i; + + for (i = 0; i < b->core.l_qseq; i++) { + if (qual[i] >= 15) score += qual[i]; + } + + return score; +} + + +static int add_mate_score(bam1_t *src, bam1_t *dest) +{ + uint8_t *data_ms; + uint32_t mate_score = calc_mate_score(src); + + if ((data_ms = bam_aux_get(dest, "ms")) != NULL) { + bam_aux_del(dest, data_ms); + } + + if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) { + return -1; + } + + return 0; +} + // currently, this function ONLY works if each read has one hit -static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) +static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) { bam_hdr_t *header; bam1_t *b[2] = { NULL, NULL }; @@ -297,6 +328,13 @@ static int bam_mating_core(samFile* in, samFile* out, int remove_reads, int prop cur->core.flag &= ~BAM_FPROPER_PAIR; } + if (do_mate_scoring) { + if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) { + fprintf(pysam_stderr, "[bam_mating_core] ERROR: unable to add mate score.\n"); + goto fail; + } + } + // Write out result if ( !remove_reads ) { if (sam_write1(out, header, pre) < 0) goto write_fail; @@ -363,7 +401,8 @@ void usage(FILE* where) "Options:\n" " -r Remove unmapped reads and secondary alignments\n" " -p Disable FR proper pair check\n" -" -c Add template cigar ct tag\n"); +" -c Add template cigar ct tag\n" +" -m Add mate score tag\n"); sam_global_opt_help(where, "-.O..@"); @@ -378,7 +417,7 @@ int bam_mating(int argc, char *argv[]) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[3] = {'w', 'b', 0}; static const struct option lopts[] = { @@ -388,11 +427,12 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(pysam_stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; case 'p': proper_pair_check = 0; break; case 'c': add_ct = 1; break; + case 'm': mate_score = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage(pysam_stderr); goto fail; @@ -421,7 +461,7 @@ int bam_mating(int argc, char *argv[]) } // run - res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); // cleanup sam_close(in); diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index d17e9d6..d451ffd 100644 --- 
a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -113,6 +113,7 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref #define MPLP_PRINT_MAPQ (1<<10) #define MPLP_PER_SAMPLE (1<<11) #define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_PRINT_QNAME (1<<13) void *bed_read(const char *fn); void bed_destroy(void *_h); @@ -220,6 +221,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, fputs("\t0\t*\t*", fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); + if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); } putc('\n', fp); } @@ -642,6 +644,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); + if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); } else { int n = 0; for (j = 0; j < n_plp[i]; ++j) { @@ -698,6 +701,21 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } if (!n) putc('*', pileup_fp); } + + if (conf->flag & MPLP_PRINT_QNAME) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = bam_get_qual(p->b)[p->qpos]; + if ( c < conf->min_baseQ ) continue; + + if (n > 0) putc(',', pileup_fp); + fputs(bam_get_qname(p->b), pileup_fp); + n++; + } + if (!n) putc('*', pileup_fp); + } } } putc('\n', pileup_fp); @@ -898,6 +916,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Output options for mpileup format (without -g/-v):\n" " -O, --output-BP output base positions on reads\n" " -s, --output-MQ output mapping quality\n" +" --output-QNAME output read names\n" " -a output all positions (including zero depth)\n" " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" "\n" @@ -960,6 +979,8 @@ int bam_mpileup(int argc, char *argv[]) {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, + {"output-QNAME", no_argument, NULL, 5}, + {"output-qname", no_argument, NULL, 5}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, @@ -1016,6 +1037,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; + case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == NULL) return 1; diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 03e5f8a..7fd5bea 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -115,6 +115,7 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref #define MPLP_PRINT_MAPQ (1<<10) #define MPLP_PER_SAMPLE (1<<11) #define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_PRINT_QNAME (1<<13) void *bed_read(const char *fn); void bed_destroy(void *_h); @@ -222,6 +223,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, fputs("\t0\t*\t*", fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); + if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); } putc('\n', fp); } @@ -644,6 +646,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); + if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); } else { int n = 0; for (j = 0; j < n_plp[i]; ++j) { @@ -700,6 +703,21 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } if (!n) putc('*', pileup_fp); } + + if (conf->flag & MPLP_PRINT_QNAME) { + n = 0; + putc('\t', pileup_fp); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = &plp[i][j]; + int c = bam_get_qual(p->b)[p->qpos]; + if ( c < conf->min_baseQ ) continue; + + if (n > 0) putc(',', pileup_fp); + fputs(bam_get_qname(p->b), pileup_fp); + n++; + } + if (!n) putc('*', pileup_fp); + } } } putc('\n', pileup_fp); @@ -900,6 +918,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) "Output options for mpileup format (without -g/-v):\n" " -O, --output-BP output base positions on reads\n" " -s, --output-MQ output mapping quality\n" +" --output-QNAME output read names\n" " -a output all positions (including zero depth)\n" " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" "\n" @@ -962,6 +981,8 @@ int bam_mpileup(int argc, char *argv[]) {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, + {"output-QNAME", no_argument, NULL, 5}, + {"output-qname", no_argument, NULL, 5}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, @@ -1018,6 +1039,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; + case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == NULL) return 1; diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index f82686d..562c8e4 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -475,7 +475,7 @@ int main_reheader(int argc, char *argv[]) if (argc - optind != 2) usage(pysam_stderr, 1); - + { // read the header samFile *fph = sam_open(argv[optind], "r"); if (fph == 0) { diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index d32a241..b1d5898 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -38,7 +38,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/bgzf.h" #include "htslib/ksort.h" +#include "htslib/hts_os.h" #include "htslib/khash.h" #include "htslib/klist.h" #include "htslib/kstring.h" @@ -49,10 +51,10 @@ DEALINGS IN THE SOFTWARE. */ // Struct which contains the a record, and the pointer to the sort tag (if any) // Used to speed up sort-by-tag. -typedef struct bam1_p { - bam1_t *b; +typedef struct bam1_tag { + bam1_t *bam_record; const uint8_t *tag; -} bam1_p; +} bam1_tag; /* Minimum memory required in megabytes before sort will attempt to run. This is to prevent accidents where failing to use the -m option correctly results @@ -122,29 +124,36 @@ static int strnum_cmp(const char *_a, const char *_b) typedef struct { int i; uint64_t pos, idx; - bam1_p b; + bam1_tag entry; } heap1_t; -#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) - -static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b); +static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); // Function to compare reads in the heap and determine which one is < the other static inline int heap_lt(const heap1_t a, const heap1_t b) { + if (!a.entry.bam_record) + return 1; + if (!b.entry.bam_record) + return 0; + if (g_is_by_tag) { int t; - if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0; - t = bam1_lt_by_tag(b.b,a.b); - return t; + t = bam1_cmp_by_tag(a.entry, b.entry); + if (t != 0) return t > 0; } else if (g_is_by_qname) { - int t; - if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 
1 : 0; - t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b)); - return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0))); + int t, fa, fb; + t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); + if (t != 0) return t > 0; + fa = a.entry.bam_record->core.flag & 0xc0; + fb = b.entry.bam_record->core.flag & 0xc0; + if (fa != fb) return fa > fb; } else { - return __pos_cmp(a, b); + if (a.pos != b.pos) return a.pos > b.pos; } + // This compares by position in the input file(s) + if (a.i != b.i) return a.i > b.i; + return a.idx > b.idx; } KSORT_INIT(heap, heap1_t, heap_lt) @@ -1351,25 +1360,25 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap1_t *h = heap + i; int res; h->i = i; - h->b.b = bam_init1(); - h->b.tag = NULL; - if (!h->b.b) goto mem_fail; - res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b); + h->entry.bam_record = bam_init1(); + h->entry.tag = NULL; + if (!h->entry.bam_record) goto mem_fail; + res = iter[i] ? sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); if (res >= 0) { - bam_translate(h->b.b, translation_tbl + i); - h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b); + bam_translate(h->entry.bam_record, translation_tbl + i); + h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1)<<1 | bam_is_rev(h->entry.bam_record); h->idx = idx++; if (g_is_by_tag) { - h->b.tag = bam_aux_get(h->b.b, g_sort_tag); + h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag); } else { - h->b.tag = NULL; + h->entry.tag = NULL; } } else if (res == -1 && (!iter[i] || iter[i]->finished)) { h->pos = HEAP_EMPTY; - bam_destroy1(h->b.b); - h->b.b = NULL; - h->b.tag = NULL; + bam_destroy1(h->entry.bam_record); + h->entry.bam_record = NULL; + h->entry.tag = NULL; } else { print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1391,7 +1400,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { - bam1_t *b = heap->b.b; + bam1_t *b = heap->entry.bam_record; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); @@ -1407,15 +1416,15 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; if (g_is_by_tag) { - heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag); + heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); } else { - heap->b.tag = NULL; + heap->entry.tag = NULL; } } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) { heap->pos = HEAP_EMPTY; - bam_destroy1(heap->b.b); - heap->b.b = NULL; - heap->b.tag = NULL; + bam_destroy1(heap->entry.bam_record); + heap->entry.bam_record = NULL; + heap->entry.tag = NULL; } else { print_error(cmd, "\"%s\" is truncated", fn[heap->i]); goto fail; @@ -1459,7 +1468,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m if (iter && iter[i]) hts_itr_destroy(iter[i]); if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); if (fp && fp[i]) sam_close(fp[i]); - if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b); + if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); } if (hout) bam_hdr_destroy(hout); 
free(RG); @@ -1615,6 +1624,169 @@ end: * BAM sorting * ***************/ +typedef struct { + size_t from; + size_t to; +} buf_region; + +/* Simplified version of bam_merge_core2() for merging part-sorted + temporary files. No need for header merging or translation, + it just needs to read data into the heap and push it out again. */ + +static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, + int num_in_mem, buf_region *in_mem, + bam1_tag *buf, uint64_t *idx, bam_hdr_t *hout) { + int i = heap->i, res; + if (i < nfiles) { // read from file + res = sam_read1(fp[i], hout, heap->entry.bam_record); + } else { // read from memory + if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) { + heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record; + res = 0; + } else { + res = -1; + } + } + if (res >= 0) { + heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) + | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)<<1 + | bam_is_rev(heap->entry.bam_record)); + heap->idx = (*idx)++; + if (g_is_by_tag) { + heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); + } else { + heap->entry.tag = NULL; + } + } else if (res == -1) { + heap->pos = HEAP_EMPTY; + if (i < nfiles) bam_destroy1(heap->entry.bam_record); + heap->entry.bam_record = NULL; + heap->entry.tag = NULL; + } else { + return -1; + } + return 0; +} + +static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, + const char *mode, bam_hdr_t *hout, + int n, char * const *fn, int num_in_mem, + buf_region *in_mem, bam1_tag *buf, int n_threads, + const char *cmd, const htsFormat *in_fmt, + const htsFormat *out_fmt) { + samFile *fpout = NULL, **fp = NULL; + heap1_t *heap = NULL; + uint64_t idx = 0; + int i, heap_size = n + num_in_mem; + + g_is_by_qname = by_qname; + if (sort_tag) { + g_is_by_tag = 1; + g_sort_tag[0] = sort_tag[0]; + g_sort_tag[1] = sort_tag[1]; + } + if (n > 0) { + fp = (samFile**)calloc(n, sizeof(samFile*)); + if (!fp) goto mem_fail; + } + heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t)); + if (!heap) goto mem_fail; + + // Open each file, read the header and put the first read into the heap + for (i = 0; i < heap_size; i++) { + bam_hdr_t *hin; + heap1_t *h = &heap[i]; + + if (i < n) { + fp[i] = sam_open_format(fn[i], "r", in_fmt); + if (fp[i] == NULL) { + print_error_errno(cmd, "fail to open \"%s\"", fn[i]); + goto fail; + } + + // Read header ... + hin = sam_hdr_read(fp[i]); + if (hin == NULL) { + print_error(cmd, "failed to read header from \"%s\"", fn[i]); + goto fail; + } + // ... 
and throw it away as we don't really need it + bam_hdr_destroy(hin); + } + + // Get a read into the heap + h->i = i; + h->entry.tag = NULL; + if (i < n) { + h->entry.bam_record = bam_init1(); + if (!h->entry.bam_record) goto mem_fail; + } + if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + assert(i < n); + print_error(cmd, "failed to read first record from \"%s\"", fn[i]); + goto fail; + } + } + + // Open output file and write header + if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { + print_error_errno(cmd, "failed to create \"%s\"", out); + return -1; + } + + hts_set_threads(fpout, n_threads); + + if (sam_hdr_write(fpout, hout) != 0) { + print_error_errno(cmd, "failed to write header to \"%s\"", out); + sam_close(fpout); + return -1; + } + + // Now do the merge + ks_heapmake(heap, heap_size, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->entry.bam_record; + if (sam_write1(fpout, hout, b) < 0) { + print_error_errno(cmd, "failed writing to \"%s\"", out); + sam_close(fpout); + return -1; + } + if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + assert(heap->i < n); + print_error(cmd, "Error reading \"%s\" : %s", + fn[heap->i], strerror(errno)); + goto fail; + } + ks_heapadjust(heap, 0, heap_size, heap); + } + // Clean up and close + for (i = 0; i < n; i++) { + if (sam_close(fp[i]) != 0) { + print_error(cmd, "Error on closing \"%s\" : %s", + fn[i], strerror(errno)); + } + } + free(fp); + free(heap); + if (sam_close(fpout) < 0) { + print_error(cmd, "error closing output file"); + return -1; + } + return 0; + mem_fail: + print_error(cmd, "Out of memory"); + + fail: + for (i = 0; i < n; i++) { + if (fp && fp[i]) sam_close(fp[i]); + if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); + } + free(fp); + free(heap); + if (fpout) sam_close(fpout); + return -1; +} + static int change_SO(bam_hdr_t *h, const char *so) { char *p, *q, *beg = NULL, *end = NULL, *newtext; @@ -1635,29 +1807,41 @@ static int change_SO(bam_hdr_t *h, const char *so) if (beg == NULL) { // no @HD h->l_text += strlen(so) + 15; newtext = (char*)malloc(h->l_text + 1); - sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so); - strcat(newtext, h->text); + if (!newtext) return -1; + snprintf(newtext, h->l_text + 1, + "@HD\tVN:1.3\tSO:%s\n%s", so, h->text); } else { // has @HD but different or no SO h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); newtext = (char*)malloc(h->l_text + 1); - strncpy(newtext, h->text, beg - h->text); - sprintf(newtext + (beg - h->text), "\tSO:%s", so); - strcat(newtext, end); + if (!newtext) return -1; + snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s", + (int) (beg - h->text), h->text, so, end); } free(h->text); h->text = newtext; return 0; } -// Function to compare reads and determine which one is < the other +// Function to compare reads and determine which one is < or > the other // Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag. -static inline int bam1_lt_core(const bam1_p a, const bam1_p b) +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. 
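As a minimal, generic illustration of this three-way convention before the real comparator that follows (illustrative only; a plain hypothetical struct stands in for bam1_tag, and strcmp() for strnum_cmp()):

#include <stdio.h>
#include <string.h>

struct rec { const char *name; int flag; };

// Three-way comparator: negative, zero or positive, with the 0xc0 flag
// bits (read1/read2) as a secondary key, similar to the by-name branch of
// bam1_cmp_core() below.
static int rec_cmp(const struct rec *a, const struct rec *b) {
    int t = strcmp(a->name, b->name);
    if (t != 0) return t;
    return (a->flag & 0xc0) - (b->flag & 0xc0);
}

// A strict "less than" for the sorter is then just a sign test, which is
// how bam1_lt() wraps bam1_cmp_core()/bam1_cmp_by_tag().
static int rec_lt(const struct rec *a, const struct rec *b) {
    return rec_cmp(a, b) < 0;
}

int main(void) {
    struct rec r1 = {"read7", 0x40};
    struct rec r2 = {"read7", 0x80};
    printf("cmp=%d lt=%d\n", rec_cmp(&r1, &r2), rec_lt(&r1, &r2));
    return 0;
}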
+static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b) { + uint64_t pa, pb; + if (!a.bam_record) + return 1; + if (!b.bam_record) + return 0; + if (g_is_by_qname) { - int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b)); - return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0))); + int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record)); + if (t != 0) return t; + return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); } else { - return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b))); + pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1)<<1|bam_is_rev(a.bam_record); + pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1)<<1|bam_is_rev(b.bam_record); + return pa < pb ? -1 : (pa > pb ? 1 : 0); } } @@ -1675,17 +1859,19 @@ uint8_t normalize_type(const uint8_t* aux) { // Sort record by tag, using pos or read name as a secondary key if tags are identical. Reads not carrying the tag sort first. // Tags are first sorted by the type character (in case the types differ), or by the appropriate comparator for that type if they agree. -static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b) +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. +static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) { const uint8_t* aux_a = a.tag; const uint8_t* aux_b = b.tag; if (aux_a == NULL && aux_b != NULL) { - return 1; + return -1; } else if (aux_a != NULL && aux_b == NULL) { - return 0; + return 1; } else if (aux_a == NULL && aux_b == NULL) { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b); } // 'Normalize' the letters of the datatypes to a canonical letter, @@ -1702,57 +1888,62 @@ static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b) b_type = 'f'; } else { // Unfixable mismatched types - return a_type < b_type ? 1 : 0; + return a_type < b_type ? -1 : 1; } } if (a_type == 'c') { int64_t va = bam_aux2i(aux_a); int64_t vb = bam_aux2i(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a, b))); + if (va != vb) return va < vb ? -1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'f') { double va = bam_aux2f(aux_a); double vb = bam_aux2f(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a,b))); + if (va != vb) return va < vb ? -1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'A') { - char va = bam_aux2A(aux_a); - char vb = bam_aux2A(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a,b))); + unsigned char va = bam_aux2A(aux_a); + unsigned char vb = bam_aux2A(aux_b); + if (va != vb) return va < vb ? 
-1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'H') { int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b)); - return (t < 0 || (t == 0 && bam1_lt_core(a,b))); + if (t) return t; + return bam1_cmp_core(a, b); } else { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b); } } // Function to compare reads and determine which one is < the other // Handle sort-by-pos, sort-by-name, or sort-by-tag -static inline int bam1_lt(const bam1_p a, const bam1_p b) +static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { if (g_is_by_tag) { - return bam1_lt_by_tag(a, b); + return bam1_cmp_by_tag(a, b) < 0; } else { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b) < 0; } } -KSORT_INIT(sort, bam1_p, bam1_lt) +KSORT_INIT(sort, bam1_tag, bam1_lt) typedef struct { size_t buf_len; const char *prefix; - bam1_p *buf; + bam1_tag *buf; const bam_hdr_t *h; int index; int error; + int no_save; } worker_t; // Returns 0 for success // -1 for failure -static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) { size_t i; samFile* fp; @@ -1761,7 +1952,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, if (sam_hdr_write(fp, h) != 0) goto fail; if (n_threads > 1) hts_set_threads(fp, n_threads); for (i = 0; i < l; ++i) { - if (sam_write1(fp, h, buf[i].b) < 0) goto fail; + if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; } if (sam_close(fp) < 0) return -1; return 0; @@ -1776,6 +1967,10 @@ static void *worker(void *data) char *name; w->error = 0; ks_mergesort(sort, w->buf_len, w->buf, 0); + + if (w->no_save) + return 0; + name = (char*)calloc(strlen(w->prefix) + 20, 1); if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); @@ -1783,7 +1978,7 @@ static void *worker(void *data) uint32_t max_ncigar = 0; int i; for (i = 0; i < w->buf_len; i++) { - uint32_t nc = w->buf[i].b->core.n_cigar; + uint32_t nc = w->buf[i].bam_record->core.n_cigar; if (max_ncigar < nc) max_ncigar = nc; } @@ -1808,11 +2003,11 @@ static void *worker(void *data) return 0; } -static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_hdr_t *h, int n_threads) +static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, + const bam_hdr_t *h, int n_threads, buf_region *in_mem) { int i; - size_t rest; - bam1_p *b; + size_t pos, rest; pthread_t *tid; pthread_attr_t attr; worker_t *w; @@ -1823,15 +2018,24 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); w = (worker_t*)calloc(n_threads, sizeof(worker_t)); + if (!w) return -1; tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t)); - b = buf; rest = k; + if (!tid) { free(w); return -1; } + pos = 0; rest = k; for (i = 0; i < n_threads; ++i) { w[i].buf_len = rest / (n_threads - i); - w[i].buf = b; + w[i].buf = &buf[pos]; w[i].prefix = prefix; w[i].h = h; w[i].index = n_files + i; - b += w[i].buf_len; rest -= w[i].buf_len; + if (in_mem) { + w[i].no_save = 1; + in_mem[i].from = pos; + in_mem[i].to = pos + w[i].buf_len; + } else { + w[i].no_save = 0; + } + pos += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } for (i = 0; i < n_threads; ++i) { @@ -1843,7 +2047,9 @@ static int sort_blocks(int 
n_files, size_t k, bam1_p *buf, const char *prefix, c } } free(tid); free(w); - return (n_failed == 0)? n_files + n_threads : -1; + if (n_failed) return -1; + if (in_mem) return n_threads; + return n_files + n_threads; } /*! @@ -1862,7 +2068,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c @return 0 for successful sorting, negative on errors @discussion It may create multiple temporary subalignment files - and then merge them by calling bam_merge_core2(). This function is + and then merge them by calling bam_merge_simple(). This function is NOT thread safe. */ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, @@ -1870,12 +2076,22 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt) { - int ret = -1, i, n_files = 0; - size_t mem, max_k, k, max_mem; + int ret = -1, res, i, n_files = 0; + size_t max_k, k, max_mem, bam_mem_offset; bam_hdr_t *header = NULL; samFile *fp; - bam1_p *buf; - bam1_t *b; + bam1_tag *buf = NULL; + bam1_t *b = bam_init1(); + uint8_t *bam_mem = NULL; + char **fns = NULL; + const char *new_so; + buf_region *in_mem = NULL; + int num_in_mem = 0; + + if (!b) { + print_error("sort", "couldn't allocate memory for bam record"); + return -1; + } if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; @@ -1884,13 +2100,12 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const strncpy(g_sort_tag, sort_by_tag, 2); } - max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = NULL; fp = sam_open_format(fn, "r", in_fmt); if (fp == NULL) { print_error_errno("sort", "can't open \"%s\"", fn); - return -2; + goto err; } header = sam_hdr_read(fp); if (header == NULL) { @@ -1899,11 +2114,17 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } if (sort_by_tag != NULL) - change_SO(header, "unknown"); + new_so = "unknown"; else if (is_by_qname) - change_SO(header, "queryname"); + new_so = "queryname"; else - change_SO(header, "coordinate"); + new_so = "coordinate"; + + if (change_SO(header, new_so) != 0) { + print_error("sort", + "failed to change sort order header to '%s'\n", new_so); + goto err; + } // No gain to using the thread pool here as the flow of this code // is such that we are *either* reading *or* sorting. Hence a shared @@ -1911,93 +2132,121 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const if (n_threads > 1) hts_set_threads(fp, n_threads); + if ((bam_mem = malloc(max_mem)) == NULL) { + print_error("sort", "couldn't allocate memory for bam_mem"); + goto err; + } + // write sub files - for (;;) { + k = max_k = bam_mem_offset = 0; + while ((res = sam_read1(fp, header, b)) >= 0) { + int mem_full = 0; + if (k == max_k) { - size_t kk, old_max = max_k; + bam1_tag *new_buf; max_k = max_k? 
max_k<<1 : 0x10000; - buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p)); - for (kk = old_max; kk < max_k; ++kk) { - buf[kk].b = NULL; - buf[kk].tag = NULL; + if ((new_buf = realloc(buf, max_k * sizeof(bam1_tag))) == NULL) { + print_error("sort", "couldn't allocate memory for buf"); + goto err; } + buf = new_buf; } - if (buf[k].b == NULL) buf[k].b = bam_init1(); - b = buf[k].b; - if ((ret = sam_read1(fp, header, b)) < 0) break; - if (b->l_data < b->m_data>>2) { // shrink - b->m_data = b->l_data; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + + // Check if the BAM record will fit in the memory limit + if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) { + // Copy record into the memory block + buf[k].bam_record = (bam1_t *)(bam_mem + bam_mem_offset); + *buf[k].bam_record = *b; + buf[k].bam_record->data = (uint8_t *)((char *)buf[k].bam_record + sizeof(bam1_t)); + memcpy(buf[k].bam_record->data, b->data, b->l_data); + // store next BAM record in next 8-byte-aligned address after + // current one + bam_mem_offset = (bam_mem_offset + sizeof(*b) + b->l_data + 8 - 1) & ~((size_t)(8 - 1)); + } else { + // Add a pointer to the remaining record + buf[k].bam_record = b; + mem_full = 1; } // Pull out the pointer to the sort tag if applicable if (g_is_by_tag) { - buf[k].tag = bam_aux_get(b, g_sort_tag); + buf[k].tag = bam_aux_get(buf[k].bam_record, g_sort_tag); } else { buf[k].tag = NULL; } - - mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; - if (mem >= max_mem) { - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + + if (mem_full) { + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, + NULL); if (n_files < 0) { - ret = -1; goto err; } - mem = k = 0; + k = 0; + bam_mem_offset = 0; } } - if (ret != -1) { + if (res != -1) { print_error("sort", "truncated file. Aborting"); - ret = -1; goto err; } + // Sort last records + if (k > 0) { + in_mem = calloc(n_threads > 0 ? 
n_threads : 1, sizeof(in_mem[0])); + if (!in_mem) goto err; + num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, + in_mem); + if (num_in_mem < 0) goto err; + } else { + num_in_mem = 0; + } + // write the final output - if (n_files == 0) { // a single block + if (n_files == 0 && num_in_mem < 2) { // a single block ks_mergesort(sort, k, buf, 0); if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); - ret = -1; goto err; } } else { // then merge - char **fns; - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); - if (n_files == -1) { - ret = -1; - goto err; - } - fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); + fprintf(stderr, + "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", + n_files, num_in_mem); fns = (char**)calloc(n_files, sizeof(char*)); + if (!fns) goto err; for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + if (!fns[i]) goto err; sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns, - MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, - NULL, n_threads, "sort", in_fmt, out_fmt) < 0) { - // Propagate bam_merge_core2() failure; it has already emitted a + if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, + n_files, fns, num_in_mem, in_mem, buf, + n_threads, "sort", in_fmt, out_fmt) < 0) { + // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; } - for (i = 0; i < n_files; ++i) { - unlink(fns[i]); - free(fns[i]); - } - free(fns); } ret = 0; err: // free - for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b); + if (fns) { + for (i = 0; i < n_files; ++i) { + if (fns[i]) { + unlink(fns[i]); + free(fns[i]); + } + } + free(fns); + } + bam_destroy1(b); free(buf); + free(bam_mem); bam_hdr_destroy(header); - sam_close(fp); + if (fp) sam_close(fp); return ret; } @@ -2006,6 +2255,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma { int ret; char *fnout = calloc(strlen(prefix) + 4 + 1, 1); + if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); free(fnout); diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 524f724..8989fc5 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -40,7 +40,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/bgzf.h" #include "htslib/ksort.h" +#include "htslib/hts_os.h" #include "htslib/khash.h" #include "htslib/klist.h" #include "htslib/kstring.h" @@ -51,10 +53,10 @@ DEALINGS IN THE SOFTWARE. */ // Struct which contains the a record, and the pointer to the sort tag (if any) // Used to speed up sort-by-tag. -typedef struct bam1_p { - bam1_t *b; +typedef struct bam1_tag { + bam1_t *bam_record; const uint8_t *tag; -} bam1_p; +} bam1_tag; /* Minimum memory required in megabytes before sort will attempt to run. 
This is to prevent accidents where failing to use the -m option correctly results @@ -124,29 +126,36 @@ static int strnum_cmp(const char *_a, const char *_b) typedef struct { int i; uint64_t pos, idx; - bam1_p b; + bam1_tag entry; } heap1_t; -#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) - -static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b); +static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); // Function to compare reads in the heap and determine which one is < the other static inline int heap_lt(const heap1_t a, const heap1_t b) { + if (!a.entry.bam_record) + return 1; + if (!b.entry.bam_record) + return 0; + if (g_is_by_tag) { int t; - if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0; - t = bam1_lt_by_tag(b.b,a.b); - return t; + t = bam1_cmp_by_tag(a.entry, b.entry); + if (t != 0) return t > 0; } else if (g_is_by_qname) { - int t; - if (a.b.b == NULL || b.b.b == NULL) return a.b.b == NULL? 1 : 0; - t = strnum_cmp(bam_get_qname(a.b.b), bam_get_qname(b.b.b)); - return (t > 0 || (t == 0 && (a.b.b->core.flag&0xc0) > (b.b.b->core.flag&0xc0))); + int t, fa, fb; + t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); + if (t != 0) return t > 0; + fa = a.entry.bam_record->core.flag & 0xc0; + fb = b.entry.bam_record->core.flag & 0xc0; + if (fa != fb) return fa > fb; } else { - return __pos_cmp(a, b); + if (a.pos != b.pos) return a.pos > b.pos; } + // This compares by position in the input file(s) + if (a.i != b.i) return a.i > b.i; + return a.idx > b.idx; } KSORT_INIT(heap, heap1_t, heap_lt) @@ -1353,25 +1362,25 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap1_t *h = heap + i; int res; h->i = i; - h->b.b = bam_init1(); - h->b.tag = NULL; - if (!h->b.b) goto mem_fail; - res = iter[i] ? sam_itr_next(fp[i], iter[i], h->b.b) : sam_read1(fp[i], hdr[i], h->b.b); + h->entry.bam_record = bam_init1(); + h->entry.tag = NULL; + if (!h->entry.bam_record) goto mem_fail; + res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); if (res >= 0) { - bam_translate(h->b.b, translation_tbl + i); - h->pos = ((uint64_t)h->b.b->core.tid<<32) | (uint32_t)((int32_t)h->b.b->core.pos+1)<<1 | bam_is_rev(h->b.b); + bam_translate(h->entry.bam_record, translation_tbl + i); + h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1)<<1 | bam_is_rev(h->entry.bam_record); h->idx = idx++; if (g_is_by_tag) { - h->b.tag = bam_aux_get(h->b.b, g_sort_tag); + h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag); } else { - h->b.tag = NULL; + h->entry.tag = NULL; } } else if (res == -1 && (!iter[i] || iter[i]->finished)) { h->pos = HEAP_EMPTY; - bam_destroy1(h->b.b); - h->b.b = NULL; - h->b.tag = NULL; + bam_destroy1(h->entry.bam_record); + h->entry.bam_record = NULL; + h->entry.tag = NULL; } else { print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1393,7 +1402,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { - bam1_t *b = heap->b.b; + bam1_t *b = heap->entry.bam_record; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); @@ -1409,15 +1418,15 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; if (g_is_by_tag) { - heap->b.tag = bam_aux_get(heap->b.b, g_sort_tag); + heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); } else { - heap->b.tag = NULL; + heap->entry.tag = NULL; } } else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) { heap->pos = HEAP_EMPTY; - bam_destroy1(heap->b.b); - heap->b.b = NULL; - heap->b.tag = NULL; + bam_destroy1(heap->entry.bam_record); + heap->entry.bam_record = NULL; + heap->entry.tag = NULL; } else { print_error(cmd, "\"%s\" is truncated", fn[heap->i]); goto fail; @@ -1461,7 +1470,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m if (iter && iter[i]) hts_itr_destroy(iter[i]); if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); if (fp && fp[i]) sam_close(fp[i]); - if (heap && heap[i].b.b) bam_destroy1(heap[i].b.b); + if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); } if (hout) bam_hdr_destroy(hout); free(RG); @@ -1617,6 +1626,169 @@ end: * BAM sorting * ***************/ +typedef struct { + size_t from; + size_t to; +} buf_region; + +/* Simplified version of bam_merge_core2() for merging part-sorted + temporary files. No need for header merging or translation, + it just needs to read data into the heap and push it out again. 
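The merge itself can be sketched in miniature as follows (an assumed simplification, not the samtools code: sorted integer arrays stand in for the temporary files and in-memory blocks, and a linear scan over the run heads replaces the ks_heapmake()/ks_heapadjust() binary heap):

#include <stdio.h>

#define NRUNS 3

int main(void) {
    // Three already-sorted runs, as left behind by the sorting phase.
    const int run0[] = {1, 4, 9};
    const int run1[] = {2, 3, 10};
    const int run2[] = {5, 6, 7};
    const int *runs[NRUNS] = {run0, run1, run2};
    const int lens[NRUNS] = {3, 3, 3};
    int pos[NRUNS] = {0, 0, 0};

    // Repeatedly emit the smallest head element until every run is empty;
    // the output is globally sorted without re-sorting any run.
    for (;;) {
        int best = -1, i;

        for (i = 0; i < NRUNS; i++) {
            if (pos[i] >= lens[i]) continue;            // run exhausted
            if (best < 0 || runs[i][pos[i]] < runs[best][pos[best]])
                best = i;
        }

        if (best < 0) break;                            // nothing left
        printf("%d ", runs[best][pos[best]++]);
    }

    printf("\n");
    return 0;
}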
*/ + +static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, + int num_in_mem, buf_region *in_mem, + bam1_tag *buf, uint64_t *idx, bam_hdr_t *hout) { + int i = heap->i, res; + if (i < nfiles) { // read from file + res = sam_read1(fp[i], hout, heap->entry.bam_record); + } else { // read from memory + if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) { + heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record; + res = 0; + } else { + res = -1; + } + } + if (res >= 0) { + heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) + | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)<<1 + | bam_is_rev(heap->entry.bam_record)); + heap->idx = (*idx)++; + if (g_is_by_tag) { + heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); + } else { + heap->entry.tag = NULL; + } + } else if (res == -1) { + heap->pos = HEAP_EMPTY; + if (i < nfiles) bam_destroy1(heap->entry.bam_record); + heap->entry.bam_record = NULL; + heap->entry.tag = NULL; + } else { + return -1; + } + return 0; +} + +static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, + const char *mode, bam_hdr_t *hout, + int n, char * const *fn, int num_in_mem, + buf_region *in_mem, bam1_tag *buf, int n_threads, + const char *cmd, const htsFormat *in_fmt, + const htsFormat *out_fmt) { + samFile *fpout = NULL, **fp = NULL; + heap1_t *heap = NULL; + uint64_t idx = 0; + int i, heap_size = n + num_in_mem; + + g_is_by_qname = by_qname; + if (sort_tag) { + g_is_by_tag = 1; + g_sort_tag[0] = sort_tag[0]; + g_sort_tag[1] = sort_tag[1]; + } + if (n > 0) { + fp = (samFile**)calloc(n, sizeof(samFile*)); + if (!fp) goto mem_fail; + } + heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t)); + if (!heap) goto mem_fail; + + // Open each file, read the header and put the first read into the heap + for (i = 0; i < heap_size; i++) { + bam_hdr_t *hin; + heap1_t *h = &heap[i]; + + if (i < n) { + fp[i] = sam_open_format(fn[i], "r", in_fmt); + if (fp[i] == NULL) { + print_error_errno(cmd, "fail to open \"%s\"", fn[i]); + goto fail; + } + + // Read header ... + hin = sam_hdr_read(fp[i]); + if (hin == NULL) { + print_error(cmd, "failed to read header from \"%s\"", fn[i]); + goto fail; + } + // ... 
and throw it away as we don't really need it + bam_hdr_destroy(hin); + } + + // Get a read into the heap + h->i = i; + h->entry.tag = NULL; + if (i < n) { + h->entry.bam_record = bam_init1(); + if (!h->entry.bam_record) goto mem_fail; + } + if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + assert(i < n); + print_error(cmd, "failed to read first record from \"%s\"", fn[i]); + goto fail; + } + } + + // Open output file and write header + if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) { + print_error_errno(cmd, "failed to create \"%s\"", out); + return -1; + } + + hts_set_threads(fpout, n_threads); + + if (sam_hdr_write(fpout, hout) != 0) { + print_error_errno(cmd, "failed to write header to \"%s\"", out); + sam_close(fpout); + return -1; + } + + // Now do the merge + ks_heapmake(heap, heap_size, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->entry.bam_record; + if (sam_write1(fpout, hout, b) < 0) { + print_error_errno(cmd, "failed writing to \"%s\"", out); + sam_close(fpout); + return -1; + } + if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + assert(heap->i < n); + print_error(cmd, "Error reading \"%s\" : %s", + fn[heap->i], strerror(errno)); + goto fail; + } + ks_heapadjust(heap, 0, heap_size, heap); + } + // Clean up and close + for (i = 0; i < n; i++) { + if (sam_close(fp[i]) != 0) { + print_error(cmd, "Error on closing \"%s\" : %s", + fn[i], strerror(errno)); + } + } + free(fp); + free(heap); + if (sam_close(fpout) < 0) { + print_error(cmd, "error closing output file"); + return -1; + } + return 0; + mem_fail: + print_error(cmd, "Out of memory"); + + fail: + for (i = 0; i < n; i++) { + if (fp && fp[i]) sam_close(fp[i]); + if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); + } + free(fp); + free(heap); + if (fpout) sam_close(fpout); + return -1; +} + static int change_SO(bam_hdr_t *h, const char *so) { char *p, *q, *beg = NULL, *end = NULL, *newtext; @@ -1637,29 +1809,41 @@ static int change_SO(bam_hdr_t *h, const char *so) if (beg == NULL) { // no @HD h->l_text += strlen(so) + 15; newtext = (char*)malloc(h->l_text + 1); - sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so); - strcat(newtext, h->text); + if (!newtext) return -1; + snprintf(newtext, h->l_text + 1, + "@HD\tVN:1.3\tSO:%s\n%s", so, h->text); } else { // has @HD but different or no SO h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); newtext = (char*)malloc(h->l_text + 1); - strncpy(newtext, h->text, beg - h->text); - sprintf(newtext + (beg - h->text), "\tSO:%s", so); - strcat(newtext, end); + if (!newtext) return -1; + snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s", + (int) (beg - h->text), h->text, so, end); } free(h->text); h->text = newtext; return 0; } -// Function to compare reads and determine which one is < the other +// Function to compare reads and determine which one is < or > the other // Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag. -static inline int bam1_lt_core(const bam1_p a, const bam1_p b) +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. 
+static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b) { + uint64_t pa, pb; + if (!a.bam_record) + return 1; + if (!b.bam_record) + return 0; + if (g_is_by_qname) { - int t = strnum_cmp(bam_get_qname(a.b), bam_get_qname(b.b)); - return (t < 0 || (t == 0 && (a.b->core.flag&0xc0) < (b.b->core.flag&0xc0))); + int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record)); + if (t != 0) return t; + return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); } else { - return (((uint64_t)a.b->core.tid<<32|(a.b->core.pos+1)<<1|bam_is_rev(a.b)) < ((uint64_t)b.b->core.tid<<32|(b.b->core.pos+1)<<1|bam_is_rev(b.b))); + pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1)<<1|bam_is_rev(a.bam_record); + pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1)<<1|bam_is_rev(b.bam_record); + return pa < pb ? -1 : (pa > pb ? 1 : 0); } } @@ -1677,17 +1861,19 @@ uint8_t normalize_type(const uint8_t* aux) { // Sort record by tag, using pos or read name as a secondary key if tags are identical. Reads not carrying the tag sort first. // Tags are first sorted by the type character (in case the types differ), or by the appropriate comparator for that type if they agree. -static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b) +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. +static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b) { const uint8_t* aux_a = a.tag; const uint8_t* aux_b = b.tag; if (aux_a == NULL && aux_b != NULL) { - return 1; + return -1; } else if (aux_a != NULL && aux_b == NULL) { - return 0; + return 1; } else if (aux_a == NULL && aux_b == NULL) { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b); } // 'Normalize' the letters of the datatypes to a canonical letter, @@ -1704,57 +1890,62 @@ static inline int bam1_lt_by_tag(const bam1_p a, const bam1_p b) b_type = 'f'; } else { // Unfixable mismatched types - return a_type < b_type ? 1 : 0; + return a_type < b_type ? -1 : 1; } } if (a_type == 'c') { int64_t va = bam_aux2i(aux_a); int64_t vb = bam_aux2i(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a, b))); + if (va != vb) return va < vb ? -1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'f') { double va = bam_aux2f(aux_a); double vb = bam_aux2f(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a,b))); + if (va != vb) return va < vb ? -1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'A') { - char va = bam_aux2A(aux_a); - char vb = bam_aux2A(aux_b); - return (va < vb || (va == vb && bam1_lt_core(a,b))); + unsigned char va = bam_aux2A(aux_a); + unsigned char vb = bam_aux2A(aux_b); + if (va != vb) return va < vb ? 
-1 : 1; + return bam1_cmp_core(a, b); } else if (a_type == 'H') { int t = strcmp(bam_aux2Z(aux_a), bam_aux2Z(aux_b)); - return (t < 0 || (t == 0 && bam1_lt_core(a,b))); + if (t) return t; + return bam1_cmp_core(a, b); } else { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b); } } // Function to compare reads and determine which one is < the other // Handle sort-by-pos, sort-by-name, or sort-by-tag -static inline int bam1_lt(const bam1_p a, const bam1_p b) +static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { if (g_is_by_tag) { - return bam1_lt_by_tag(a, b); + return bam1_cmp_by_tag(a, b) < 0; } else { - return bam1_lt_core(a,b); + return bam1_cmp_core(a,b) < 0; } } -KSORT_INIT(sort, bam1_p, bam1_lt) +KSORT_INIT(sort, bam1_tag, bam1_lt) typedef struct { size_t buf_len; const char *prefix; - bam1_p *buf; + bam1_tag *buf; const bam_hdr_t *h; int index; int error; + int no_save; } worker_t; // Returns 0 for success // -1 for failure -static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) { size_t i; samFile* fp; @@ -1763,7 +1954,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, if (sam_hdr_write(fp, h) != 0) goto fail; if (n_threads > 1) hts_set_threads(fp, n_threads); for (i = 0; i < l; ++i) { - if (sam_write1(fp, h, buf[i].b) < 0) goto fail; + if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; } if (sam_close(fp) < 0) return -1; return 0; @@ -1778,6 +1969,10 @@ static void *worker(void *data) char *name; w->error = 0; ks_mergesort(sort, w->buf_len, w->buf, 0); + + if (w->no_save) + return 0; + name = (char*)calloc(strlen(w->prefix) + 20, 1); if (!name) { w->error = errno; return 0; } sprintf(name, "%s.%.4d.bam", w->prefix, w->index); @@ -1785,7 +1980,7 @@ static void *worker(void *data) uint32_t max_ncigar = 0; int i; for (i = 0; i < w->buf_len; i++) { - uint32_t nc = w->buf[i].b->core.n_cigar; + uint32_t nc = w->buf[i].bam_record->core.n_cigar; if (max_ncigar < nc) max_ncigar = nc; } @@ -1810,11 +2005,11 @@ static void *worker(void *data) return 0; } -static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_hdr_t *h, int n_threads) +static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, + const bam_hdr_t *h, int n_threads, buf_region *in_mem) { int i; - size_t rest; - bam1_p *b; + size_t pos, rest; pthread_t *tid; pthread_attr_t attr; worker_t *w; @@ -1825,15 +2020,24 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); w = (worker_t*)calloc(n_threads, sizeof(worker_t)); + if (!w) return -1; tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t)); - b = buf; rest = k; + if (!tid) { free(w); return -1; } + pos = 0; rest = k; for (i = 0; i < n_threads; ++i) { w[i].buf_len = rest / (n_threads - i); - w[i].buf = b; + w[i].buf = &buf[pos]; w[i].prefix = prefix; w[i].h = h; w[i].index = n_files + i; - b += w[i].buf_len; rest -= w[i].buf_len; + if (in_mem) { + w[i].no_save = 1; + in_mem[i].from = pos; + in_mem[i].to = pos + w[i].buf_len; + } else { + w[i].no_save = 0; + } + pos += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } for (i = 0; i < n_threads; ++i) { @@ -1845,7 +2049,9 @@ static int sort_blocks(int 
n_files, size_t k, bam1_p *buf, const char *prefix, c } } free(tid); free(w); - return (n_failed == 0)? n_files + n_threads : -1; + if (n_failed) return -1; + if (in_mem) return n_threads; + return n_files + n_threads; } /*! @@ -1864,7 +2070,7 @@ static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, c @return 0 for successful sorting, negative on errors @discussion It may create multiple temporary subalignment files - and then merge them by calling bam_merge_core2(). This function is + and then merge them by calling bam_merge_simple(). This function is NOT thread safe. */ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, @@ -1872,12 +2078,22 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt) { - int ret = -1, i, n_files = 0; - size_t mem, max_k, k, max_mem; + int ret = -1, res, i, n_files = 0; + size_t max_k, k, max_mem, bam_mem_offset; bam_hdr_t *header = NULL; samFile *fp; - bam1_p *buf; - bam1_t *b; + bam1_tag *buf = NULL; + bam1_t *b = bam_init1(); + uint8_t *bam_mem = NULL; + char **fns = NULL; + const char *new_so; + buf_region *in_mem = NULL; + int num_in_mem = 0; + + if (!b) { + print_error("sort", "couldn't allocate memory for bam record"); + return -1; + } if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; @@ -1886,13 +2102,12 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const strncpy(g_sort_tag, sort_by_tag, 2); } - max_k = k = 0; mem = 0; max_mem = _max_mem * n_threads; buf = NULL; fp = sam_open_format(fn, "r", in_fmt); if (fp == NULL) { print_error_errno("sort", "can't open \"%s\"", fn); - return -2; + goto err; } header = sam_hdr_read(fp); if (header == NULL) { @@ -1901,11 +2116,17 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } if (sort_by_tag != NULL) - change_SO(header, "unknown"); + new_so = "unknown"; else if (is_by_qname) - change_SO(header, "queryname"); + new_so = "queryname"; else - change_SO(header, "coordinate"); + new_so = "coordinate"; + + if (change_SO(header, new_so) != 0) { + print_error("sort", + "failed to change sort order header to '%s'\n", new_so); + goto err; + } // No gain to using the thread pool here as the flow of this code // is such that we are *either* reading *or* sorting. Hence a shared @@ -1913,93 +2134,121 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const if (n_threads > 1) hts_set_threads(fp, n_threads); + if ((bam_mem = malloc(max_mem)) == NULL) { + print_error("sort", "couldn't allocate memory for bam_mem"); + goto err; + } + // write sub files - for (;;) { + k = max_k = bam_mem_offset = 0; + while ((res = sam_read1(fp, header, b)) >= 0) { + int mem_full = 0; + if (k == max_k) { - size_t kk, old_max = max_k; + bam1_tag *new_buf; max_k = max_k? 
max_k<<1 : 0x10000; - buf = (bam1_p*)realloc(buf, max_k * sizeof(bam1_p)); - for (kk = old_max; kk < max_k; ++kk) { - buf[kk].b = NULL; - buf[kk].tag = NULL; + if ((new_buf = realloc(buf, max_k * sizeof(bam1_tag))) == NULL) { + print_error("sort", "couldn't allocate memory for buf"); + goto err; } + buf = new_buf; } - if (buf[k].b == NULL) buf[k].b = bam_init1(); - b = buf[k].b; - if ((ret = sam_read1(fp, header, b)) < 0) break; - if (b->l_data < b->m_data>>2) { // shrink - b->m_data = b->l_data; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + + // Check if the BAM record will fit in the memory limit + if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) { + // Copy record into the memory block + buf[k].bam_record = (bam1_t *)(bam_mem + bam_mem_offset); + *buf[k].bam_record = *b; + buf[k].bam_record->data = (uint8_t *)((char *)buf[k].bam_record + sizeof(bam1_t)); + memcpy(buf[k].bam_record->data, b->data, b->l_data); + // store next BAM record in next 8-byte-aligned address after + // current one + bam_mem_offset = (bam_mem_offset + sizeof(*b) + b->l_data + 8 - 1) & ~((size_t)(8 - 1)); + } else { + // Add a pointer to the remaining record + buf[k].bam_record = b; + mem_full = 1; } // Pull out the pointer to the sort tag if applicable if (g_is_by_tag) { - buf[k].tag = bam_aux_get(b, g_sort_tag); + buf[k].tag = bam_aux_get(buf[k].bam_record, g_sort_tag); } else { buf[k].tag = NULL; } - - mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; - if (mem >= max_mem) { - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + + if (mem_full) { + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, + NULL); if (n_files < 0) { - ret = -1; goto err; } - mem = k = 0; + k = 0; + bam_mem_offset = 0; } } - if (ret != -1) { + if (res != -1) { print_error("sort", "truncated file. Aborting"); - ret = -1; goto err; } + // Sort last records + if (k > 0) { + in_mem = calloc(n_threads > 0 ? 
n_threads : 1, sizeof(in_mem[0])); + if (!in_mem) goto err; + num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, + in_mem); + if (num_in_mem < 0) goto err; + } else { + num_in_mem = 0; + } + // write the final output - if (n_files == 0) { // a single block + if (n_files == 0 && num_in_mem < 2) { // a single block ks_mergesort(sort, k, buf, 0); if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); - ret = -1; goto err; } } else { // then merge - char **fns; - n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); - if (n_files == -1) { - ret = -1; - goto err; - } - fprintf(pysam_stderr, "[bam_sort_core] merging from %d files...\n", n_files); + fprintf(pysam_stderr, + "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", + n_files, num_in_mem); fns = (char**)calloc(n_files, sizeof(char*)); + if (!fns) goto err; for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + if (!fns[i]) goto err; sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - if (bam_merge_core2(is_by_qname, sort_by_tag, fnout, modeout, NULL, n_files, fns, - MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO, - NULL, n_threads, "sort", in_fmt, out_fmt) < 0) { - // Propagate bam_merge_core2() failure; it has already emitted a + if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, + n_files, fns, num_in_mem, in_mem, buf, + n_threads, "sort", in_fmt, out_fmt) < 0) { + // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. goto err; } - for (i = 0; i < n_files; ++i) { - unlink(fns[i]); - free(fns[i]); - } - free(fns); } ret = 0; err: // free - for (k = 0; k < max_k; ++k) bam_destroy1(buf[k].b); + if (fns) { + for (i = 0; i < n_files; ++i) { + if (fns[i]) { + unlink(fns[i]); + free(fns[i]); + } + } + free(fns); + } + bam_destroy1(b); free(buf); + free(bam_mem); bam_hdr_destroy(header); - sam_close(fp); + if (fp) sam_close(fp); return ret; } @@ -2008,6 +2257,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma { int ret; char *fnout = calloc(strlen(prefix) + 4 + 1, 1); + if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); free(fnout); diff --git a/samtools/bamtk.c b/samtools/bamtk.c index bd520b6..9316386 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -44,6 +44,7 @@ int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); +int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); @@ -92,6 +93,7 @@ static void usage(FILE *fp) " rmdup remove PCR duplicates\n" " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" +" markdup mark duplicates\n" "\n" " -- File operations\n" " collate shuffle and group alignments by name\n" @@ -126,6 +128,18 @@ static void usage(FILE *fp) #endif } +// This is a tricky one, but on Windows the filename wildcard expansion is done by +// the application and not by the shell, as traditionally it never had a "shell". +// Even now, DOS and Powershell do not do this expansion (but bash does). +// +// This means that Mingw/Msys implements code before main() that takes e.g. 
"*" and +// expands it up to a list of matching filenames. This in turn breaks things like +// specifying "*" as a region (all the unmapped reads). We take a hard line here - +// filename expansion is the task of the shell, not our application! +#ifdef _WIN32 +int _CRT_glob = 0; +#endif + int main(int argc, char *argv[]) { #ifdef _WIN32 @@ -156,6 +170,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 248bc81..67c09c8 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -40,12 +40,14 @@ int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); int bam_sort(int argc, char *argv[]); -/* AH: int bam_tview_main(int argc, char *argv[]); */ +/* AH: removed */ +/* int bam_tview_main(int argc, char *argv[]); */ int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); int bam_idxstats(int argc, char *argv[]); +int bam_markdup(int argc, char *argv[]); int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); @@ -94,6 +96,7 @@ static void usage(FILE *fp) " rmdup remove PCR duplicates\n" " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" +" markdup mark duplicates\n" "\n" " -- File operations\n" " collate shuffle and group alignments by name\n" @@ -128,6 +131,18 @@ static void usage(FILE *fp) #endif } +// This is a tricky one, but on Windows the filename wildcard expansion is done by +// the application and not by the shell, as traditionally it never had a "shell". +// Even now, DOS and Powershell do not do this expansion (but bash does). +// +// This means that Mingw/Msys implements code before main() that takes e.g. "*" and +// expands it up to a list of matching filenames. This in turn breaks things like +// specifying "*" as a region (all the unmapped reads). We take a hard line here - +// filename expansion is the task of the shell, not our application! +#ifdef _WIN32 +int _CRT_glob = 0; +#endif + int samtools_main(int argc, char *argv[]) { #ifdef _WIN32 @@ -158,6 +173,7 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); @@ -183,7 +199,9 @@ int samtools_main(int argc, char *argv[]) fprintf(pysam_stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); return 1; } - /* else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); */ +/* AH: + else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); +*/ else if (strcmp(argv[1], "--version") == 0) { fprintf(pysam_stdout, "samtools %s\n" diff --git a/samtools/bedidx.c b/samtools/bedidx.c index c1954ad..86d2338 100644 --- a/samtools/bedidx.c +++ b/samtools/bedidx.c @@ -32,10 +32,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#ifdef _WIN32 -#define drand48() ((double)rand() / RAND_MAX) -#endif - #include "htslib/ksort.h" KSORT_INIT_GENERIC(uint64_t) diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c index 5b7df0c..1998435 100644 --- a/samtools/bedidx.c.pysam.c +++ b/samtools/bedidx.c.pysam.c @@ -34,10 +34,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#ifdef _WIN32 -#define drand48() ((double)rand() / RAND_MAX) -#endif - #include "htslib/ksort.h" KSORT_INIT_GENERIC(uint64_t) diff --git a/samtools/dict.c b/samtools/dict.c index fa64a16..cb5622e 100644 --- a/samtools/dict.c +++ b/samtools/dict.c @@ -82,7 +82,11 @@ static void write_dict(const char *fn, args_t *args) if (args->uri) fprintf(out, "\tUR:%s", args->uri); else if (strcmp(fn, "-") != 0) { +#ifdef _WIN32 + char *real_path = _fullpath(NULL, fn, PATH_MAX); +#else char *real_path = realpath(fn, NULL); +#endif fprintf(out, "\tUR:file://%s", real_path); free(real_path); } diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c index 5368851..c4e4045 100644 --- a/samtools/dict.c.pysam.c +++ b/samtools/dict.c.pysam.c @@ -84,7 +84,11 @@ static void write_dict(const char *fn, args_t *args) if (args->uri) fprintf(out, "\tUR:%s", args->uri); else if (strcmp(fn, "-") != 0) { +#ifdef _WIN32 + char *real_path = _fullpath(NULL, fn, PATH_MAX); +#else char *real_path = realpath(fn, NULL); +#endif fprintf(out, "\tUR:file://%s", real_path); free(real_path); } diff --git a/samtools/padding.c b/samtools/padding.c index 2f10e86..650aff8 100644 --- a/samtools/padding.c +++ b/samtools/padding.c @@ -382,6 +382,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) { int i = 0, unpadded_len = 0; bam_hdr_t *header = 0 ; + unsigned short ln_found; header = bam_hdr_dup(old); for (i = 0; i < old->n_targets; ++i) { @@ -418,27 +419,45 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) name += 4; for (name_end = name; name_end != end && *name_end != '\t'; name_end++); strcat(newtext, "@SQ"); + ln_found = 0; /* Parse the @SQ lines */ while (cp != end) { - if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { + if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { // Rewrite the length char len_buf[100]; int tid; + unsigned int old_length, new_length; + const char *old_cp = cp; + + ln_found = 1; + + while (cp != end && *cp++ != '\t'); + old_length = (int)(cp - old_cp); + for (tid = 0; tid < header->n_targets; tid++) { // may want to hash this, but new header API incoming. 
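The wildcard-expansion comment added to bamtk.c above can be tried in isolation. The sketch below is not taken from the patch; it only demonstrates that on a MinGW/MSYS build, defining _CRT_glob as 0 keeps the C runtime from expanding wildcards before main(), so a literal "*" argument (for example a region meaning all unmapped reads) reaches the program unchanged, while on other platforms the definition is simply unused:

    #include <stdio.h>

    #ifdef _WIN32
    int _CRT_glob = 0;   /* opt out of MinGW's argv wildcard expansion */
    #endif

    int main(int argc, char **argv) {
        int i;
        for (i = 1; i < argc; i++)
            printf("argv[%d] = %s\n", i, argv[i]); /* prints each argument as received */
        return 0;
    }
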
if (strncmp(name, header->target_name[tid], name_end - name) == 0) { - sprintf(len_buf, "LN:%d", header->target_len[tid]); - strcat(newtext, len_buf); + new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); + if (new_length <= old_length) { + strcat(newtext, len_buf); + } + else { + fprintf(stderr, "LN value of the reference is larger than the original!\n"); + exit(1); + } break; } } - while (cp != end && *cp++ != '\t'); + if (cp != end) strcat(newtext, "\t"); } else if (end-cp >= 2 && - (strncmp(cp, "M5", 2) == 0 || - strncmp(cp, "UR", 2) == 0)) { + ((ln_found && strncmp(cp, "LN", 2) == 0) || + strncmp(cp, "M5", 2) == 0 || + strncmp(cp, "UR", 2) == 0)) + { + // skip secondary LNs // MD5 changed during depadding; ditch it. // URLs are also invalid. while (cp != end && *cp++ != '\t'); diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index a3461e4..901f027 100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -384,6 +384,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) { int i = 0, unpadded_len = 0; bam_hdr_t *header = 0 ; + unsigned short ln_found; header = bam_hdr_dup(old); for (i = 0; i < old->n_targets; ++i) { @@ -420,27 +421,45 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) name += 4; for (name_end = name; name_end != end && *name_end != '\t'; name_end++); strcat(newtext, "@SQ"); + ln_found = 0; /* Parse the @SQ lines */ while (cp != end) { - if (end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { + if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { // Rewrite the length char len_buf[100]; int tid; + unsigned int old_length, new_length; + const char *old_cp = cp; + + ln_found = 1; + + while (cp != end && *cp++ != '\t'); + old_length = (int)(cp - old_cp); + for (tid = 0; tid < header->n_targets; tid++) { // may want to hash this, but new header API incoming. if (strncmp(name, header->target_name[tid], name_end - name) == 0) { - sprintf(len_buf, "LN:%d", header->target_len[tid]); - strcat(newtext, len_buf); + new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); + if (new_length <= old_length) { + strcat(newtext, len_buf); + } + else { + fprintf(pysam_stderr, "LN value of the reference is larger than the original!\n"); + exit(1); + } break; } } - while (cp != end && *cp++ != '\t'); + if (cp != end) strcat(newtext, "\t"); } else if (end-cp >= 2 && - (strncmp(cp, "M5", 2) == 0 || - strncmp(cp, "UR", 2) == 0)) { + ((ln_found && strncmp(cp, "LN", 2) == 0) || + strncmp(cp, "M5", 2) == 0 || + strncmp(cp, "UR", 2) == 0)) + { + // skip secondary LNs // MD5 changed during depadding; ditch it. // URLs are also invalid. while (cp != end && *cp++ != '\t'); diff --git a/samtools/phase.c b/samtools/phase.c index 584334d..0e00d9b 100644 --- a/samtools/phase.c +++ b/samtools/phase.c @@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "sam_opts.h" #include "samtools.h" +#include "htslib/hts_os.h" #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index 4226c03..2cfb3ae 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -38,6 +38,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/kstring.h" #include "sam_opts.h" #include "samtools.h" +#include "htslib/hts_os.h" #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) diff --git a/samtools/sam_view.c b/samtools/sam_view.c index ee65fcd..ceb1080 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -969,7 +969,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li } /* - * Create FASTQ lines from the barcode tag using the index-format + * Create FASTQ lines from the barcode tag using the index-format */ static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) { @@ -1072,7 +1072,7 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) { - oq++; + oq++; qual = strdup(bam_aux2Z(oq)); if (!qual) goto fail; if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented @@ -1208,6 +1208,13 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) return false; } + if (nIndex==0 && opts->index_file[0]) { + fprintf(stderr, "index_format not specified, but index file given\n"); + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; + } + if (opts->def_qual < 0 || 93 < opts->def_qual) { fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); bam2fq_usage(stderr, argv[0]); @@ -1375,7 +1382,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* } } for (i = 0; i < 2; i++) { - if (state->fpi[i] && bgzf_close(state->fpi[i])) { + if (state->fpi[i] && bgzf_close(state->fpi[i])) { print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); valid = false; } @@ -1435,14 +1442,22 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } else if ((score[1] > 0 || score[2] > 0) && state->fpse) { - // print whichever one exists to fpse - if (score[1] > 0) { - if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else if (score[1] > 0 || score[2] > 0) { + if (state->fpse) { + // print whichever one exists to fpse + if (score[1] > 0) { + if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else { + if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + } + ++n_singletons; } else { - if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + if (score[1] > 0) { + if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else { + if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + } } - ++n_singletons; } if (score[0]) { // TODO: check this // print linebuf[0] to fpr[0] diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index f46cc9f..5113339 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -971,7 +971,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li } /* - * Create FASTQ lines from the barcode tag using the index-format + * Create FASTQ lines from the barcode tag using the index-format */ static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) { @@ -1074,7 +1074,7 @@ static bool 
bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) { - oq++; + oq++; qual = strdup(bam_aux2Z(oq)); if (!qual) goto fail; if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented @@ -1210,6 +1210,13 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) return false; } + if (nIndex==0 && opts->index_file[0]) { + fprintf(pysam_stderr, "index_format not specified, but index file given\n"); + bam2fq_usage(pysam_stderr, argv[0]); + free_opts(opts); + return false; + } + if (opts->def_qual < 0 || 93 < opts->def_qual) { fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); bam2fq_usage(pysam_stderr, argv[0]); @@ -1377,7 +1384,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* } } for (i = 0; i < 2; i++) { - if (state->fpi[i] && bgzf_close(state->fpi[i])) { + if (state->fpi[i] && bgzf_close(state->fpi[i])) { print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); valid = false; } @@ -1437,14 +1444,22 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } else if ((score[1] > 0 || score[2] > 0) && state->fpse) { - // print whichever one exists to fpse - if (score[1] > 0) { - if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else if (score[1] > 0 || score[2] > 0) { + if (state->fpse) { + // print whichever one exists to fpse + if (score[1] > 0) { + if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else { + if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + } + ++n_singletons; } else { - if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + if (score[1] > 0) { + if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } + } else { + if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } + } } - ++n_singletons; } if (score[0]) { // TODO: check this // print linebuf[0] to fpr[0] diff --git a/samtools/version.h b/samtools/version.h index 1f3fa45..e74ad87 100644 --- a/samtools/version.h +++ b/samtools/version.h @@ -1 +1 @@ -#define SAMTOOLS_VERSION "1.5" +#define SAMTOOLS_VERSION "1.6" diff --git a/setup.py b/setup.py index 66783ae..608badb 100644 --- a/setup.py +++ b/setup.py @@ -364,6 +364,8 @@ else: define_macros = [] +samtools_include_dirs = [os.path.abspath("samtools")] + chtslib = Extension( "pysam.libchtslib", [source_pattern % "htslib", @@ -390,7 +392,7 @@ csamfile = Extension( htslib_sources + os_c_files, library_dirs=htslib_library_dirs, - include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs, + include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", extra_compile_args=extra_compile_args, @@ -409,7 +411,7 @@ calignmentfile = Extension( htslib_sources + os_c_files, library_dirs=htslib_library_dirs, - include_dirs=["pysam", "samtools"] + include_os + htslib_include_dirs, + include_dirs=["pysam"] + samtools_include_dirs + include_os + htslib_include_dirs, libraries=external_htslib_libraries + 
internal_htslib_libraries, language="c", extra_compile_args=extra_compile_args, @@ -428,7 +430,7 @@ calignedsegment = Extension( htslib_sources + os_c_files, library_dirs=htslib_library_dirs, - include_dirs=["pysam", "samtools", "."] + include_os + htslib_include_dirs, + include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", extra_compile_args=extra_compile_args, @@ -472,7 +474,7 @@ csamtools = Extension( htslib_sources + os_c_files, library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["samtools", "pysam", "."] + + include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", @@ -487,7 +489,7 @@ cbcftools = Extension( htslib_sources + os_c_files, library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["bcftools", "pysam", "."] + + include_dirs=["bcftools", "pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, libraries=external_htslib_libraries + internal_htslib_libraries, language="c", diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index aafa826..920ddbc 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -11,7 +11,7 @@ from TestUtils import checkFieldEqual, BAM_DATADIR, WORKDIR class ReadTest(unittest.TestCase): - def buildRead(self): + def build_read(self): '''build an example read.''' a = pysam.AlignedSegment() @@ -26,7 +26,6 @@ class ReadTest(unittest.TestCase): a.next_reference_start = 200 a.template_length = 167 a.query_qualities = pysam.qualitystring_to_array("1234") * 10 - # todo: create tags return a @@ -66,8 +65,8 @@ class TestAlignedSegment(ReadTest): def testCompare(self): '''check comparison functions.''' - a = self.buildRead() - b = self.buildRead() + a = self.build_read() + b = self.build_read() self.assertEqual(0, a.compare(b)) self.assertEqual(0, b.compare(a)) @@ -83,8 +82,8 @@ class TestAlignedSegment(ReadTest): self.assertTrue(b != a) def testHashing(self): - a = self.buildRead() - b = self.buildRead() + a = self.build_read() + b = self.build_read() self.assertEqual(hash(a), hash(b)) b.tid = 2 self.assertNotEqual(hash(a), hash(b)) @@ -92,8 +91,8 @@ class TestAlignedSegment(ReadTest): def testUpdate(self): '''check if updating fields affects other variable length data ''' - a = self.buildRead() - b = self.buildRead() + a = self.build_read() + b = self.build_read() # check qname b.query_name = "read_123" @@ -124,7 +123,7 @@ class TestAlignedSegment(ReadTest): checkFieldEqual(self, a, b, ("query_qualities",)) # reset qual - b = self.buildRead() + b = self.build_read() # check flags: for x in ( @@ -147,11 +146,11 @@ class TestAlignedSegment(ReadTest): This does not work as setting the sequence will erase the quality scores. 
''' - a = self.buildRead() + a = self.build_read() a.query_sequence = a.query_sequence[5:10] self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) - a = self.buildRead() + a = self.build_read() s = pysam.qualities_to_qualitystring(a.query_qualities) a.query_sequence = a.query_sequence[5:10] a.query_qualities = pysam.qualitystring_to_array(s[5:10]) @@ -178,14 +177,14 @@ class TestAlignedSegment(ReadTest): def testUpdateTlen(self): '''check if updating tlen works''' - a = self.buildRead() + a = self.build_read() oldlen = a.template_length oldlen *= 2 a.template_length = oldlen self.assertEqual(a.template_length, oldlen) def testPositions(self): - a = self.buildRead() + a = self.build_read() self.assertEqual(a.get_reference_positions(), [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, @@ -216,20 +215,20 @@ class TestAlignedSegment(ReadTest): def testFullReferencePositions(self): '''see issue 26''' - a = self.buildRead() + a = self.build_read() a.cigar = [(4, 30), (0, 20), (1, 3), (0, 47)] self.assertEqual(100, len(a.get_reference_positions(full_length=True))) def testBlocks(self): - a = self.buildRead() + a = self.build_read() self.assertEqual(a.get_blocks(), [(20, 30), (31, 40), (40, 60)]) def test_infer_query_length(self): '''Test infer_query_length on M|=|X|I|D|H|S cigar ops''' - a = self.buildRead() + a = self.build_read() a.cigarstring = '40M' self.assertEqual(a.infer_query_length(), 40) a.cigarstring = '40=' @@ -253,7 +252,7 @@ class TestAlignedSegment(ReadTest): def test_infer_read_length(self): '''Test infer_read_length on M|=|X|I|D|H|S cigar ops''' - a = self.buildRead() + a = self.build_read() a.cigarstring = '40M' self.assertEqual(a.infer_read_length(), 40) a.cigarstring = '40=' @@ -276,7 +275,7 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.infer_read_length(), None) def test_get_aligned_pairs_soft_clipping(self): - a = self.buildRead() + a = self.build_read() a.cigartuples = ((4, 2), (0, 35), (4, 3)) self.assertEqual(a.get_aligned_pairs(), [(0, None), (1, None)] + @@ -292,7 +291,7 @@ class TestAlignedSegment(ReadTest): ) def test_get_aligned_pairs_hard_clipping(self): - a = self.buildRead() + a = self.build_read() a.cigartuples = ((5, 2), (0, 35), (5, 3)) self.assertEqual(a.get_aligned_pairs(), # No seq, no seq pos @@ -303,7 +302,7 @@ class TestAlignedSegment(ReadTest): range(0, 0 + 35), range(20, 20 + 35))]) def test_get_aligned_pairs_skip(self): - a = self.buildRead() + a = self.build_read() a.cigarstring = "2M100D38M" self.assertEqual(a.get_aligned_pairs(), [(0, 20), (1, 21)] + @@ -319,7 +318,7 @@ class TestAlignedSegment(ReadTest): range(20 + 2 + 100, 20 + 2 + 100 + 38))]) def test_get_aligned_pairs_match_mismatch(self): - a = self.buildRead() + a = self.build_read() a.cigartuples = ((7, 20), (8, 20)) self.assertEqual(a.get_aligned_pairs(), [(qpos, refpos) for (qpos, refpos) in zip( @@ -329,7 +328,7 @@ class TestAlignedSegment(ReadTest): range(0, 0 + 40), range(20, 20 + 40))]) def test_get_aligned_pairs_padding(self): - a = self.buildRead() + a = self.build_read() a.cigartuples = ((7, 20), (6, 1), (8, 19)) def inner(): @@ -338,7 +337,7 @@ class TestAlignedSegment(ReadTest): self.assertRaises(NotImplementedError, inner) def test_get_aligned_pairs(self): - a = self.buildRead() + a = self.build_read() a.query_sequence = "A" * 9 a.cigarstring = "9M" a.set_tag("MD", "9") @@ -377,7 +376,7 @@ class TestAlignedSegment(ReadTest): ) def test_get_aligned_pairs_skip_reference(self): - a = self.buildRead() + a = 
self.build_read() a.query_sequence = "A" * 10 a.cigarstring = "5M1N5M" a.set_tag("MD", "10") @@ -408,7 +407,7 @@ class TestAlignedSegment(ReadTest): '''issue 176: retrieving length without query sequence with soft-clipping. ''' - a = self.buildRead() + a = self.build_read() a.query_sequence = None a.cigarstring = "20M" self.assertEqual(a.query_alignment_length, 20) @@ -427,7 +426,7 @@ class TestAlignedSegment(ReadTest): def test_query_length_is_limited(self): - a = self.buildRead() + a = self.build_read() a.query_name = "A" * 1 a.query_name = "A" * 251 self.assertRaises( @@ -438,11 +437,30 @@ class TestAlignedSegment(ReadTest): "A" * 252) +class TestCigar(ReadTest): + + def testCigarString(self): + r = self.build_read() + self.assertEqual(r.cigarstring, "10M1D9M1I20M") + r.cigarstring = "20M10D20M" + self.assertEqual(r.cigartuples, [(0, 20), (2, 10), (0, 20)]) + # unsetting cigar string + r.cigarstring = None + self.assertEqual(r.cigarstring, None) + + def testCigar(self): + r = self.build_read() + self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 9), (1, 1), (0, 20)]) + # unsetting cigar string + r.cigartuples = None + self.assertEqual(r.cigartuples, None) + + class TestCigarStats(ReadTest): def testStats(self): - a = self.buildRead() + a = self.build_read() a.cigarstring = None self.assertEqual( @@ -508,15 +526,15 @@ class TestAlignedPairs(unittest.TestCase): class TestTags(ReadTest): def testMissingTag(self): - a = self.buildRead() + a = self.build_read() self.assertRaises(KeyError, a.get_tag, "XP") def testEmptyTag(self): - a = self.buildRead() + a = self.build_read() self.assertRaises(KeyError, a.get_tag, "XT") def testSetTag(self): - a = self.buildRead() + a = self.build_read() self.assertEqual(False, a.has_tag("NM")) a.set_tag("NM", 2) self.assertEqual(True, a.has_tag("NM")) @@ -530,7 +548,7 @@ class TestTags(ReadTest): a.set_tag("NM", None) def testArrayTags(self): - read = self.buildRead() + read = self.build_read() supported_dtypes = "bhBHf" unsupported_dtypes = "lLd" @@ -547,7 +565,7 @@ class TestTags(ReadTest): array.array(dtype, range(10))) def testAddTagsType(self): - a = self.buildRead() + a = self.build_read() a.tags = None self.assertEqual(a.tags, []) @@ -579,10 +597,10 @@ class TestTags(ReadTest): ('X5', 5)])) # test setting invalid type code - self.assertRaises(ValueError, a.setTag, 'X6', 5.2, 'g') + self.assertRaises(ValueError, a.set_tag, 'X6', 5.2, 'g') def testTagsUpdatingFloat(self): - a = self.buildRead() + a = self.build_read() a.tags = [('NM', 1), ('RG', 'L1'), ('PG', 'P1'), ('XT', 'U')] @@ -595,7 +613,7 @@ class TestTags(ReadTest): ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)]) def testAddTags(self): - a = self.buildRead() + a = self.build_read() a.tags = [('NM', 1), ('RG', 'L1'), ('PG', 'P1'), ('XT', 'U')] @@ -643,7 +661,7 @@ class TestTags(ReadTest): self.assertEqual(after, before) def testMDTagMatchOnly(self): - a = self.buildRead() + a = self.build_read() # Substitutions only a.cigarstring = "21M" @@ -668,7 +686,7 @@ class TestTags(ReadTest): a.get_reference_sequence()) def testMDTagInsertions(self): - a = self.buildRead() + a = self.build_read() # insertions are silent in the reference sequence a.cigarstring = "5M1I5M" @@ -691,7 +709,7 @@ class TestTags(ReadTest): "A" * 10) def testMDTagDeletions(self): - a = self.buildRead() + a = self.build_read() a.cigarstring = "5M1D5M" a.query_sequence = "A" * 10 @@ -708,7 +726,7 @@ class TestTags(ReadTest): a.get_reference_sequence()) def testMDTagRefSkipping(self): - a = self.buildRead() + a = self.build_read() 
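The testMDTag* cases in this file all rely on get_reference_sequence() rebuilding the reference from the CIGAR string plus the MD tag. A short sketch of that call, assuming pysam is installed (the read name and the values are made up for illustration):

    import pysam

    a = pysam.AlignedSegment()
    a.query_name = "md_demo"
    a.query_sequence = "A" * 10
    a.cigarstring = "5M1D5M"           # one reference base deleted from the read
    a.set_tag("MD", "5^C5")            # the deleted reference base is a C
    # 5 matches, the deleted C, then 5 more matches:
    print(a.get_reference_sequence())  # expected: AAAAACAAAAA
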
a.cigarstring = "5M1N5M" a.query_sequence = "A" * 10 @@ -725,7 +743,7 @@ class TestTags(ReadTest): a.get_reference_sequence()) def testMDTagSoftClipping(self): - a = self.buildRead() + a = self.build_read() # softclipping a.cigarstring = "5S5M1D5M5S" @@ -744,7 +762,7 @@ class TestTags(ReadTest): a.get_reference_sequence()) def testMDTagComplex(self): - a = self.buildRead() + a = self.build_read() a.cigarstring = "5S5M1I2D5M5S" a.query_sequence = "G" * 5 + "A" * 11 + "G" * 5 @@ -777,11 +795,81 @@ class TestTags(ReadTest): "AAAAcTTAA", a.get_reference_sequence()) + def testArrayTags(self): + + r = self.build_read() + + def c(r, l): + r.tags = [('ZM', l)] + self.assertEqual(list(r.opt("ZM")), list(l)) + + # signed integers + c(r, (-1, 1)) + c(r, (-1, 100)) + c(r, (-1, 200)) + c(r, (-1, 1000)) + c(r, (-1, 30000)) + c(r, (-1, 50000)) + c(r, (1, -1)) + c(r, (1, -100)) + c(r, (1, -200)) + c(r, (1, -1000)) + c(r, (1, -30000)) + c(r, (1, -50000)) + + # unsigned integers + c(r, (1, 100)) + c(r, (1, 1000)) + c(r, (1, 10000)) + c(r, (1, 100000)) + + # floats + c(r, (1.0, 100.0)) + + def testLongTags(self): + '''see issue 115''' + + r = self.build_read() + rg = 'HS2000-899_199.L3' + tags = [('XC', 85), ('XT', 'M'), ('NM', 5), + ('SM', 29), ('AM', 29), ('XM', 1), + ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'), + ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')] + + r.tags = tags + r.tags += [("RG", rg)] * 100 + tags += [("RG", rg)] * 100 + + self.assertEqual(tags, r.tags) + + def testNegativeIntegers(self): + x = -2 + aligned_read = self.build_read() + aligned_read.tags = [("XD", int(x))] + self.assertEqual(aligned_read.opt('XD'), x) + # print (aligned_read.tags) + + def testNegativeIntegersWrittenToFile(self): + r = self.build_read() + x = -2 + r.tags = [("XD", x)] + with pysam.AlignmentFile( + "tests/test.bam", + "wb", + referencenames=("chr1",), + referencelengths = (1000,)) as outf: + outf.write(r) + with pysam.AlignmentFile("tests/test.bam") as inf: + r = next(inf) + + self.assertEqual(r.tags, [("XD", x)]) + os.unlink("tests/test.bam") + class TestCopy(ReadTest): def testCopy(self): - a = self.buildRead() + a = self.build_read() b = copy.copy(a) # check if a and be are the same self.assertEqual(a, b) @@ -793,7 +881,7 @@ class TestCopy(ReadTest): self.assertEqual(b.query_name, 'ReadB') def testDeepCopy(self): - a = self.buildRead() + a = self.build_read() b = copy.deepcopy(a) # check if a and be are the same self.assertEqual(a, b) @@ -805,6 +893,93 @@ class TestCopy(ReadTest): self.assertEqual(b.query_name, 'ReadB') +class TestSetTagGetTag(ReadTest): + + def check_tag(self, tag, value, value_type, alt_value_type=None): + a = self.build_read() + a.set_tag(tag, value, value_type=value_type) + v, t = a.get_tag(tag, with_value_type=True) + self.assertEqual(v, value) + + if alt_value_type: + self.assertEqual(t, alt_value_type) + else: + self.assertEqual(t, value_type) + + def test_set_tag_with_A(self): + self.check_tag('TT', "x", value_type="A") + + def test_set_tag_with_a(self): + self.check_tag('TT', "x", value_type="a", alt_value_type="A") + + def test_set_tag_with_C(self): + self.check_tag('TT', 12, value_type="C") + + def test_set_tag_with_c(self): + self.check_tag('TT', 12, value_type="c") + + def test_set_tag_with_S(self): + self.check_tag('TT', 12, value_type="S") + + def test_set_tag_with_s(self): + self.check_tag('TT', 12, value_type="s") + + def 
test_set_tag_with_I(self): + self.check_tag('TT', 12, value_type="I") + + def test_set_tag_with_i(self): + self.check_tag('TT', 12, value_type="i") + + def test_set_tag_with_f(self): + self.check_tag('TT', 2.5, value_type="f") + + def test_set_tag_with_d(self): + self.check_tag('TT', 2.5, value_type="d") + + def test_set_tag_with_H(self): + self.check_tag('TT', "AE12", value_type="H") + + def test_set_tag_with_automated_type_detection(self): + self.check_tag('TT', -(1 << 7), value_type=None, alt_value_type="c") + self.check_tag('TT', -(1 << 7) - 1, value_type=None, alt_value_type="s") + self.check_tag('TT', -(1 << 15), value_type=None, alt_value_type="s") + self.check_tag('TT', -(1 << 15) - 1, value_type=None, alt_value_type="i") + self.check_tag('TT', -(1 << 31), value_type=None, alt_value_type="i") + self.assertRaises( + ValueError, + self.check_tag, + 'TT', + -(1 << 31) - 1, + value_type=None, + alt_value_type="i") + + self.check_tag('TT', (1 << 8) - 1, value_type=None, alt_value_type="C") + self.check_tag('TT', (1 << 8), value_type=None, alt_value_type="S") + self.check_tag('TT', (1 << 16) - 1, value_type=None, alt_value_type="S") + self.check_tag('TT', (1 << 16), value_type=None, alt_value_type="I") + self.check_tag('TT', (1 << 32) - 1, value_type=None, alt_value_type="I") + self.assertRaises( + ValueError, + self.check_tag, + 'TT', + (1 << 32), + value_type=None, + alt_value_type="I") + + +class TestSetTagsGetTag(TestSetTagGetTag): + + def check_tag(self, tag, value, value_type, alt_value_type=None): + a = self.build_read() + a.set_tags([(tag, value, value_type)]) + v, t = a.get_tag(tag, with_value_type=True) + if alt_value_type: + self.assertEqual(t, alt_value_type) + else: + self.assertEqual(t, value_type) + self.assertEqual(v, value) + + class TestAsString(unittest.TestCase): def testAsString(self): diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index f81d752..e6f9bdb 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -1169,116 +1169,7 @@ class TestLargeFieldBug(unittest.TestCase): new_read.tags = read.tags self.assertEqual(new_read.tags, read.tags) - -class TestTagParsing(unittest.TestCase): - - '''tests checking the accuracy of tag setting and retrieval.''' - - def makeRead(self): - a = pysam.AlignedSegment() - a.query_name = "read_12345" - a.reference_id = 0 - a.query_sequence = "ACGT" * 3 - a.flag = 0 - a.reference_id = 0 - a.reference_start = 1 - a.mapping_quality = 20 - a.cigartuples = ((0, 10), (2, 1), (0, 25)) - a.next_reference_id = 0 - a.next_reference_start = 200 - a.template_length = 0 - a.query_qualities = pysam.qualitystring_to_array("1234") * 3 - # todo: create tags - return a - - def testNegativeIntegers(self): - x = -2 - aligned_read = self.makeRead() - aligned_read.tags = [("XD", int(x))] - self.assertEqual(aligned_read.opt('XD'), x) - # print (aligned_read.tags) - - def testNegativeIntegers2(self): - x = -2 - r = self.makeRead() - r.tags = [("XD", x)] - outfile = pysam.AlignmentFile( - "tests/test.bam", - "wb", - referencenames=("chr1",), - referencelengths = (1000,)) - outfile.write(r) - outfile.close() - infile = pysam.AlignmentFile("tests/test.bam") - r = next(infile) - self.assertEqual(r.tags, [("XD", x)]) - infile.close() - os.unlink("tests/test.bam") - - def testCigarString(self): - r = self.makeRead() - self.assertEqual(r.cigarstring, "10M1D25M") - r.cigarstring = "20M10D20M" - self.assertEqual(r.cigartuples, [(0, 20), (2, 10), (0, 20)]) - # unsetting cigar string - r.cigarstring = None - 
self.assertEqual(r.cigarstring, None) - - def testCigar(self): - r = self.makeRead() - self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 25)]) - # unsetting cigar string - r.cigartuples = None - self.assertEqual(r.cigartuples, None) - - def testLongTags(self): - '''see issue 115''' - - r = self.makeRead() - rg = 'HS2000-899_199.L3' - tags = [('XC', 85), ('XT', 'M'), ('NM', 5), - ('SM', 29), ('AM', 29), ('XM', 1), - ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'), - ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')] - - r.tags = tags - r.tags += [("RG", rg)] * 100 - tags += [("RG", rg)] * 100 - - self.assertEqual(tags, r.tags) - - def testArrayTags(self): - - r = self.makeRead() - - def c(r, l): - r.tags = [('ZM', l)] - self.assertEqual(list(r.opt("ZM")), list(l)) - - # signed integers - c(r, (-1, 1)) - c(r, (-1, 100)) - c(r, (-1, 200)) - c(r, (-1, 1000)) - c(r, (-1, 30000)) - c(r, (-1, 50000)) - c(r, (1, -1)) - c(r, (1, -100)) - c(r, (1, -200)) - c(r, (1, -1000)) - c(r, (1, -30000)) - c(r, (1, -50000)) - - # unsigned integers - c(r, (1, 100)) - c(r, (1, 1000)) - c(r, (1, 10000)) - c(r, (1, 100000)) - - # floats - c(r, (1.0, 100.0)) - - + class TestClipping(unittest.TestCase): def testClipping(self): @@ -1773,10 +1664,12 @@ class TestDeNovoConstruction(unittest.TestCase): '''check if individual reads are binary equal.''' infile = pysam.AlignmentFile(self.bamfile, "rb") - others = list(infile) - for denovo, other in zip(others, self.reads): - checkFieldEqual(self, other, denovo) - self.assertEqual(other.compare(denovo), 0) + references = list(infile) + for denovo, reference in zip(references, self.reads): + checkFieldEqual(self, reference, denovo) + print("reference", str(reference), reference.get_tags(with_value_type=True)) + print("denovo", str(denovo), denovo.get_tags(with_value_type=True)) + self.assertEqual(reference.compare(denovo), 0) # TODO # def testSAMPerRead(self): @@ -2132,7 +2025,7 @@ class TestPileup(unittest.TestCase): def setUp(self): self.samfile = pysam.AlignmentFile(self.samfilename) - self.fastafile = pysam.Fastafile(self.fastafilename) + self.fastafile = pysam.FastaFile(self.fastafilename) def tearDown(self): self.samfile.close() @@ -2173,7 +2066,8 @@ class TestPileup(unittest.TestCase): self.checkEqual(refs, iterator) -class TestCountCoverage(unittest.TestCase): +class TestPileupFastafile(TestPileup): + '''test pileup functionality - backwards compatibility''' samfilename = os.path.join(BAM_DATADIR, "ex1.bam") fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") @@ -2183,6 +2077,17 @@ class TestCountCoverage(unittest.TestCase): self.samfile = pysam.AlignmentFile(self.samfilename) self.fastafile = pysam.Fastafile(self.fastafilename) + +class TestCountCoverage(unittest.TestCase): + + samfilename = os.path.join(BAM_DATADIR, "ex1.bam") + fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") + + def setUp(self): + + self.samfile = pysam.AlignmentFile(self.samfilename) + self.fastafile = pysam.FastaFile(self.fastafilename) + samfile = pysam.AlignmentFile( "tests/test_count_coverage_read_all.bam", 'wb', template=self.samfile) diff --git a/tests/TestUtils.py b/tests/TestUtils.py index dc95e09..c5572d3 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -21,8 +21,12 @@ CBCF_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), LINKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "linker_tests")) +TESTS_TEMPDIR = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "tmp")) + + IS_PYTHON3 = sys.version_info[0] >= 3 + if IS_PYTHON3: from itertools import zip_longest from urllib.request import urlopen @@ -192,11 +196,15 @@ def check_lines_equal(cls, a, b, sort=False, filter_f=None, msg=None): def get_temp_filename(suffix=""): caller_name = inspect.getouterframes(inspect.currentframe(), 2)[1][3] + try: + os.makedirs(TESTS_TEMPDIR) + except OSError: + pass f = tempfile.NamedTemporaryFile( prefix="tmp_{}_".format(caller_name), suffix=suffix, delete=False, - dir="tests") + dir=TESTS_TEMPDIR) f.close() return f.name diff --git a/tests/linking_test.py b/tests/linking_test.py index 623c3a2..25b9b04 100644 --- a/tests/linking_test.py +++ b/tests/linking_test.py @@ -20,7 +20,7 @@ def check_import(statement): raise -def check_tests_pass(statement): +def check_pass(statement): try: output = subprocess.check_output( statement, stderr=subprocess.STDOUT, shell=True) @@ -31,6 +31,9 @@ def check_tests_pass(statement): return True +@unittest.skipUnless( + os.environ.get("PYSAM_LINKING_TESTS", None), + "enable linking tests by setting PYSAM_LINKING_TESTS environment variable") class TestLinking(unittest.TestCase): package_name = "link_with_rpath" @@ -43,15 +46,22 @@ class TestLinking(unittest.TestCase): "cd {} && rm -rf build && python setup.py install".format(self.workdir), shell=True) + +@unittest.skipUnless( + os.environ.get("PYSAM_LINKING_TESTS", None), + "enable linking tests by setting PYSAM_LINKING_TESTS environment variable") class TestLinkWithRpath(TestLinking): package_name = "link_with_rpath" def test_package_tests_pass(self): - self.assertTrue(check_tests_pass( + self.assertTrue(check_pass( "cd {} && python test_module.py".format(os.path.join(self.workdir, "tests")))) +@unittest.skipUnless( + os.environ.get("PYSAM_LINKING_TESTS", None), + "enable linking tests by setting PYSAM_LINKING_TESTS environment variable") class TestLinkWithoutRpath(TestLinking): package_name = "link_without_rpath" @@ -69,7 +79,7 @@ class TestLinkWithoutRpath(TestLinking): pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries]) pysam_libdir = pysam_libdirs[0] - self.assertTrue(check_tests_pass( + self.assertTrue(check_pass( "export LD_LIBRARY_PATH={}:$PATH && cd {} && python test_module.py".format( pysam_libdir, os.path.join(self.workdir, "tests")))) diff --git a/tests/samtools_test.py b/tests/samtools_test.py index 5494e1b..a926f5c 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -5,9 +5,7 @@ Execute in the :file:`tests` directory as it requires the Makefile and data files located there. 
''' -import pysam -import pysam.samtools -import pysam.bcftools +import warnings import unittest import os import re @@ -15,6 +13,9 @@ import glob import sys import subprocess import shutil +import pysam +import pysam.samtools +import pysam.bcftools from TestUtils import checkBinaryEqual, check_lines_equal, \ check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \ BAM_DATADIR @@ -130,7 +131,7 @@ class SamtoolsTest(unittest.TestCase): return re.sub("[^0-9.]", "", s) if _r(samtools_version) != _r(pysam.__samtools_version__): - raise ValueError( + warnings.warn( "versions of pysam.%s and %s differ: %s != %s" % (self.executable, self.executable, @@ -222,7 +223,7 @@ class SamtoolsTest(unittest.TestCase): error_msg = "%s failed: files %s and %s are not the same" % (command, s, p) if binary_equal: continue - if s.endswith(".bam"): + elif s.endswith(".bam"): self.assertTrue( check_samtools_view_equal( s, p, without_header=True), @@ -236,7 +237,9 @@ class SamtoolsTest(unittest.TestCase): def testStatements(self): for statement in self.statements: command = self.get_command(statement, map_to_internal=False) - if command in ("bedcov", "stats", "dict"): + # bam2fq differs between version 1.5 and 1.6 - reenable if + # bioconda samtools will be available. + if command in ("bedcov", "stats", "dict", "bam2fq"): continue if (command == "calmd" and @@ -268,6 +271,7 @@ class SamtoolsTest(unittest.TestCase): self.assertTrue(re.search(expected, usage_msg) is not None) def tearDown(self): + return if os.path.exists(WORKDIR): shutil.rmtree(WORKDIR) os.chdir(self.savedir) diff --git a/tests/tabix_test.py b/tests/tabix_test.py index 1b6d450..890130d 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -78,6 +78,15 @@ class TestIndexing(unittest.TestCase): pysam.tabix_index(self.tmpfilename, preset="gff") self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx)) + def test_indexing_to_custom_location_works(self): + '''test indexing a file with a non-default location.''' + + index_path = get_temp_filename(suffix='custom.tbi') + pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True) + self.assertTrue(checkBinaryEqual(index_path, self.filename_idx)) + os.unlink(index_path) + + def test_indexing_with_explict_columns_works(self): '''test indexing via preset.''' @@ -101,7 +110,8 @@ class TestIndexing(unittest.TestCase): def tearDown(self): os.unlink(self.tmpfilename) - os.unlink(self.tmpfilename + ".tbi") + if os.path.exists(self.tmpfilename + ".tbi"): + os.unlink(self.tmpfilename + ".tbi") class TestCompression(unittest.TestCase): @@ -362,7 +372,7 @@ class TestIterationWithoutComments(IterationTest): x = x.decode("ascii") if not x.startswith("#"): break - ref.append(x[:-1].encode('ascii')) + ref.append(x[:-1]) header = list(self.tabix.header) self.assertEqual(ref, header) @@ -592,7 +602,9 @@ if IS_PYTHON3: self.vcf = pysam.VCF() self.assertRaises( UnicodeDecodeError, - self.vcf.connect, self.tmpfilename + ".gz", "ascii") + self.vcf.connect, + self.tmpfilename + ".gz", + "ascii") self.vcf.connect(self.tmpfilename + ".gz", encoding="utf-8") v = self.vcf.getsamples()[0] @@ -1023,16 +1035,16 @@ for vcf_file in vcf_files: class TestRemoteFileHTTP(unittest.TestCase): - url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_htslib.gtf.gz" + url = "http://www.cgat.org/downloads/public/pysam/test/example.gtf.gz" region = "chr1:1-1000" local = os.path.join(TABIX_DATADIR, "example.gtf.gz") def setUp(self): - if not checkURL(self.url): + if not 
pysam.config.HAVE_LIBCURL or not checkURL(self.url): self.remote_file = None - return - - self.remote_file = pysam.TabixFile(self.url, "r") + else: + self.remote_file = pysam.TabixFile(self.url, "r") + self.local_file = pysam.TabixFile(self.local, "r") def tearDown(self): @@ -1058,12 +1070,29 @@ class TestRemoteFileHTTP(unittest.TestCase): return self.assertEqual(list(self.local_file.header), []) - self.assertRaises(AttributeError, - getattr, - self.remote_file, - "header") +class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): + + url = "http://www.cgat.org/downloads/public/pysam/test/example_comments.gtf.gz" + region = "chr1:1-1000" + local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") + + def setUp(self): + if not pysam.config.HAVE_LIBCURL or not checkURL(self.url): + self.remote_file = None + else: + self.remote_file = pysam.TabixFile(self.url, "r") + self.local_file = pysam.TabixFile(self.local, "r") + + def testHeader(self): + if self.remote_file is None: + return + + self.assertEqual(list(self.local_file.header), ["# comment at start"]) + self.assertEqual(list(self.local_file.header), self.remote_file.header) + + class TestIndexArgument(unittest.TestCase): filename_src = os.path.join(TABIX_DATADIR, "example.vcf.gz") diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index ff68c81..35ad8fc 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -145,7 +145,7 @@ class TestGTF(TestParser): self.assertEqual("\t".join(map(str, c)), str(r)) - def testSetting(self): + def test_setting_fields(self): r = self.tabix.fetch(parser=self.parser()).next() @@ -166,6 +166,14 @@ class TestGTF(TestParser): self.assertTrue("gene_id \"0001\"" in sr) self.assertTrue("transcript_id \"0002\"" in sr) + def test_setAttribute_makes_changes(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.setAttribute("transcript_id", "abcd") + sr = str(r) + self.assertEqual(r.transcript_id, "abcd") + self.assertTrue("transcript_id \"abcd\"" in sr) + def test_added_attribute_is_output(self): r = self.tabix.fetch(parser=self.parser()).next() @@ -311,7 +319,7 @@ class TestGFF3(TestGTF): str(r)) self.assertTrue(r.ID.startswith("MI00")) - def testSetting(self): + def test_setting_fields(self): for r in self.tabix.fetch(parser=self.parser()): r.contig = r.contig + "_test_contig" @@ -328,7 +336,15 @@ class TestGFF3(TestGTF): self.assertTrue("test_source" in sr) self.assertTrue("test_feature" in sr) self.assertTrue("ID=test" in sr) - + + def test_setAttribute_makes_changes(self): + + r = self.tabix.fetch(parser=self.parser()).next() + r.setAttribute("transcript_id", "abcd") + sr = str(r) + self.assertEqual(r.transcript_id, "abcd") + self.assertTrue("transcript_id=abcd" in sr) + def test_added_attribute_is_output(self): r = self.tabix.fetch(parser=self.parser()).next()