From: Nilesh Patra Date: Fri, 15 Oct 2021 07:47:18 +0000 (+0200) Subject: python-pysam (0.17.0+ds-2) unstable; urgency=medium X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2^2^2~3 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=dce516869ddefbb578657bce28d610f9ce5641c2;p=python-pysam.git python-pysam (0.17.0+ds-2) unstable; urgency=medium * Team Upload. * Move from experimental to unstable * Skip failing test on 32-bit [dgit import unpatched python-pysam 0.17.0+ds-2] --- dce516869ddefbb578657bce28d610f9ce5641c2 diff --cc debian/changelog index 0000000,0000000..5d16f17 new file mode 100644 --- /dev/null +++ b/debian/changelog @@@ -1,0 -1,0 +1,463 @@@ ++python-pysam (0.17.0+ds-2) unstable; urgency=medium ++ ++ * Team Upload. ++ * Move from experimental to unstable ++ * Skip failing test on 32-bit ++ ++ -- Nilesh Patra Fri, 15 Oct 2021 13:17:18 +0530 ++ ++python-pysam (0.17.0+ds-1) experimental; urgency=medium ++ ++ * Team Upload. ++ [ Andreas Tille ] ++ * debhelper-compat 13 (routine-update) ++ * Update copyright ++ * Do not ignore test results via '|| true' ++ * Exclude another test that is failing ++ ++ [ Nilesh Patra ] ++ * New upstream version 0.17.0+ds ++ * Standards version: 4.6.0 ++ * Bump watch file version to 4 ++ * Re-diff patches ++ ++ -- Nilesh Patra Fri, 15 Oct 2021 01:13:49 +0530 ++ ++python-pysam (0.15.4+ds-3) unstable; urgency=medium ++ ++ * Remove Python2 package ++ Closes: #938092 ++ * Remove unneeded debian/gbp.conf ++ * Standards-Version: 4.5.0 (routine-update) ++ * Add salsa-ci file (routine-update) ++ * Rules-Requires-Root: no (routine-update) ++ * Set upstream metadata fields: Bug-Submit. ++ ++ -- Andreas Tille Fri, 08 May 2020 20:27:35 +0200 ++ ++python-pysam (0.15.4+ds-2) unstable; urgency=medium ++ ++ * Dont run unittests for python2 binary, to reduce pytest rdeps ++ ++ -- Sandro Tosi Mon, 13 Apr 2020 23:49:01 -0400 ++ ++python-pysam (0.15.4+ds-1) unstable; urgency=medium ++ ++ * Team upload. 
++ * New upstream version ++ * python-pysam-tests: Multi-Arch: foreign ++ * Try to keep example_no_seq_in_header_null_bytes.bam from being deleted ++ ++ -- Michael R. Crusoe Thu, 23 Jan 2020 15:00:17 +0100 ++ ++python-pysam (0.15.3+ds-5) unstable; urgency=medium ++ ++ [ Chris Lamb ] ++ * make the build reproducible ++ Closes: #948280 ++ ++ [ Andreas Tille ] ++ * Standards-Version: 4.4.1 ++ * debian/copyright: use spaces rather than tabs to start continuation ++ lines. ++ * Set upstream metadata fields: Bug-Database, Repository, Repository- ++ Browse. ++ ++ -- Andreas Tille Mon, 06 Jan 2020 17:44:23 +0100 ++ ++python-pysam (0.15.3+ds-4) unstable; urgency=medium ++ ++ * Team upload. ++ * Replace missing hts_seek and hts_tell equivalents ++ ++ -- Michael R. Crusoe Sun, 05 Jan 2020 21:23:11 +0100 ++ ++python-pysam (0.15.3+ds-3) unstable; urgency=medium ++ ++ * Team upload. ++ * -docs: oops, don't try to ship nonexistent debian/tests/run-nose{3,}-tests ++ ++ -- Michael R. Crusoe Sun, 05 Jan 2020 08:01:33 +0100 ++ ++python-pysam (0.15.3+ds-2) unstable; urgency=medium ++ ++ * Team upload. ++ * new patch: Update deprecated samtools import test commands to samtools view ++ * new patch: Update samtools + bcftools to v1.10 ++ * disable tests for now ++ ++ -- Michael R. Crusoe Sat, 04 Jan 2020 23:19:04 +0100 ++ ++python-pysam (0.15.3+ds-1) unstable; urgency=medium ++ ++ * Team upload. ++ * New upstream release. Not yet dropping py2 due to paleomix and pbsuite ++ * debhelper-compat 12 ++ * Standards-Version: 4.4.0 ++ * Trim trailing whitespace. ++ * Use secure URI in Homepage field. ++ * Set upstream metadata fields: Repository. ++ * New patch to remove symbols that were dropped in htslib 1.10: ++ hts_utell, hts_useek (Closes: #947426) ++ ++ -- Michael R. Crusoe Fri, 03 Jan 2020 21:25:33 +0100 ++ ++python-pysam (0.15.2+ds-2) unstable; urgency=medium ++ ++ * Team upload. ++ * For the Tabix tests: test the index contents, not the compression scheme. 
++ Closes: #919928, #920250 ++ ++ -- Michael R. Crusoe Wed, 20 Feb 2019 02:45:47 -0800 ++ ++python-pysam (0.15.2+ds-1) unstable; urgency=medium ++ ++ * Team upload. ++ * New upstream version ++ * Standards-Version: 4.3.0, no changes needed ++ * added Py2 and Py3 versions of ${python:Provides} ++ * Fix lintian found spelling typos. ++ * debian/tests/control.autodep8 → debian/tests/control. ++ * remove errant log.txt from the packages. ++ ++ -- Michael R. Crusoe Thu, 17 Jan 2019 01:25:11 -0800 ++ ++python-pysam (0.15.1+ds-1) unstable; urgency=medium ++ ++ * Team upload. ++ ++ * New upstream version. ++ - Removes autogenerated config.h - [many thanks!] ++ https://github.com/pysam-developers/pysam/issues/714 ++ - Explicit compatibility with Python 3.7 ++ * Removed patch skipping test that complained on missing file ++ ++ -- Steffen Moeller Fri, 14 Sep 2018 10:44:51 +0200 ++ ++python-pysam (0.15.0.1+ds-2) unstable; urgency=medium ++ ++ * Remove ancient fields X-Python*-Version ++ * Adjust patches for build time test suite ++ * Standards-Version: 4.2.1 ++ ++ -- Andreas Tille Wed, 12 Sep 2018 18:37:42 +0200 ++ ++python-pysam (0.15.0.1+ds-1) experimental; urgency=medium ++ ++ * Team upload. ++ ++ [ Steffen Moeller ] ++ * New upstream version. 
++ ++ [ Afif Elghraoui ] ++ * New upstream version 0.14.1+ds ++ * Update patches ++ ++ [ Andreas Tille ] ++ * Testsuite: autopkgtest-pkg-python ++ * Rename d/tests/control to d/tests/control.autodep8 ++ * Standards-Version: 4.1.4 ++ ++ -- Steffen Moeller Sun, 29 Jul 2018 00:51:38 +0200 ++ ++python-pysam (0.14+ds-2) unstable; urgency=medium ++ ++ * Team upload ++ * Add Python 2.7 compatibility symlink for libchtslib.so (Closes: #890748) ++ * Update Vcs-* URIs for move to salsa.debian.org ++ * Remove trailing whitespace from debian/changelog ++ ++ -- Graham Inggs Mon, 19 Feb 2018 10:11:17 +0000 ++ ++python-pysam (0.14+ds-1) unstable; urgency=medium ++ ++ * New upstream version ++ * d/rules: Exclude tests using http access ++ * Bump versioned dependencies to samtools and related from 1.6 to 1.7 ++ * cme fix dpkg-control ++ * debhelper 11 ++ * do not remove samtools/tmp_file.h in clean target ++ * do not depend from non-existing data file in make test target ++ * Update d/copyright ++ * Update lintian overrides ++ ++ -- Andreas Tille Sat, 17 Feb 2018 21:45:07 +0100 ++ ++python-pysam (0.13.0+ds-1) unstable; urgency=medium ++ ++ * New upstream version ++ * Bump versioned Build-Depends on libhts-dev, samtools and bcftools to 1.6 ++ * Exclude tests accessing remote http sites ++ * Lintian-override for false positive ++ * Remove unused paragraphs from d/copyright ++ ++ -- Andreas Tille Thu, 14 Dec 2017 16:36:43 +0100 ++ ++python-pysam (0.12.0.1+ds-4) unstable; urgency=medium ++ ++ * Team upload. ++ * Revert "Skip tests on ppc64el to avoid build-dep on bcftools currently ++ uninstallable". It seems to be installable now, and anyway now that the ++ python3 transition is done we should fix it properly anyway. ++ * d/control: ++ + Bump the libhts-dev buil-dep to 1.5-3, to make sure we gain appropriate ++ versioned symbols and therefore an appropriate versioned dependency on ++ libhts2. Closes: #879867 ++ + Bump Standards-Version to 4.1.1. 
++ ++ -- Mattia Rizzolo Fri, 10 Nov 2017 12:56:10 +0100 ++ ++python-pysam (0.12.0.1+ds-3) unstable; urgency=medium ++ ++ * Team upload. ++ * Support DEB_BUILD_OPTIONS=nocheck. ++ * Skip tests on ppc64el to avoid build-dep on bcftools which is currently ++ uninstallable. ++ Hopefully this will help unstuck the current python3 transition. ++ ++ -- Mattia Rizzolo Tue, 24 Oct 2017 18:54:29 +0200 ++ ++python-pysam (0.12.0.1+ds-2) unstable; urgency=low ++ ++ * Update autopkgtest for new test suite driver ++ ++ -- Afif Elghraoui Fri, 06 Oct 2017 23:38:45 -0400 ++ ++python-pysam (0.12.0.1+ds-1) unstable; urgency=medium ++ ++ [ Andreas Tille ] ++ * Drop patch applied upstream ++ * Standards-Version: 4.1.0 (no changes needed) ++ * Apply upstream patch to fix test suite ++ * Use pytest instead of nosetest ++ ++ [ Afif Elghraoui ] ++ * New upstream version ++ Closes: #871083, #834856 ++ * Bump htslib suite minimum versions ++ * Use Build Profiles to mark build-dependencies needed only for tests ++ * Temporarily skip a failing test (reported upstream) ++ ++ [ Steffen Moeller ] ++ * created debian/upstream/metadata: references to registries ++ ++ -- Afif Elghraoui Sun, 01 Oct 2017 12:30:30 -0400 ++ ++python-pysam (0.11.2.2+ds-3) unstable; urgency=medium ++ ++ * Team upload ++ * Link with -Wl,--as-needed and avoid another Python 2.7 ++ compatibility symlink for libchtslib.so ++ ++ -- Graham Inggs Tue, 01 Aug 2017 14:15:58 +0200 ++ ++python-pysam (0.11.2.2+ds-2) unstable; urgency=medium ++ ++ * Team upload ++ * Mark debian/python-pysam.links executable for dh-exec ++ * Drop exclude_test_tyring_to_access_remote_ftpserver.patch, ++ fixed upstream ++ * Drop pysam_stdout_linkage.patch, not needed since ++ compatibility symlinks were added ++ ++ -- Graham Inggs Mon, 31 Jul 2017 14:05:22 +0200 ++ ++python-pysam (0.11.2.2+ds-1) unstable; urgency=medium ++ ++ [ Afif Elghraoui ] ++ * Imported Upstream version 0.11.2.2+ds ++ * Update patches ++ ++ [ Andreas Tille ] ++ * Apply patches 
suggested by Steve Langasek (thanks for this Steve) ++ Closes: #867017, LP: #1701268 ++ * debhelper 10 ++ * Standards-Version: 4.0.0 (no changes needed) ++ * Add some symlinks to run autopkgtests correctly ++ ++ -- Andreas Tille Sat, 29 Jul 2017 09:03:05 +0200 ++ ++python-pysam (0.10.0+ds-2) unstable; urgency=medium ++ ++ * d/rules: ++ - Add some files that need to be removed after running tests ++ - Remove other autogenerated files to build twice in a row ++ * Exclude test tyring to access remote ftpserver ++ Closes: #861496 ++ ++ -- Andreas Tille Thu, 04 May 2017 15:06:21 +0200 ++ ++python-pysam (0.10.0+ds-1) unstable; urgency=medium ++ ++ [ Afif Elghraoui ] ++ * New upstream release ++ * Update patches ++ ++ [ Andreas Tille ] ++ * d/rules: Remove redundant get-orig-source target ++ * hardening=+all ++ * Fix lintian overrides ++ ++ [ Afif Elghraoui ] ++ * Do not use internal htslib ++ ++ -- Afif Elghraoui Thu, 26 Jan 2017 04:36:11 -0800 ++ ++python-pysam (0.9.1.4+ds-1) unstable; urgency=medium ++ ++ * Imported Upstream version 0.9.1.4+ds ++ * Drop patch applied upstream ++ * Drop unused lintian overrides ++ * Fix spelling issues ++ ++ -- Afif Elghraoui Sat, 23 Jul 2016 18:47:31 -0700 ++ ++python-pysam (0.9.1+ds-1) unstable; urgency=medium ++ ++ * Imported Upstream version 0.9.1+ds ++ * Force at least matching versions of the samtools suite ++ * Bump Standards-Version to 3.9.8 ++ * Globally use C.UTF-8 locale ++ * Update patch for external htslib ++ * Refresh patch ++ * Drop obsolete patches ++ * Fix handling of configuration headers generated at build time ++ * Rely more on pybuild for build-time tests and respect exit code ++ * Fix autopkgtests ++ ++ -- Afif Elghraoui Sun, 19 Jun 2016 18:43:53 -0700 ++ ++python-pysam (0.9.0+ds-1) unstable; urgency=medium ++ ++ * Add filenamemangle to d/watch and space out content ++ * Imported Upstream version 0.9.0+ds (Closes: #814765) ++ * Bump htslib and samtools minimum versions ++ * Update policy statndards-version to 3.9.7 
++ * Use encrypted protocols for Vcs URLs in d/control ++ * Use readthedocs page as pysam's homepage ++ * Update packaging for external htslib link ++ * Refresh patches ++ * Delete obsolete patch ++ ++ -- Afif Elghraoui Wed, 09 Mar 2016 23:43:59 -0800 ++ ++python-pysam (0.8.4+ds-1) unstable; urgency=medium ++ ++ * Add version constraints to build-dependencies. ++ * Drop ds suffix versioning. ++ * Imported Upstream version 0.8.4+ds ++ * Remove patches applied upstream ++ * Refresh existing patches ++ * Refine d/rules ++ * Set Vcs-Browser to point to cgit rather than gitweb ++ * Reduce dependencies in autopktest dependencies ++ * Replace patch for network-dependent tests ++ * Add new lintian overrides ++ ++ -- Afif Elghraoui Fri, 13 Nov 2015 22:15:14 -0800 ++ ++python-pysam (0.8.3+ds1-3) unstable; urgency=medium ++ ++ * Backport upstream commit 6efb22b to permit building with Cython 0.23.x ++ (Closes: 800794) ++ * Revise lintian overrides. ++ ++ -- Afif Elghraoui Tue, 06 Oct 2015 00:57:19 -0700 ++ ++python-pysam (0.8.3+ds1-2) unstable; urgency=medium ++ ++ [ Afif Elghraoui ] ++ * Improve python3 compatibility for upstream test sources ++ * Refresh older patches ++ * Fix autopkgtests ++ * Fix error in sam_mpileup.patch ++ * Remove unused lintian override ++ ++ [ Andreas Tille ] ++ * Remove tests relying on online connection from test suite ++ * For the moment do some dirty tricks in test run script ++ * The automatic nosetest trigger does not work and this it is ++ switched back to manual nosetests invocation ++ ++ -- Afif Elghraoui Fri, 24 Jul 2015 10:12:41 +0200 ++ ++python-pysam (0.8.3+ds1-1) experimental; urgency=medium ++ ++ * Team upload. ++ ++ [ Jorge Soares ] ++ * New upstream version ++ * provide python3-pysam ++ ++ [ Charles Plessy ] ++ * Requires Python 2.7 or higher. 
++ ++ [ Andreas Tille ] ++ * Link against htslib ++ * d/watch: dversionmangle ++ ++ [ Afif Elghraoui ] ++ * New upstream releases (Closes: #763218) ++ * Remove unnecessary test-suite declaration in d/control ++ * Remove nonexistent files from copyright explanations ++ * Allow building of the package when non-ASCII characters are in the path ++ * Add to package long description ++ * Resolve lintian "duplicated-compressed-file" ++ * Make package descriptions unique ++ * Remove extra license definitions from d/copyright ++ * Fix spelling errors in source distribution (forwarded upstream as well) ++ * Add lintian overrides ++ * Exclude bundled htslib convenience-copy ++ * Provide get-orig-source rule ++ * Support building with missing htslib directory ++ * Add dependency on cython for autopkgtests to provide pyximport ++ * Revamp debian/copyright ++ ++ -- Afif Elghraoui Thu, 25 Jun 2015 10:44:30 +0200 ++ ++python-pysam (0.7.7-1) unstable; urgency=medium ++ ++ * New upstream releases. ++ * Upstream source code moved to GitHub. ++ * Watch the Python Package Index since there are no relevant tags on GitHub. ++ * Added a git-buildpackage configuration file to mark its usage. ++ * Build-depend samtools (>= 0.1.19); this is needed for the regression tests ++ in Wheezy. ++ * debian/patches/offline-tests.patch: correction from a later release. 
++ ++ -- Charles Plessy Sat, 19 Apr 2014 14:17:42 +0900 ++ ++python-pysam (0.7.5-5) unstable; urgency=medium ++ ++ * Add make to autopkgtest dependencies ++ Closes: #741274 ++ ++ -- Andreas Tille Wed, 19 Mar 2014 13:30:15 +0100 ++ ++python-pysam (0.7.5-4) unstable; urgency=medium ++ ++ * Fix autotest ++ Closes: #741274 ++ ++ -- Andreas Tille Tue, 11 Mar 2014 20:08:15 +0100 ++ ++python-pysam (0.7.5-3) unstable; urgency=medium ++ ++ * Do not install tests in world writable dir ++ Closes: #739575 ++ ++ -- Andreas Tille Sat, 01 Mar 2014 23:40:21 +0100 ++ ++python-pysam (0.7.5-2) unstable; urgency=medium ++ ++ * debian/rules: Set PYTHONPATH correctly using dh_python ++ (thanks to Piotr Ożarowski for the patch) ++ Closes: #739631 ++ ++ -- Andreas Tille Thu, 20 Feb 2014 19:01:46 +0100 ++ ++python-pysam (0.7.5-1) unstable; urgency=low ++ ++ * Initial release (Closes: #738665) ++ ++ -- Andreas Tille Fri, 07 Feb 2014 18:29:40 +0100 diff --cc debian/clean index 0000000,0000000..6987d15 new file mode 100644 --- /dev/null +++ b/debian/clean @@@ -1,0 -1,0 +1,4 @@@ ++tests/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai ++tests/pysam_data/ex1.fa.gz ++tests/pysam_data/ex1.fa.gz.gzi ++tests/pysam_data/ex1_csi.bam.csi diff --cc debian/control index 0000000,0000000..64efc57 new file mode 100644 --- /dev/null +++ b/debian/control @@@ -1,0 -1,0 +1,49 @@@ ++Source: python-pysam ++Maintainer: Debian Med Packaging Team ++Uploaders: Charles Plessy , ++ Andreas Tille ++Section: python ++Testsuite: autopkgtest-pkg-python ++Priority: optional ++Build-Depends: debhelper-compat (= 13), ++ dh-exec, ++ dh-python, ++ libhts-dev (>= 1.9), ++ zlib1g-dev, ++ python3-all-dev, ++ python3-setuptools, ++ cython3 (>= 0.23), ++ tabix , ++ samtools (>= 1.9) , ++ bcftools (>= 1.9) , ++ python3-pytest ++Standards-Version: 4.6.0 ++Vcs-Browser: https://salsa.debian.org/med-team/python-pysam ++Vcs-Git: https://salsa.debian.org/med-team/python-pysam.git ++Homepage: https://pysam.readthedocs.org/en/latest 
++Rules-Requires-Root: no ++ ++Package: python3-pysam ++Architecture: any ++Depends: ${shlibs:Depends}, ++ ${misc:Depends}, ++ ${python3:Depends} ++Description: interface for the SAM/BAM sequence alignment and mapping format (Python 3) ++ Pysam is a Python module for reading and manipulating Samfiles. It's a ++ lightweight wrapper of the samtools C-API. Pysam also includes an interface ++ for tabix. ++ . ++ This package installs the module for Python 3. ++ ++Package: python-pysam-tests ++Multi-Arch: foreign ++Architecture: all ++Depends: ${misc:Depends} ++Enhances: python3-pysam ++Description: interface for the SAM/BAM sequence alignment and mapping format (test data) ++ Pysam is a Python module for reading and manipulating Samfiles. It's a ++ lightweight wrapper of the samtools C-API. Pysam also includes an interface ++ for tabix. ++ . ++ This package contains the data provided by upstream to run the pysam ++ test suite. diff --cc debian/copyright index 0000000,0000000..49b2c86 new file mode 100644 --- /dev/null +++ b/debian/copyright @@@ -1,0 -1,0 +1,123 @@@ ++Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ ++Upstream-Name: pysam ++Upstream-Contact: Andreas Heger ++Source: https://github.com/pysam-developers/pysam ++Files-Excluded: htslib/* ++ ++Files: * ++Copyright: 2009-2018 Andreas Heger, ++ Tildon Grant Belgrad, ++ Martin Goodson, ++ Kevin Jacobs ++ 2008-2010 Genome Research Ltd. ++License: MIT ++ ++Files: bcftools/* ++Copyright: ++ 2013-2018 Genome Research Ltd. ++ 2010-2011 Broad Institute ++License: MIT ++ ++Files: samtools/* ++Copyright: 2009-2012 Broad Institute ++ 2008-2014 Genome Research Ltd. ++License: MIT ++ ++Files: samtools/bam_cat.* ++Copyright: 2008-2009, 2011-2013 Genome Research Ltd. ++ 2010 Illumina, Inc. 
++License: MIT ++ ++Files: samtools/bam_color.* ++Copyright: 2009, 2012 University of California - Los Angeles ++License: MIT ++ ++Files: samtools/bam_index.* ++ samtools/bam_mate.* ++Copyright: 2008-2014 Genome Research Ltd. ++ 2010-2011 Broad Institute ++ 2012-2013 Peter Cock, The James Hutton Institute ++License: MIT ++ ++Files: samtools/padding.* ++Copyright: 2011-2012 Broad Institute ++ 2014 Genome Research Ltd. ++ 2012-2013 Peter Cock, The James Hutton Institute ++License: MIT ++ ++Files: win32/stdint.h ++Copyright: 2005-2007 Paul Hsieh ++License: BSD-3-clause ++ ++Files: win32/getopt.* ++Copyright: 1987-2001 Free Software Foundation, Inc. ++License: LGPL-2.1+ ++ ++Files: debian/* ++Copyright: ++ 2015-2016 Afif Elghraoui ++ 2015 Jorge Soares ++ 2014-2015 Charles Plessy ++ 2014-2018 Andreas Tille ++License: MIT ++ ++License: BSD-3-clause ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ 1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ 2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ 3. Neither the name of the University nor the names of its contributors ++ may be used to endorse or promote products derived from this software ++ without specific prior written permission. ++ . ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE HOLDERS OR ++ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++License: MIT ++ Permission is hereby granted, free of charge, to any person obtaining a copy ++ of this software and associated documentation files (the "Software"), to deal ++ in the Software without restriction, including without limitation the rights ++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++ copies of the Software, and to permit persons to whom the Software is ++ furnished to do so, subject to the following conditions: ++ . ++ The above copyright notice and this permission notice shall be included in ++ all copies or substantial portions of the Software. ++ . ++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++ THE SOFTWARE. ++ ++License: LGPL-2.1+ ++ This package is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ . 
++ This package is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ . ++ You should have received a copy of the GNU General Public License ++ along with this program. If not, see <http://www.gnu.org/licenses/>. ++ . ++ On Debian systems, the complete text of the GNU Lesser General ++ Public License can be found in "/usr/share/common-licenses/LGPL-2.1". diff --cc debian/patches/bcftools_v1.10_full index 0000000,0000000..fad5c40 new file mode 100644 --- /dev/null +++ b/debian/patches/bcftools_v1.10_full @@@ -1,0 -1,0 +1,34431 @@@ ++Author: Michael R. Crusoe ++Description: sync with bcftools 1.10 ++ ++use devtools/import.py and the contents of the bcftools ++Debian package with its patches fully applied ++ ++--- python-pysam.orig/bcftools/LICENSE +++++ python-pysam/bcftools/LICENSE ++@@ -723,3 +723,26 @@ ++ ++ ----------------------------------------------------------------------------- ++ +++LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey) +++ +++The MIT License +++ +++Copyright (c) 2017-2018 GENOMICS plc +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. 
+++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++THE SOFTWARE. ++--- python-pysam.orig/bcftools/bam2bcf.c +++++ python-pysam/bcftools/bam2bcf.c ++@@ -125,6 +125,7 @@ ++ memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); ++ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); ++ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); +++ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); ++ } ++ ++ /* ++@@ -152,6 +153,7 @@ ++ memset(r->qsum,0,sizeof(float)*4); ++ memset(r->anno,0,sizeof(double)*16); ++ memset(r->p,0,sizeof(float)*25); +++ r->SCR = 0; ++ ++ if (ref_base >= 0) { ++ ref4 = seq_nt16_int[ref_base]; ++@@ -199,6 +201,7 @@ ++ if (q > 63) q = 63; ++ if (q < 4) q = 4; // MQ=0 reads count as BQ=4 ++ bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; +++ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; ++ // collect annotations ++ if (b < 4) ++ { ++@@ -225,8 +228,12 @@ ++ // collect for bias tests ++ if ( baseQ > 59 ) baseQ = 59; ++ if ( mapQ > 59 ) mapQ = 59; ++- int len, pos = get_position(p, &len); ++- int epos = (double)pos/(len+1) * bca->npos; +++ int len, epos = 0; +++ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) +++ { +++ int pos = get_position(p, &len); +++ epos = (double)pos/(len+1) * bca->npos; +++ } ++ int ibq = baseQ/60. * bca->nqual; ++ int imq = mapQ/60. 
* bca->nqual; ++ if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; ++@@ -650,6 +657,14 @@ ++ call->DP4[4*i+3] = calls[i].anno[3]; ++ } ++ } +++ if ( call->SCR ) +++ { +++ for (i=0; i<n; i++) +++ { +++ call->SCR[0] += calls[i].SCR; +++ call->SCR[1+i] = calls[i].SCR; +++ } +++ } ++ if ( call->ADF ) ++ { ++ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well ++@@ -702,19 +717,23 @@ ++ // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); ++ // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); ++ ++- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); +++ if ( bca->fmt_flag & B2B_INFO_RPB ) +++ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); ++ call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); ++ call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); ++ call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); ++ ++ #if CDF_MWU_TESTS ++- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); +++ // CDF version of MWU tests is not calculated by default +++ if ( bca->fmt_flag & B2B_INFO_RPB ) +++ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); ++ call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); ++ call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); ++ call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); ++ #endif ++ ++- call->vdb = calc_vdb(bca->alt_pos, bca->npos); +++ if ( bca->fmt_flag & B2B_INFO_VDB ) +++ call->vdb = calc_vdb(bca->alt_pos, bca->npos); ++ ++ return 0; ++ } ++@@ -790,6 +809,8 @@ ++ if ( fmt_flag&B2B_INFO_DPR ) ++ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); ++ } +++ if ( fmt_flag&B2B_INFO_SCR ) +++ bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1); ++ ++ float 
tmpf[16]; ++ for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; ++@@ -861,6 +882,8 @@ ++ if ( fmt_flag&B2B_FMT_DPR ) ++ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); ++ } +++ if ( fmt_flag&B2B_FMT_SCR ) +++ bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); ++ ++ return 0; ++ } ++--- python-pysam.orig/bcftools/bam2bcf.c.pysam.c +++++ python-pysam/bcftools/bam2bcf.c.pysam.c ++@@ -127,6 +127,7 @@ ++ memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); ++ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); ++ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); +++ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); ++ } ++ ++ /* ++@@ -154,6 +155,7 @@ ++ memset(r->qsum,0,sizeof(float)*4); ++ memset(r->anno,0,sizeof(double)*16); ++ memset(r->p,0,sizeof(float)*25); +++ r->SCR = 0; ++ ++ if (ref_base >= 0) { ++ ref4 = seq_nt16_int[ref_base]; ++@@ -201,6 +203,7 @@ ++ if (q > 63) q = 63; ++ if (q < 4) q = 4; // MQ=0 reads count as BQ=4 ++ bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; +++ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; ++ // collect annotations ++ if (b < 4) ++ { ++@@ -227,8 +230,12 @@ ++ // collect for bias tests ++ if ( baseQ > 59 ) baseQ = 59; ++ if ( mapQ > 59 ) mapQ = 59; ++- int len, pos = get_position(p, &len); ++- int epos = (double)pos/(len+1) * bca->npos; +++ int len, epos = 0; +++ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) +++ { +++ int pos = get_position(p, &len); +++ epos = (double)pos/(len+1) * bca->npos; +++ } ++ int ibq = baseQ/60. * bca->nqual; ++ int imq = mapQ/60. 
* bca->nqual; ++ if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; ++@@ -652,6 +659,14 @@ ++ call->DP4[4*i+3] = calls[i].anno[3]; ++ } ++ } +++ if ( call->SCR ) +++ { +++ for (i=0; i<n; i++) +++ { +++ call->SCR[0] += calls[i].SCR; +++ call->SCR[1+i] = calls[i].SCR; +++ } +++ } ++ if ( call->ADF ) ++ { ++ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well ++@@ -704,19 +719,23 @@ ++ // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); ++ // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); ++ ++- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); +++ if ( bca->fmt_flag & B2B_INFO_RPB ) +++ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); ++ call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); ++ call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); ++ call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); ++ ++ #if CDF_MWU_TESTS ++- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); +++ // CDF version of MWU tests is not calculated by default +++ if ( bca->fmt_flag & B2B_INFO_RPB ) +++ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); ++ call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); ++ call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); ++ call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); ++ #endif ++ ++- call->vdb = calc_vdb(bca->alt_pos, bca->npos); +++ if ( bca->fmt_flag & B2B_INFO_VDB ) +++ call->vdb = calc_vdb(bca->alt_pos, bca->npos); ++ ++ return 0; ++ } ++@@ -792,6 +811,8 @@ ++ if ( fmt_flag&B2B_INFO_DPR ) ++ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); ++ } +++ if ( fmt_flag&B2B_INFO_SCR ) +++ bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1); ++ ++ float 
tmpf[16]; ++ for (i=0; i<16; i++) tmpf[i] = bc->anno[i]; ++@@ -863,6 +884,8 @@ ++ if ( fmt_flag&B2B_FMT_DPR ) ++ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); ++ } +++ if ( fmt_flag&B2B_FMT_SCR ) +++ bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); ++ ++ return 0; ++ } ++--- python-pysam.orig/bcftools/bam2bcf.h +++++ python-pysam/bcftools/bam2bcf.h ++@@ -55,10 +55,18 @@ ++ #define B2B_INFO_AD (1<<9) ++ #define B2B_INFO_ADF (1<<10) ++ #define B2B_INFO_ADR (1<<11) +++#define B2B_INFO_SCR (1<<12) +++#define B2B_FMT_SCR (1<<13) +++#define B2B_INFO_VDB (1<<14) +++#define B2B_INFO_RPB (1<<15) ++ ++ #define B2B_MAX_ALLELES 5 ++ +++#define PLP_HAS_SOFT_CLIP(i) ((i)&1) +++#define PLP_SAMPLE_ID(i) ((i)>>1) +++ ++ typedef struct __bcf_callaux_t { +++ int fmt_flag; ++ int capQ, min_baseQ; ++ int openQ, extQ, tandemQ; // for indels ++ uint32_t min_support, max_support; // for collecting indel candidates ++@@ -77,10 +85,11 @@ ++ void *rghash; ++ } bcf_callaux_t; ++ +++// per-sample values ++ typedef struct { ++ uint32_t ori_depth; ++ unsigned int mq0; ++- int32_t *ADF, *ADR; +++ int32_t *ADF, *ADR, SCR; ++ float qsum[4]; ++ // The fields are: ++ // depth fwd .. 
ref (0) and non-ref (2) ++@@ -98,6 +107,7 @@ ++ float p[25]; // phred-scaled likelihood of each genotype ++ } bcf_callret1_t; ++ +++// values for all samples ++ typedef struct { ++ int tid, pos; ++ bcf_hdr_t *bcf_hdr; ++@@ -107,7 +117,7 @@ ++ int n_supp; // number of supporting non-reference reads ++ double anno[16]; ++ unsigned int depth, ori_depth, mq0; ++- int32_t *PL, *DP4, *ADR, *ADF; +++ int32_t *PL, *DP4, *ADR, *ADF, *SCR; ++ uint8_t *fmt_arr; ++ float vdb; // variant distance bias ++ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; ++--- python-pysam.orig/bcftools/bcftools.h +++++ python-pysam/bcftools/bcftools.h ++@@ -39,7 +39,15 @@ ++ #define FT_STDIN (1<<3) ++ ++ char *bcftools_version(void); +++ +++/// Report an error and exit -1 ++ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +++ +++/// Report an error and exit -1. If errno != 0, appends strerror(errno). +++// Note: unlike error() above, the message should not end with "\n" as a +++// newline will be added by the function. +++void error_errno(const char *format, ...) 
HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); +++ ++ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); ++ const char *hts_bcf_wmode(int file_type); ++ ++--- python-pysam.orig/bcftools/call.h +++++ python-pysam/bcftools/call.h ++@@ -49,12 +49,35 @@ ++ } ++ family_t; ++ +++// For the single-sample and grouped -G calling +++typedef struct +++{ +++ float *qsum; // QS(quality sum) values +++ int nqsum, dp; +++ double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc; +++} +++grp1_t; +++typedef struct +++{ +++ grp1_t *grp; +++ int ngrp; +++ int *smpl2grp; +++} +++grp_t; +++ +++// For the `-C alleles -i` constrained calling +++typedef struct +++{ +++ uint32_t n:31, used:1; +++ char **allele; +++} +++tgt_als_t; +++ ++ typedef struct _ccall_t ccall_t; ++ typedef struct ++ { ++ // mcall only ++- float *qsum; // QS(sum) values ++- int nqsum, npdg; +++ int npdg; ++ int *als_map, nals_map; // mapping from full set of alleles to trimmed set of alleles (old -> new) ++ int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old) ++ char **als; // array to hold the trimmed set of alleles to appear on output ++@@ -65,14 +88,19 @@ ++ uint16_t *trio[5][5]; // family type, second index: allele count (2-4, first two are unused) ++ double *GLs; ++ float *GPs; // FORMAT/GP: posterior probabilities ++- int32_t *GQs; // FORMAT/GQ: genotype qualities +++ int32_t *GQs, *ADs; // FORMAT/GQ: genotype qualities; AD: allelic depth for -G ++ int32_t *itmp; // temporary int array, used for new PLs with CALL_CONSTR_ALLELES ++- int n_itmp, nGPs; +++ int n_itmp, nGPs, nADs; ++ vcmp_t *vcmp; ++ double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes() ++ int32_t *ugts, *cgts; // unconstraind and constrained GTs ++ uint32_t output_tags; ++ char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN) +++ tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES +++ char *sample_groups; // for single-sample or grouped calling 
with -G +++ grp_t smpl_grp; +++ float *qsum; +++ int nqsum; ++ ++ // ccall only ++ double indel_frac, min_perm_p, min_lrt; ++--- /dev/null +++++ python-pysam/bcftools/cols.c ++@@ -0,0 +1,109 @@ +++/* +++ Copyright (C) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++ DEALINGS IN THE SOFTWARE. 
+++*/ +++ +++#include +++#include "cols.h" +++ +++cols_t *cols_split(const char *line, cols_t *cols, char delim) +++{ +++ if ( !cols ) cols = (cols_t*) calloc(1,sizeof(cols_t)); +++ if ( cols->rmme ) free(cols->rmme); +++ cols->n = 0; +++ cols->rmme = strdup(line); +++ char *ss = cols->rmme; +++ while (1) +++ { +++ char *se = ss; +++ while ( *se && *se!=delim ) se++; +++ char tmp = *se; +++ *se = 0; +++ cols->n++; +++ if ( cols->n > cols->m ) +++ { +++ cols->m += 10; +++ cols->off = (char**) realloc(cols->off, sizeof(*cols->off)*cols->m); +++ } +++ cols->off[ cols->n - 1 ] = ss; +++ if ( !tmp ) break; +++ ss = se + 1; +++ } +++ return cols; +++} +++ +++void cols_append(cols_t *cols, char *str) +++{ +++ if ( cols->rmme ) +++ { +++ size_t str_len = strlen(str); +++ size_t lst_len = strlen(cols->off[ cols->n - 1 ]); +++ size_t tot_len = 2 + str_len + lst_len + (cols->off[ cols->n - 1 ] - cols->rmme); +++ +++ cols_t *tmp_cols = (cols_t*)calloc(1,sizeof(cols_t)); +++ tmp_cols->rmme = (char*) calloc(tot_len,1); +++ tmp_cols->off = (char**) calloc(cols->n+1,sizeof(*tmp_cols->off)); +++ +++ char *ptr = tmp_cols->rmme; +++ int i; +++ for (i=0; in; i++) +++ { +++ size_t len = strlen(cols->off[i]); +++ memcpy(ptr, cols->off[i], len); +++ tmp_cols->off[i] = ptr; +++ ptr += len + 1; +++ } +++ memcpy(ptr, str, str_len); +++ tmp_cols->off[i] = ptr; +++ +++ free(cols->off); +++ free(cols->rmme); +++ cols->rmme = tmp_cols->rmme; +++ cols->off = tmp_cols->off; +++ cols->n = cols->n+1; +++ cols->m = cols->n; +++ free(tmp_cols); +++ return; +++ } +++ cols->n++; +++ if ( cols->n > cols->m ) +++ { +++ cols->m++; +++ cols->off = (char**) realloc(cols->off,sizeof(*cols->off)*cols->m); +++ } +++ cols->off[cols->n-1] = str; +++} +++void cols_clear(cols_t *cols) +++{ +++ if ( !cols ) return; +++ free(cols->rmme); +++ free(cols->off); +++ cols->rmme = NULL; +++ cols->off = NULL; +++} +++void cols_destroy(cols_t *cols) +++{ +++ if ( !cols ) return; +++ cols_clear(cols); +++ free(cols); +++} 
+++ ++--- /dev/null +++++ python-pysam/bcftools/cols.c.pysam.c ++@@ -0,0 +1,111 @@ +++#include "bcftools.pysam.h" +++ +++/* +++ Copyright (C) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++ DEALINGS IN THE SOFTWARE. 
+++*/ +++ +++#include +++#include "cols.h" +++ +++cols_t *cols_split(const char *line, cols_t *cols, char delim) +++{ +++ if ( !cols ) cols = (cols_t*) calloc(1,sizeof(cols_t)); +++ if ( cols->rmme ) free(cols->rmme); +++ cols->n = 0; +++ cols->rmme = strdup(line); +++ char *ss = cols->rmme; +++ while (1) +++ { +++ char *se = ss; +++ while ( *se && *se!=delim ) se++; +++ char tmp = *se; +++ *se = 0; +++ cols->n++; +++ if ( cols->n > cols->m ) +++ { +++ cols->m += 10; +++ cols->off = (char**) realloc(cols->off, sizeof(*cols->off)*cols->m); +++ } +++ cols->off[ cols->n - 1 ] = ss; +++ if ( !tmp ) break; +++ ss = se + 1; +++ } +++ return cols; +++} +++ +++void cols_append(cols_t *cols, char *str) +++{ +++ if ( cols->rmme ) +++ { +++ size_t str_len = strlen(str); +++ size_t lst_len = strlen(cols->off[ cols->n - 1 ]); +++ size_t tot_len = 2 + str_len + lst_len + (cols->off[ cols->n - 1 ] - cols->rmme); +++ +++ cols_t *tmp_cols = (cols_t*)calloc(1,sizeof(cols_t)); +++ tmp_cols->rmme = (char*) calloc(tot_len,1); +++ tmp_cols->off = (char**) calloc(cols->n+1,sizeof(*tmp_cols->off)); +++ +++ char *ptr = tmp_cols->rmme; +++ int i; +++ for (i=0; in; i++) +++ { +++ size_t len = strlen(cols->off[i]); +++ memcpy(ptr, cols->off[i], len); +++ tmp_cols->off[i] = ptr; +++ ptr += len + 1; +++ } +++ memcpy(ptr, str, str_len); +++ tmp_cols->off[i] = ptr; +++ +++ free(cols->off); +++ free(cols->rmme); +++ cols->rmme = tmp_cols->rmme; +++ cols->off = tmp_cols->off; +++ cols->n = cols->n+1; +++ cols->m = cols->n; +++ free(tmp_cols); +++ return; +++ } +++ cols->n++; +++ if ( cols->n > cols->m ) +++ { +++ cols->m++; +++ cols->off = (char**) realloc(cols->off,sizeof(*cols->off)*cols->m); +++ } +++ cols->off[cols->n-1] = str; +++} +++void cols_clear(cols_t *cols) +++{ +++ if ( !cols ) return; +++ free(cols->rmme); +++ free(cols->off); +++ cols->rmme = NULL; +++ cols->off = NULL; +++} +++void cols_destroy(cols_t *cols) +++{ +++ if ( !cols ) return; +++ cols_clear(cols); +++ free(cols); +++} 
+++ ++--- /dev/null +++++ python-pysam/bcftools/cols.h ++@@ -0,0 +1,51 @@ +++/* +++ Copyright (C) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++ DEALINGS IN THE SOFTWARE. +++*/ +++ +++#ifndef __COLS_H__ +++#define __COLS_H__ +++ +++#include +++ +++typedef struct +++{ +++ int n,m; +++ char **off, *rmme; +++} +++cols_t; +++ +++/* +++ cols_split() can be called repeatedly to split new strings, memory is allocated +++ and deallocated automatically +++*/ +++cols_t *cols_split(const char *line, cols_t *cols, char delim); +++ +++/* +++ Although cols_append() can be combined with cols_split(), it is much slower and +++ the string must exist throughout the life of cols unless initialized with cols_split(). 
+++*/ +++void cols_append(cols_t *cols, char *str); +++void cols_clear(cols_t *cols); +++void cols_destroy(cols_t *cols); +++ +++#endif ++--- python-pysam.orig/bcftools/consensus.c +++++ python-pysam/bcftools/consensus.c ++@@ -50,6 +50,7 @@ ++ #define PICK_ALT 2 ++ #define PICK_LONG 4 ++ #define PICK_SHORT 8 +++#define PICK_IUPAC 16 ++ ++ typedef struct ++ { ++@@ -76,11 +77,12 @@ ++ int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) ++ char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778 ++ int prev_base_pos; // the position of prev_base +++ int prev_is_insert; ++ ++ rbuf_t vcf_rbuf; ++ bcf1_t **vcf_buf; ++ int nvcf_buf, rid; ++- char *chr; +++ char *chr, *chr_prefix; ++ ++ regidx_t *mask; ++ regitr_t *itr; ++@@ -98,7 +100,7 @@ ++ FILE *fp_out; ++ FILE *fp_chain; ++ char **argv; ++- int argc, output_iupac, haplotype, allele, isample; +++ int argc, output_iupac, haplotype, allele, isample, napplied; ++ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; ++ } ++ args_t; ++@@ -207,7 +209,7 @@ ++ { ++ args->files = bcf_sr_init(); ++ args->files->require_index = 1; ++- if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); ++ args->hdr = args->files->readers[0].header; ++ args->isample = -1; ++ if ( args->sample ) ++@@ -299,7 +301,7 @@ ++ args->vcf_rbuf.n = 0; ++ bcf_sr_seek(args->files,line,args->fa_ori_pos); ++ if ( tmp_ptr ) *tmp_ptr = tmp; ++- fprintf(args->fp_out,">%s\n",line); +++ fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); ++ if (args->chain_fname ) ++ { ++ args->chain = 
init_chain(args->chain, args->fa_ori_pos); ++@@ -331,7 +333,7 @@ ++ { ++ bcf1_t *rec = *rec_ptr; ++ if ( args->vcf_rbuf.n >= args->vcf_rbuf.m ) ++- error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ ++ // Insert the new record in the buffer. The line would be overwritten in ++ // the next bcf_sr_next_line call, therefore we need to swap it with an ++@@ -395,9 +397,18 @@ ++ if ( !fmt ) return; ++ ++ if ( fmt->type!=BCF_BT_INT8 ) ++- error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ uint8_t *ptr = fmt->p + fmt->size*args->isample; ++- if ( args->haplotype ) +++ +++ enum { use_hap, use_iupac, pick_one } action = use_hap; +++ if ( args->allele==PICK_IUPAC ) +++ { +++ if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; +++ } +++ else if ( args->output_iupac ) action = use_iupac; +++ else if ( !args->haplotype ) action = pick_one; +++ +++ if ( action==use_hap ) ++ { ++ if ( args->haplotype > fmt->n ) ++ { ++@@ -410,7 +421,7 @@ ++ { ++ if ( !warned_haplotype ) ++ { ++- fprintf(stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(stderr, "Can't apply %d-th haplotype at %s:%"PRId64". 
(This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned_haplotype = 1; ++ } ++ return; ++@@ -428,7 +439,7 @@ ++ ialt = bcf_gt_allele(ialt); ++ } ++ } ++- else if ( args->output_iupac ) +++ else if ( action==use_iupac ) ++ { ++ ialt = ptr[0]; ++ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) ++@@ -456,7 +467,7 @@ ++ ++ if ( ialt>=0 ) ++ { ++- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? ++ { ++ char ial = rec->d.allele[ialt][0]; ++@@ -488,7 +499,7 @@ ++ { ++ if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; ++ jalt = bcf_gt_allele(ptr[i]); ++- if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ if ( args->allele & (PICK_LONG|PICK_SHORT) ) ++ { ++ int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); ++@@ -510,7 +521,7 @@ ++ } ++ } ++ if ( !ialt ) return; // ref allele ++- if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) ++ { ++@@ -531,18 +542,29 @@ ++ ialt = 1; ++ } ++ ++- // Overlapping variant? 
Can be still OK iff this is an insertion ++- if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) ) +++ // Overlapping variant? +++ if ( rec->pos <= args->fa_frz_pos ) ++ { ++- fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); ++- return; +++ // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). +++ // This still may not be enough for more complicated cases with multiple duplicate positions +++ // and other types in between. In such case let the user normalize the VCF and remove duplicates. +++ int overlap = 0; +++ if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; +++ else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; +++ +++ if ( overlap ) +++ { +++ fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ return; +++ } +++ ++ } ++ ++ int len_diff = 0, alen = 0; ++ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; ++ if ( idx<0 ) ++ { ++- fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ return; ++ } ++ if ( rec->rlen > args->fa_buf.l - idx ) ++@@ -552,17 +574,17 @@ ++ if ( alen > rec->rlen ) ++ { ++ rec->d.allele[ialt][rec->rlen] = 0; ++- fprintf(stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( idx>=args->fa_buf.l ) ++- error("FIXME: %s:%d .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); +++ error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); ++ ++ // sanity check the reference base ++ if ( rec->d.allele[ialt][0]=='<' ) ++ { ++ if ( strcasecmp(rec->d.allele[ialt], "") ) ++- error("Symbolic alleles other than are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 ++ len_diff = 1-rec->rlen; ++ rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event ++@@ -570,7 +592,7 @@ ++ } ++ else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) ++ { ++- // This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA), +++ // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA), ++ // the reference base in fa_buf is lost and the check fails. 
We do not keep a buffer ++ // with the original sequence as it should not be necessary, we should encounter max ++ // one base overlap ++@@ -591,11 +613,11 @@ ++ args->fa_buf.s[idx+rec->rlen] = 0; ++ } ++ error( ++- "The fasta sequence does not match the REF allele at %s:%d:\n" ++- " .vcf: [%s]\n" +++ "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" +++ " .vcf: [%s] <- (REF)\n" ++ " .vcf: [%s] <- (ALT)\n" ++ " .fa: [%s]%c%s\n", ++- bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, +++ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, ++ tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ++ ); ++ } ++@@ -618,19 +640,31 @@ ++ // deletion or same size event ++ for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; +++ ++ if ( len_diff ) ++- { ++- args->prev_base = rec->d.allele[0][rec->rlen - 1]; ++- args->prev_base_pos = rec->pos + rec->rlen - 1; ++ memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); ++- } +++ +++ args->prev_base = rec->d.allele[0][rec->rlen - 1]; +++ args->prev_base_pos = rec->pos + rec->rlen - 1; +++ args->prev_is_insert = 0; ++ } ++ else ++ { +++ args->prev_is_insert = 1; +++ args->prev_base_pos = rec->pos; +++ ++ // insertion ++ ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); ++ memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); ++- for (i=0; id.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; +++ for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; ++ } ++ if (args->chain && len_diff != 0) ++@@ -650,6 +684,7 @@ ++ args->fa_buf.l += len_diff; ++ args->fa_mod_off += len_diff; ++ args->fa_frz_pos = rec->pos + rec->rlen - 1; +++ args->napplied++; ++ } ++ ++ ++@@ -755,6 +790,7 @@ ++ flush_fa_buffer(args, 0); ++ bgzf_close(fasta); ++ free(str.s); +++ 
fprintf(stderr,"Applied %d variants\n", args->napplied); ++ } ++ ++ static void usage(args_t *args) ++@@ -772,17 +808,19 @@ ++ fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); ++ fprintf(stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); ++ fprintf(stderr, " the codes are case-insensitive:\n"); ++- fprintf(stderr, " 1: first allele from GT\n"); ++- fprintf(stderr, " 2: second allele\n"); +++ fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); +++ fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); ++ fprintf(stderr, " R: REF allele in het genotypes\n"); ++ fprintf(stderr, " A: ALT allele\n"); ++ fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); ++ fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); +++ fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); ++ fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); ++ fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); ++ fprintf(stderr, " -m, --mask replace regions with N\n"); ++ fprintf(stderr, " -M, --missing output instead of skipping the missing genotypes\n"); ++ fprintf(stderr, " -o, --output write output to a file [standard output]\n"); +++ fprintf(stderr, " -p, --prefix prefix to add to output sequence names\n"); ++ fprintf(stderr, " -s, --sample apply variants of the given sample\n"); ++ fprintf(stderr, "Examples:\n"); ++ fprintf(stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); ++@@ -809,13 +847,15 @@ ++ {"mask",1,0,'m'}, ++ {"missing",1,0,'M'}, ++ {"chain",1,0,'c'}, +++ {"prefix",required_argument,0,'p'}, ++ {0,0,0,0} ++ }; ++ int c; ++- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { +++ case 'p': args->chr_prefix = optarg; break; ++ case 's': args->sample = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'I': args->output_iupac = 1; break; ++@@ -837,10 +877,14 @@ ++ else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; ++ else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; ++ else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; +++ else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; +++ else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; ++ else ++ { ++- args->haplotype = optarg[0] - '0'; ++- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); +++ char *tmp; +++ args->haplotype = strtol(optarg, &tmp, 10); +++ if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); +++ if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); ++ } ++ break; ++ default: usage(args); break; ++--- python-pysam.orig/bcftools/consensus.c.pysam.c +++++ python-pysam/bcftools/consensus.c.pysam.c ++@@ -52,6 +52,7 @@ ++ #define PICK_ALT 2 ++ #define PICK_LONG 4 ++ #define PICK_SHORT 8 +++#define PICK_IUPAC 16 ++ ++ typedef struct ++ { ++@@ -78,11 +79,12 @@ ++ int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) ++ char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 
600#issuecomment-383186778 ++ int prev_base_pos; // the position of prev_base +++ int prev_is_insert; ++ ++ rbuf_t vcf_rbuf; ++ bcf1_t **vcf_buf; ++ int nvcf_buf, rid; ++- char *chr; +++ char *chr, *chr_prefix; ++ ++ regidx_t *mask; ++ regitr_t *itr; ++@@ -100,7 +102,7 @@ ++ FILE *fp_out; ++ FILE *fp_chain; ++ char **argv; ++- int argc, output_iupac, haplotype, allele, isample; +++ int argc, output_iupac, haplotype, allele, isample, napplied; ++ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; ++ } ++ args_t; ++@@ -209,7 +211,7 @@ ++ { ++ args->files = bcf_sr_init(); ++ args->files->require_index = 1; ++- if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); ++ args->hdr = args->files->readers[0].header; ++ args->isample = -1; ++ if ( args->sample ) ++@@ -301,7 +303,7 @@ ++ args->vcf_rbuf.n = 0; ++ bcf_sr_seek(args->files,line,args->fa_ori_pos); ++ if ( tmp_ptr ) *tmp_ptr = tmp; ++- fprintf(args->fp_out,">%s\n",line); +++ fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); ++ if (args->chain_fname ) ++ { ++ args->chain = init_chain(args->chain, args->fa_ori_pos); ++@@ -333,7 +335,7 @@ ++ { ++ bcf1_t *rec = *rec_ptr; ++ if ( args->vcf_rbuf.n >= args->vcf_rbuf.m ) ++- error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ ++ // Insert the new record in the buffer. 
The line would be overwritten in ++ // the next bcf_sr_next_line call, therefore we need to swap it with an ++@@ -397,9 +399,18 @@ ++ if ( !fmt ) return; ++ ++ if ( fmt->type!=BCF_BT_INT8 ) ++- error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ uint8_t *ptr = fmt->p + fmt->size*args->isample; ++- if ( args->haplotype ) +++ +++ enum { use_hap, use_iupac, pick_one } action = use_hap; +++ if ( args->allele==PICK_IUPAC ) +++ { +++ if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; +++ } +++ else if ( args->output_iupac ) action = use_iupac; +++ else if ( !args->haplotype ) action = pick_one; +++ +++ if ( action==use_hap ) ++ { ++ if ( args->haplotype > fmt->n ) ++ { ++@@ -412,7 +423,7 @@ ++ { ++ if ( !warned_haplotype ) ++ { ++- fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%"PRId64". 
(This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned_haplotype = 1; ++ } ++ return; ++@@ -430,7 +441,7 @@ ++ ialt = bcf_gt_allele(ialt); ++ } ++ } ++- else if ( args->output_iupac ) +++ else if ( action==use_iupac ) ++ { ++ ialt = ptr[0]; ++ if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) ++@@ -458,7 +469,7 @@ ++ ++ if ( ialt>=0 ) ++ { ++- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? ++ { ++ char ial = rec->d.allele[ialt][0]; ++@@ -490,7 +501,7 @@ ++ { ++ if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; ++ jalt = bcf_gt_allele(ptr[i]); ++- if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ if ( args->allele & (PICK_LONG|PICK_SHORT) ) ++ { ++ int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); ++@@ -512,7 +523,7 @@ ++ } ++ } ++ if ( !ialt ) return; // ref allele ++- if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) ++ { ++@@ -533,18 +544,29 @@ ++ ialt = 1; ++ } ++ ++- // Overlapping variant? 
Can be still OK iff this is an insertion ++- if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) ) +++ // Overlapping variant? +++ if ( rec->pos <= args->fa_frz_pos ) ++ { ++- fprintf(bcftools_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); ++- return; +++ // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). +++ // This still may not be enough for more complicated cases with multiple duplicate positions +++ // and other types in between. In such case let the user normalize the VCF and remove duplicates. +++ int overlap = 0; +++ if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; +++ else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; +++ +++ if ( overlap ) +++ { +++ fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ return; +++ } +++ ++ } ++ ++ int len_diff = 0, alen = 0; ++ int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; ++ if ( idx<0 ) ++ { ++- fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ return; ++ } ++ if ( rec->rlen > args->fa_buf.l - idx ) ++@@ -554,17 +576,17 @@ ++ if ( alen > rec->rlen ) ++ { ++ rec->d.allele[ialt][rec->rlen] = 0; ++- fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( idx>=args->fa_buf.l ) ++- error("FIXME: %s:%d .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); +++ error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); ++ ++ // sanity check the reference base ++ if ( rec->d.allele[ialt][0]=='<' ) ++ { ++ if ( strcasecmp(rec->d.allele[ialt], "") ) ++- error("Symbolic alleles other than are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 ++ len_diff = 1-rec->rlen; ++ rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event ++@@ -572,7 +594,7 @@ ++ } ++ else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) ++ { ++- // This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA), +++ // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA), ++ // the reference base in fa_buf is lost and the check fails. 
We do not keep a buffer ++ // with the original sequence as it should not be necessary, we should encounter max ++ // one base overlap ++@@ -593,11 +615,11 @@ ++ args->fa_buf.s[idx+rec->rlen] = 0; ++ } ++ error( ++- "The fasta sequence does not match the REF allele at %s:%d:\n" ++- " .vcf: [%s]\n" +++ "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" +++ " .vcf: [%s] <- (REF)\n" ++ " .vcf: [%s] <- (ALT)\n" ++ " .fa: [%s]%c%s\n", ++- bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, +++ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, ++ tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ++ ); ++ } ++@@ -620,19 +642,31 @@ ++ // deletion or same size event ++ for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; +++ ++ if ( len_diff ) ++- { ++- args->prev_base = rec->d.allele[0][rec->rlen - 1]; ++- args->prev_base_pos = rec->pos + rec->rlen - 1; ++ memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); ++- } +++ +++ args->prev_base = rec->d.allele[0][rec->rlen - 1]; +++ args->prev_base_pos = rec->pos + rec->rlen - 1; +++ args->prev_is_insert = 0; ++ } ++ else ++ { +++ args->prev_is_insert = 1; +++ args->prev_base_pos = rec->pos; +++ ++ // insertion ++ ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); ++ memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); ++- for (i=0; id.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; +++ for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; ++ } ++ if (args->chain && len_diff != 0) ++@@ -652,6 +686,7 @@ ++ args->fa_buf.l += len_diff; ++ args->fa_mod_off += len_diff; ++ args->fa_frz_pos = rec->pos + rec->rlen - 1; +++ args->napplied++; ++ } ++ ++ ++@@ -757,6 +792,7 @@ ++ flush_fa_buffer(args, 0); ++ bgzf_close(fasta); ++ free(str.s); +++ 
fprintf(bcftools_stderr,"Applied %d variants\n", args->napplied); ++ } ++ ++ static void usage(args_t *args) ++@@ -774,17 +810,19 @@ ++ fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); ++ fprintf(bcftools_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); ++ fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); ++- fprintf(bcftools_stderr, " 1: first allele from GT\n"); ++- fprintf(bcftools_stderr, " 2: second allele\n"); +++ fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n"); +++ fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n"); ++ fprintf(bcftools_stderr, " R: REF allele in het genotypes\n"); ++ fprintf(bcftools_stderr, " A: ALT allele\n"); ++ fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); ++ fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); +++ fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); ++ fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); ++ fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); ++ fprintf(bcftools_stderr, " -m, --mask replace regions with N\n"); ++ fprintf(bcftools_stderr, " -M, --missing output instead of skipping the missing genotypes\n"); ++ fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); +++ fprintf(bcftools_stderr, " -p, --prefix prefix to add to output sequence names\n"); ++ fprintf(bcftools_stderr, " -s, --sample apply variants of the given sample\n"); ++ fprintf(bcftools_stderr, "Examples:\n"); ++ fprintf(bcftools_stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); ++@@ -811,13 +849,15 @@ ++ {"mask",1,0,'m'}, ++ {"missing",1,0,'M'}, ++ {"chain",1,0,'c'}, +++ {"prefix",required_argument,0,'p'}, ++ {0,0,0,0} ++ }; ++ int c; ++- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { +++ case 'p': args->chr_prefix = optarg; break; ++ case 's': args->sample = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'I': args->output_iupac = 1; break; ++@@ -839,10 +879,14 @@ ++ else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; ++ else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; ++ else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; +++ else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; +++ else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; ++ else ++ { ++- args->haplotype = optarg[0] - '0'; ++- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); +++ char *tmp; +++ args->haplotype = strtol(optarg, &tmp, 10); +++ if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); +++ if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); ++ } ++ break; ++ default: usage(args); break; ++--- python-pysam.orig/bcftools/convert.c +++++ python-pysam/bcftools/convert.c ++@@ -30,12 +30,15 @@ ++ #include ++ #include ++ #include +++#define __STDC_FORMAT_MACROS ++ #include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" +++#include "variantkey.h" ++ #include "convert.h" ++ ++ #define T_CHROM 1 ++@@ -67,6 +70,9 @@ ++ #define T_END 27 ++ #define T_POS0 28 ++ #define T_END0 29 +++#define T_RSX 30 // RSID HEX +++#define T_VKX 31 // VARIANTKEY HEX +++#define T_PBINOM 
32 ++ ++ typedef struct _fmt_t ++ { ++@@ -196,13 +202,44 @@ ++ } ++ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { +++ int i; +++ if ( !fmt->key ) // the whole INFO column +++ { +++ int first = 1; +++ for (i=0; in_info; i++) +++ { +++ bcf_info_t *inf = &line->d.info[i]; +++ if ( !inf->vptr ) continue; +++ if ( !first ) kputc(';', str); +++ first = 0; +++ if ( inf->key >= convert->header->n[BCF_DT_ID] ) continue; +++ kputs(convert->header->id[BCF_DT_ID][inf->key].key, str); +++ if ( inf->len <= 0 ) continue; +++ kputc('=', str); +++ if ( inf->len == 1 ) +++ { +++ switch (inf->type) +++ { +++ case BCF_BT_INT8: if ( inf->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case BCF_BT_INT16: if ( inf->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case BCF_BT_INT32: if ( inf->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case BCF_BT_FLOAT: if ( bcf_float_is_missing(inf->v1.f) ) kputc('.', str); else kputd(inf->v1.f, str); break; +++ case BCF_BT_CHAR: kputc(inf->v1.i, str); break; +++ default: error("Unexpected type %d", inf->type); break; +++ } +++ } +++ else bcf_fmt_array(str, inf->len, inf->type, inf->vptr); +++ } +++ if ( first ) kputc('.', str); +++ return; +++ } +++ ++ if ( fmt->id<0 ) ++ { ++ kputc('.', str); ++ return; ++ } ++ ++- int i; ++ for (i=0; in_info; i++) ++ if ( line->d.info[i].key == fmt->id ) break; ++ ++@@ -276,6 +313,50 @@ ++ ++ fmt->ready = 1; ++ } +++static void process_complete_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ if ( convert->nsamples ) +++ { +++ int i,j; +++ if ( line->n_fmt) +++ { +++ int gt_i = -1; +++ bcf_fmt_t *fmt = line->d.fmt; +++ int first = 1; +++ for (i=0; i<(int)line->n_fmt; i++) +++ { +++ if ( !fmt[i].p || fmt[i].id<0 ) continue; +++ if ( !first ) kputc(':', str); +++ first = 0; +++ 
kputs(convert->header->id[BCF_DT_ID][fmt[i].id].key, str); +++ if ( strcmp(convert->header->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; +++ } +++ if ( first ) kputc('.', str); +++ for (j=0; jnsamples; j++) +++ { +++ kputc('\t', str); +++ first = 1; +++ for (i=0; i<(int)line->n_fmt; i++) +++ { +++ bcf_fmt_t *f = &fmt[i]; +++ if ( !f->p ) continue; +++ if ( !first ) kputc(':', str); +++ first = 0; +++ if (gt_i == i) +++ bcf_format_gt(f,convert->samples[j],str); +++ else +++ bcf_fmt_array(str, f->n, f->type, f->p + convert->samples[j] * f->size); +++ } +++ if ( first ) kputc('.', str); +++ } +++ } +++ else +++ for (j=0; j<=line->n_sample; j++) +++ kputs("\t.", str); +++ } +++ else +++ kputc('.',str); +++} ++ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { ++ if ( !fmt->ready ) ++@@ -555,6 +636,7 @@ ++ if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; } ++ if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } ++ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } +++ if ( line_type & VCF_OVERLAP ) { if (i) kputc(',',str); kputs("OVERLAP", str); i++; } ++ } ++ static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { ++@@ -590,7 +672,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing GT tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= convert->nsamples; ++@@ -641,7 +723,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing PL tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= 
convert->nsamples; ++@@ -690,7 +772,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing GP tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= convert->nsamples; ++@@ -702,7 +784,7 @@ ++ { ++ if ( ptr[j]==bcf_int32_vector_end ) break; ++ if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; } ++- if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),line->pos+1,ptr[j]); +++ if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); ++ sum+=ptr[j]; ++ } ++ if ( j==line->n_allele ) ++@@ -745,24 +827,24 @@ ++ ++ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); ++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } ++ if ( !fmt_gt ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ // Alloc all memory in advance to avoid kput routines. 
The biggest allowed allele index is 99 ++ if ( line->n_allele > 100 ) ++- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) ++- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); +++ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); ++ ++ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid ++- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( fmt_gt->n!=1 && fmt_gt->n!=2 ) ++- error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, ploidy of %d not supported, see %s:%"PRId64"\n", fmt_gt->n, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; ++ for (i=0; insamples; i++) ++@@ -899,22 +981,22 @@ ++ ++ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); ++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } ++ if ( !fmt_gt ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), 
line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 ++ if ( line->n_allele > 100 ) ++- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) ++- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); +++ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); ++ ++ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid ++- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; ++ for (i=0; insamples; i++) ++@@ -1020,6 +1102,91 @@ ++ str->s[--str->l] = 0; // delete the last space ++ } ++ +++static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ char *ptr = line->d.id; +++ ptr += 2; // remove 'rs' +++ ksprintf(str, "%08" PRIx32 "", (uint32_t)strtoul(ptr, NULL, 10)); +++} +++ +++static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ uint64_t vk = variantkey( +++ convert->header->id[BCF_DT_CTG][line->rid].key, +++ strlen(convert->header->id[BCF_DT_CTG][line->rid].key), +++ line->pos, +++ line->d.allele[0], +++ strlen(line->d.allele[0]), +++ line->d.allele[1], +++ strlen(line->d.allele[1])); +++ ksprintf(str, "%016" PRIx64 "", vk); +++} +++ +++static void 
process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ int i; +++ if ( !fmt->ready ) +++ { +++ fmt->fmt = NULL; // AD +++ fmt->usr = NULL; // GT +++ +++ for (i=0; i<(int)line->n_fmt; i++) +++ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; } +++ +++ // Check that the first field is GT +++ int gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); +++ if ( !bcf_hdr_idinfo_exists(convert->header, BCF_HL_FMT, fmt->id) ) error("Error: FORMAT/GT is not defined in the header\n"); +++ for (i=0; i<(int)line->n_fmt; i++) +++ if ( line->d.fmt[i].id==gt_id ) { fmt->usr = &line->d.fmt[i]; break; } // it should always be first according to VCF spec, but... +++ +++ if ( fmt->usr && line->d.fmt[i].type!=BCF_BT_INT8 ) // skip sites with many alleles +++ fmt->usr = NULL; +++ +++ fmt->ready = 1; +++ } +++ bcf_fmt_t *gt_fmt = (bcf_fmt_t*) fmt->usr; +++ if ( !fmt->fmt || !gt_fmt || gt_fmt->n!=2 ) goto invalid; +++ +++ int n[2] = {0,0}; +++ int8_t *gt = (int8_t*)(gt_fmt->p + isample*gt_fmt->size); +++ for (i=0; i<2; i++) +++ { +++ if ( bcf_gt_is_missing(gt[i]) || gt[i] == bcf_int8_vector_end ) goto invalid; +++ int al = bcf_gt_allele(gt[i]); +++ if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid; +++ +++ #define BRANCH(type_t, missing, vector_end) { \ +++ type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \ +++ if ( val==missing || val==vector_end ) goto invalid; \ +++ else n[i] = val; \ +++ } +++ switch (fmt->fmt->type) +++ { +++ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: goto invalid; break; +++ } +++ #undef BRANCH +++ } +++ +++ if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); +++ else +++ { +++ double pval = n[0] < n[1] ? 
kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); +++ pval *= 2; +++ assert( pval-1 < 1e-10 ); +++ if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. kf_betai(1,0,0.5) +++ else +++ pval = -4.34294481903*log(pval); +++ kputd(pval, str); +++ } +++ return; +++ +++invalid: +++ kputc('.', str); +++} +++ ++ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) ++ { ++ convert->nfmt++; ++@@ -1054,11 +1221,14 @@ ++ else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; } ++ else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } ++ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } ++- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) ++- { ++- fmt->type = T_INFO; ++- fprintf(stderr,"Warning: Assuming INFO/%s\n", key); ++- } +++ else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } +++ else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } +++ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } +++ } +++ if ( fmt->type==T_PBINOM ) +++ { +++ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); +++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); ++ } ++ } ++ ++@@ -1072,15 +1242,15 @@ ++ case T_CHROM: fmt->handler = &process_chrom; break; ++ case T_POS: fmt->handler = &process_pos; break; ++ case T_POS0: fmt->handler = &process_pos0; break; ++- case T_END: fmt->handler = &process_end; break; ++- case T_END0: fmt->handler = &process_end0; break; +++ case T_END: fmt->handler = &process_end; convert->max_unpack |= BCF_UN_INFO; break; +++ case T_END0: fmt->handler = &process_end0; convert->max_unpack |= BCF_UN_INFO; break; ++ case T_ID: fmt->handler = &process_id; break; ++ case T_REF: fmt->handler = &process_ref; break; ++ case T_ALT: fmt->handler = &process_alt; break; ++ case T_QUAL: fmt->handler = &process_qual; break; ++ 
case T_FILTER: fmt->handler = &process_filter; convert->max_unpack |= BCF_UN_FLT; break; ++ case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break; ++- case T_FORMAT: fmt->handler = &process_format; convert->max_unpack |= BCF_UN_FMT; break; +++ case T_FORMAT: fmt->handler = fmt->key ? &process_format : &process_complete_format; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_SAMPLE: fmt->handler = &process_sample; break; ++ case T_SEP: fmt->handler = &process_sep; break; ++ case T_IS_TS: fmt->handler = &process_is_ts; break; ++@@ -1093,6 +1263,9 @@ ++ case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; +++ case T_RSX: fmt->handler = &process_rsid_hex; break; +++ case T_VKX: fmt->handler = &process_variantkey_hex; break; +++ case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; ++ default: error("TODO: handler for type %d\n", fmt->type); ++ } ++ if ( key && fmt->type==T_INFO ) ++@@ -1144,7 +1317,14 @@ ++ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); ++ else if ( !strcmp(str.s, "INFO") ) ++ { ++- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); +++ if ( *q!='/' ) +++ { +++ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, str.s); +++ if ( bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) +++ error("Could not parse format string \"%s\". 
Did you mean %%INFO/%s?\n", convert->format_str,str.s); +++ else +++ error("Could not parse format string: %s\n", convert->format_str); +++ } ++ p = ++q; ++ str.l = 0; ++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; ++@@ -1153,6 +1333,17 @@ ++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++ fmt->subscript = parse_subscript(&q); ++ } +++ else if ( !strcmp(str.s,"PBINOM") ) +++ { +++ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); +++ p = ++q; +++ str.l = 0; +++ while ( *q && *q!=')' ) q++; +++ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); +++ kputsn(p, q-p, &str); +++ register_tag(convert, T_PBINOM, str.s, is_gtf); +++ q++; +++ } ++ else ++ { ++ fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); ++@@ -1187,17 +1378,26 @@ ++ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); ++ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); ++ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); +++ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); +++ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); +++ else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. 
(todo)\n"); ++ else if ( !strcmp(str.s, "INFO") ) ++ { ++- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); ++- p = ++q; ++- str.l = 0; ++- while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; ++- if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); ++- kputsn(p, q-p, &str); ++- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++- fmt->subscript = parse_subscript(&q); +++ if ( *q=='/' ) +++ { +++ p = ++q; +++ str.l = 0; +++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; +++ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); +++ kputsn(p, q-p, &str); +++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); +++ fmt->subscript = parse_subscript(&q); +++ } +++ else +++ register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO ++ } +++ else if ( !strcmp(str.s, "FORMAT") ) +++ register_tag(convert, T_FORMAT, NULL, 0); ++ else ++ { ++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++@@ -1336,7 +1536,15 @@ ++ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) ++ { ++ if ( !convert->allow_undef_tags && convert->undef_info_tag ) ++- error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); +++ { +++ kstring_t msg = {0,0,0}; +++ ksprintf(&msg,"Error: no such tag defined in the VCF header: INFO/%s", convert->undef_info_tag); +++ +++ int hdr_id = bcf_hdr_id2int(convert->header,BCF_DT_ID,convert->undef_info_tag); +++ if ( hdr_id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,hdr_id) ) +++ ksprintf(&msg,". FORMAT fields must be enclosed in square brackets, e.g. 
\"[ %%%s]\"", convert->undef_info_tag); +++ error("%s\n", msg.s); +++ } ++ ++ int l_ori = str->l; ++ bcf_unpack(line, convert->max_unpack); ++@@ -1357,7 +1565,7 @@ ++ for (js=0; jsnsamples; js++) ++ { ++ // Skip samples when filtering was requested ++- if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; +++ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; ++ ++ // Here comes a hack designed for TBCSQ. When running on large files, ++ // such as 1000GP, there are too many empty fields in the output and ++--- python-pysam.orig/bcftools/convert.c.pysam.c +++++ python-pysam/bcftools/convert.c.pysam.c ++@@ -32,12 +32,15 @@ ++ #include ++ #include ++ #include +++#define __STDC_FORMAT_MACROS ++ #include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" +++#include "variantkey.h" ++ #include "convert.h" ++ ++ #define T_CHROM 1 ++@@ -69,6 +72,9 @@ ++ #define T_END 27 ++ #define T_POS0 28 ++ #define T_END0 29 +++#define T_RSX 30 // RSID HEX +++#define T_VKX 31 // VARIANTKEY HEX +++#define T_PBINOM 32 ++ ++ typedef struct _fmt_t ++ { ++@@ -198,13 +204,44 @@ ++ } ++ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { +++ int i; +++ if ( !fmt->key ) // the whole INFO column +++ { +++ int first = 1; +++ for (i=0; in_info; i++) +++ { +++ bcf_info_t *inf = &line->d.info[i]; +++ if ( !inf->vptr ) continue; +++ if ( !first ) kputc(';', str); +++ first = 0; +++ if ( inf->key >= convert->header->n[BCF_DT_ID] ) continue; +++ kputs(convert->header->id[BCF_DT_ID][inf->key].key, str); +++ if ( inf->len <= 0 ) continue; +++ kputc('=', str); +++ if ( inf->len == 1 ) +++ { +++ switch (inf->type) +++ { +++ case BCF_BT_INT8: if ( inf->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case BCF_BT_INT16: if ( inf->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case 
BCF_BT_INT32: if ( inf->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; +++ case BCF_BT_FLOAT: if ( bcf_float_is_missing(inf->v1.f) ) kputc('.', str); else kputd(inf->v1.f, str); break; +++ case BCF_BT_CHAR: kputc(inf->v1.i, str); break; +++ default: error("Unexpected type %d", inf->type); break; +++ } +++ } +++ else bcf_fmt_array(str, inf->len, inf->type, inf->vptr); +++ } +++ if ( first ) kputc('.', str); +++ return; +++ } +++ ++ if ( fmt->id<0 ) ++ { ++ kputc('.', str); ++ return; ++ } ++ ++- int i; ++ for (i=0; in_info; i++) ++ if ( line->d.info[i].key == fmt->id ) break; ++ ++@@ -278,6 +315,50 @@ ++ ++ fmt->ready = 1; ++ } +++static void process_complete_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ if ( convert->nsamples ) +++ { +++ int i,j; +++ if ( line->n_fmt) +++ { +++ int gt_i = -1; +++ bcf_fmt_t *fmt = line->d.fmt; +++ int first = 1; +++ for (i=0; i<(int)line->n_fmt; i++) +++ { +++ if ( !fmt[i].p || fmt[i].id<0 ) continue; +++ if ( !first ) kputc(':', str); +++ first = 0; +++ kputs(convert->header->id[BCF_DT_ID][fmt[i].id].key, str); +++ if ( strcmp(convert->header->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; +++ } +++ if ( first ) kputc('.', str); +++ for (j=0; jnsamples; j++) +++ { +++ kputc('\t', str); +++ first = 1; +++ for (i=0; i<(int)line->n_fmt; i++) +++ { +++ bcf_fmt_t *f = &fmt[i]; +++ if ( !f->p ) continue; +++ if ( !first ) kputc(':', str); +++ first = 0; +++ if (gt_i == i) +++ bcf_format_gt(f,convert->samples[j],str); +++ else +++ bcf_fmt_array(str, f->n, f->type, f->p + convert->samples[j] * f->size); +++ } +++ if ( first ) kputc('.', str); +++ } +++ } +++ else +++ for (j=0; j<=line->n_sample; j++) +++ kputs("\t.", str); +++ } +++ else +++ kputc('.',str); +++} ++ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { ++ if ( !fmt->ready ) ++@@ -557,6 +638,7 @@ ++ if ( line_type & VCF_INDEL ) { if (i) 
kputc(',',str); kputs("INDEL", str); i++; } ++ if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } ++ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } +++ if ( line_type & VCF_OVERLAP ) { if (i) kputc(',',str); kputs("OVERLAP", str); i++; } ++ } ++ static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) ++ { ++@@ -592,7 +674,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing GT tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= convert->nsamples; ++@@ -643,7 +725,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing PL tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= convert->nsamples; ++@@ -692,7 +774,7 @@ ++ // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); ++ // return; ++ ++- error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); +++ error("Error parsing GP tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); ++ } ++ ++ n /= convert->nsamples; ++@@ -704,7 +786,7 @@ ++ { ++ if ( ptr[j]==bcf_int32_vector_end ) break; ++ if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; } ++- if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),line->pos+1,ptr[j]); +++ if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", 
bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); ++ sum+=ptr[j]; ++ } ++ if ( j==line->n_allele ) ++@@ -747,24 +829,24 @@ ++ ++ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); ++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } ++ if ( !fmt_gt ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 ++ if ( line->n_allele > 100 ) ++- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) ++- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); +++ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); ++ ++ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid ++- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( fmt_gt->n!=1 && fmt_gt->n!=2 ) ++- error("Uh, ploidy of %d 
not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, ploidy of %d not supported, see %s:%"PRId64"\n", fmt_gt->n, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; ++ for (i=0; insamples; i++) ++@@ -901,22 +983,22 @@ ++ ++ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); ++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } ++ if ( !fmt_gt ) ++- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); +++ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ // Alloc all memory in advance to avoid kput routines. 
The biggest allowed allele index is 99 ++ if ( line->n_allele > 100 ) ++- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) ++- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); +++ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); ++ ++ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid ++- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); +++ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); ++ ++ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; ++ for (i=0; insamples; i++) ++@@ -1022,6 +1104,91 @@ ++ str->s[--str->l] = 0; // delete the last space ++ } ++ +++static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ char *ptr = line->d.id; +++ ptr += 2; // remove 'rs' +++ ksprintf(str, "%08" PRIx32 "", (uint32_t)strtoul(ptr, NULL, 10)); +++} +++ +++static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ uint64_t vk = variantkey( +++ convert->header->id[BCF_DT_CTG][line->rid].key, +++ strlen(convert->header->id[BCF_DT_CTG][line->rid].key), +++ line->pos, +++ line->d.allele[0], +++ strlen(line->d.allele[0]), +++ line->d.allele[1], +++ strlen(line->d.allele[1])); +++ ksprintf(str, "%016" PRIx64 "", vk); +++} +++ +++static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) +++{ +++ int i; +++ if ( !fmt->ready ) +++ { +++ fmt->fmt = NULL; // AD +++ fmt->usr = NULL; // GT 
+++ +++ for (i=0; i<(int)line->n_fmt; i++) +++ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; } +++ +++ // Check that the first field is GT +++ int gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); +++ if ( !bcf_hdr_idinfo_exists(convert->header, BCF_HL_FMT, fmt->id) ) error("Error: FORMAT/GT is not defined in the header\n"); +++ for (i=0; i<(int)line->n_fmt; i++) +++ if ( line->d.fmt[i].id==gt_id ) { fmt->usr = &line->d.fmt[i]; break; } // it should always be first according to VCF spec, but... +++ +++ if ( fmt->usr && line->d.fmt[i].type!=BCF_BT_INT8 ) // skip sites with many alleles +++ fmt->usr = NULL; +++ +++ fmt->ready = 1; +++ } +++ bcf_fmt_t *gt_fmt = (bcf_fmt_t*) fmt->usr; +++ if ( !fmt->fmt || !gt_fmt || gt_fmt->n!=2 ) goto invalid; +++ +++ int n[2] = {0,0}; +++ int8_t *gt = (int8_t*)(gt_fmt->p + isample*gt_fmt->size); +++ for (i=0; i<2; i++) +++ { +++ if ( bcf_gt_is_missing(gt[i]) || gt[i] == bcf_int8_vector_end ) goto invalid; +++ int al = bcf_gt_allele(gt[i]); +++ if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid; +++ +++ #define BRANCH(type_t, missing, vector_end) { \ +++ type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \ +++ if ( val==missing || val==vector_end ) goto invalid; \ +++ else n[i] = val; \ +++ } +++ switch (fmt->fmt->type) +++ { +++ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: goto invalid; break; +++ } +++ #undef BRANCH +++ } +++ +++ if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); +++ else +++ { +++ double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); +++ pval *= 2; +++ assert( pval-1 < 1e-10 ); +++ if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) +++ else +++ pval = -4.34294481903*log(pval); +++ kputd(pval, str); +++ } +++ return; +++ +++invalid: +++ kputc('.', str); +++} +++ ++ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) ++ { ++ convert->nfmt++; ++@@ -1056,11 +1223,14 @@ ++ else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; } ++ else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } ++ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } ++- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) ++- { ++- fmt->type = T_INFO; ++- fprintf(bcftools_stderr,"Warning: Assuming INFO/%s\n", key); ++- } +++ else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } +++ else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } +++ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } +++ } +++ if ( fmt->type==T_PBINOM ) +++ { +++ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); +++ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); ++ } ++ } ++ ++@@ -1074,15 +1244,15 @@ ++ case T_CHROM: fmt->handler = &process_chrom; break; ++ case T_POS: fmt->handler = &process_pos; break; ++ case T_POS0: fmt->handler = &process_pos0; break; ++- case T_END: fmt->handler = &process_end; break; ++- case T_END0: fmt->handler = &process_end0; break; +++ case T_END: fmt->handler = &process_end; convert->max_unpack |= BCF_UN_INFO; break; +++ case T_END0: fmt->handler = &process_end0; convert->max_unpack |= BCF_UN_INFO; break; ++ case T_ID: fmt->handler = &process_id; break; ++ case T_REF: fmt->handler = &process_ref; break; ++ case T_ALT: fmt->handler = &process_alt; break; ++ case T_QUAL: fmt->handler = &process_qual; break; ++ case T_FILTER: fmt->handler = &process_filter; convert->max_unpack |= BCF_UN_FLT; break; ++ case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break; 
++- case T_FORMAT: fmt->handler = &process_format; convert->max_unpack |= BCF_UN_FMT; break; +++ case T_FORMAT: fmt->handler = fmt->key ? &process_format : &process_complete_format; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_SAMPLE: fmt->handler = &process_sample; break; ++ case T_SEP: fmt->handler = &process_sep; break; ++ case T_IS_TS: fmt->handler = &process_is_ts; break; ++@@ -1095,6 +1265,9 @@ ++ case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; ++ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; +++ case T_RSX: fmt->handler = &process_rsid_hex; break; +++ case T_VKX: fmt->handler = &process_variantkey_hex; break; +++ case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; ++ default: error("TODO: handler for type %d\n", fmt->type); ++ } ++ if ( key && fmt->type==T_INFO ) ++@@ -1146,7 +1319,14 @@ ++ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); ++ else if ( !strcmp(str.s, "INFO") ) ++ { ++- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); +++ if ( *q!='/' ) +++ { +++ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, str.s); +++ if ( bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) +++ error("Could not parse format string \"%s\". 
Did you mean %%INFO/%s?\n", convert->format_str,str.s); +++ else +++ error("Could not parse format string: %s\n", convert->format_str); +++ } ++ p = ++q; ++ str.l = 0; ++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; ++@@ -1155,6 +1335,17 @@ ++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++ fmt->subscript = parse_subscript(&q); ++ } +++ else if ( !strcmp(str.s,"PBINOM") ) +++ { +++ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); +++ p = ++q; +++ str.l = 0; +++ while ( *q && *q!=')' ) q++; +++ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); +++ kputsn(p, q-p, &str); +++ register_tag(convert, T_PBINOM, str.s, is_gtf); +++ q++; +++ } ++ else ++ { ++ fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); ++@@ -1189,17 +1380,26 @@ ++ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); ++ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); ++ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); +++ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); +++ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); +++ else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. 
(todo)\n"); ++ else if ( !strcmp(str.s, "INFO") ) ++ { ++- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); ++- p = ++q; ++- str.l = 0; ++- while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; ++- if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); ++- kputsn(p, q-p, &str); ++- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++- fmt->subscript = parse_subscript(&q); +++ if ( *q=='/' ) +++ { +++ p = ++q; +++ str.l = 0; +++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; +++ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); +++ kputsn(p, q-p, &str); +++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); +++ fmt->subscript = parse_subscript(&q); +++ } +++ else +++ register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO ++ } +++ else if ( !strcmp(str.s, "FORMAT") ) +++ register_tag(convert, T_FORMAT, NULL, 0); ++ else ++ { ++ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); ++@@ -1338,7 +1538,15 @@ ++ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) ++ { ++ if ( !convert->allow_undef_tags && convert->undef_info_tag ) ++- error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); +++ { +++ kstring_t msg = {0,0,0}; +++ ksprintf(&msg,"Error: no such tag defined in the VCF header: INFO/%s", convert->undef_info_tag); +++ +++ int hdr_id = bcf_hdr_id2int(convert->header,BCF_DT_ID,convert->undef_info_tag); +++ if ( hdr_id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,hdr_id) ) +++ ksprintf(&msg,". FORMAT fields must be enclosed in square brackets, e.g. 
\"[ %%%s]\"", convert->undef_info_tag); +++ error("%s\n", msg.s); +++ } ++ ++ int l_ori = str->l; ++ bcf_unpack(line, convert->max_unpack); ++@@ -1359,7 +1567,7 @@ ++ for (js=0; jsnsamples; js++) ++ { ++ // Skip samples when filtering was requested ++- if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; +++ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; ++ ++ // Here comes a hack designed for TBCSQ. When running on large files, ++ // such as 1000GP, there are too many empty fields in the output and ++--- python-pysam.orig/bcftools/csq.c +++++ python-pysam/bcftools/csq.c ++@@ -1,3 +1,6 @@ +++//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +++ +++ ++ /* The MIT License ++ ++ Copyright (c) 2016-2018 Genome Research Ltd. ++@@ -25,6 +28,7 @@ ++ */ ++ /* ++ Things that would be nice to have +++ - dynamic N_REF_PAD ++ - for stop-lost events (also in frameshifts) report the number of truncated aa's ++ - memory could be greatly reduced by indexing gff (but it is quite compact already) ++ - deletions that go beyond transcript boundaries are not checked at sequence level ++@@ -95,6 +99,7 @@ ++ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron ++ synonymous_variant .. DNA sequence variant resulting in no amino acid change ++ stop_retained_variant .. different stop codon +++ start_retained_variant .. start codon retained by indel realignment ++ non_coding_variant .. 
variant in non-coding sequence, such as RNA gene ++ 5_prime_UTR_variant ++ 3_prime_UTR_variant ++@@ -133,6 +138,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -142,7 +148,6 @@ ++ #include ++ #include ++ #include ++-#include ++ #include ++ #include "bcftools.h" ++ #include "filter.h" ++@@ -208,13 +213,15 @@ ++ #define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string ++ #define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf ++ #define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence +++#define CSQ_ELONGATION (1<<22) // symbolic insertion +++#define CSQ_START_RETAINED (1<<23) ++ ++ // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 ++ #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ ++ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ ++ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ ++- CSQ_UPSTREAM_STOP) ++-#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) +++ CSQ_UPSTREAM_STOP|CSQ_START_RETAINED) +++#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED) ++ ++ #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) ++ #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) ++@@ -244,7 +251,9 @@ ++ "inframe_altering", ++ NULL, ++ NULL, ++- "coding_sequence" +++ "coding_sequence", +++ "feature_elongation", +++ "start_retained" ++ }; ++ ++ ++@@ -339,7 +348,7 @@ ++ typedef struct ++ { ++ char *name; // human readable name, e.g. 
ORF45 ++- uint8_t iseq; +++ uint32_t iseq; ++ } ++ gf_gene_t; ++ typedef struct ++@@ -392,7 +401,8 @@ ++ { ++ bcf1_t *line; ++ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved ++- uint32_t nfmt:4, nvcsq:28, mvcsq; +++ uint32_t nfmt:4, // the bitmask size (the number of integers per sample) +++ nvcsq:28, mvcsq; ++ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record ++ } ++ vrec_t; ++@@ -408,6 +418,7 @@ ++ { ++ vrec_t **vrec; // buffer of VCF lines with the same position ++ int n, m; +++ uint32_t keep_until; // the maximum transcript end position ++ }; ++ KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) ++ ++@@ -580,9 +591,10 @@ ++ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; ++ char *bcsq_tag; ++ int argc, output_type; ++- int phase, quiet, local_csq; +++ int phase, verbosity, local_csq, record_cmd_line; ++ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ ++ int ncsq_small_warned; +++ int brief_predictions; ++ ++ int rid; // current chromosome ++ tr_heap_t *active_tr; // heap of active transcripts for quick flushing ++@@ -596,6 +608,7 @@ ++ int ncsq_buf, mcsq_buf; ++ id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx ++ int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts +++ int n_threads; // extra compression/decompression threads ++ ++ faidx_t *fai; ++ kstring_t str, str2; ++@@ -671,7 +684,7 @@ ++ aux->seq[aux->nseq] = strdup(chr_beg); ++ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); ++ aux->nseq++; ++- assert( aux->nseq < 256 ); // see gf_gene_t.iseq +++ assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq ++ } ++ chr_end[1] = c; ++ return iseq; ++@@ -886,7 +899,7 @@ ++ int biotype = gff_parse_biotype(ss); ++ if ( biotype <= 0 ) ++ { ++- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored transcript: %s\n",line); +++ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line); ++ return; ++ } ++ ++@@ -912,7 +925,7 @@ ++ int biotype = gff_parse_biotype(ss); ++ if ( biotype <= 0 ) ++ { ++- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored gene: %s\n",line); +++ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line); ++ return; ++ } ++ ++@@ -978,7 +991,7 @@ ++ if ( !ss ) return -1; // no ID, ignore the line ++ if ( !strncmp("chromosome",ss+3,10) ) return -1; ++ if ( !strncmp("supercontig",ss+3,11) ) return -1; ++- if ( args->quiet<2 ) fprintf(stderr,"ignored: %s\n", line); +++ if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line); ++ return -1; ++ } ++ ++@@ -1000,7 +1013,7 @@ ++ // 7. column: strand ++ if ( *ss == '+' ) ftr->strand = STRAND_FWD; ++ else if ( *ss == '-' ) ftr->strand = STRAND_REV; ++- else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } +++ else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } ++ ss += 2; ++ ++ // 8. column: phase (codon offset) ++@@ -1008,7 +1021,7 @@ ++ else if ( *ss == '1' ) ftr->phase = 1; ++ else if ( *ss == '2' ) ftr->phase = 2; ++ else if ( *ss == '.' 
) ftr->phase = 0; // exons do not have phase ++- else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } +++ else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ++ ss += 2; ++ ++ // substring search for "Parent=transcript:ENST00000437963" ++@@ -1122,7 +1135,7 @@ ++ { ++ if ( args->force ) ++ { ++- if ( args->quiet < 2 ) +++ if ( args->verbosity > 0 ) ++ fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); ++ tscript_ok = 0; ++ break; ++@@ -1160,7 +1173,7 @@ ++ { ++ if ( args->force ) ++ { ++- if ( args->quiet < 2 ) +++ if ( args->verbosity > 0 ) ++ fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); ++ tscript_ok = 0; ++ break; ++@@ -1293,7 +1306,7 @@ ++ } ++ tscript_init_cds(args); ++ ++- if ( !args->quiet ) +++ if ( args->verbosity > 0 ) ++ { ++ fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", ++ regidx_nregs(args->idx_tscript), ++@@ -1309,14 +1322,16 @@ ++ free(aux->seq); ++ gff_id_destroy(&aux->gene_ids); ++ ++- if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) +++ if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) ++ { ++ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; ++ fprintf(stderr,"Ignored the following biotypes:\n"); ++ for (i = kh_begin(ign); i < kh_end(ign); i++) ++ { ++ if ( !kh_exist(ign,i)) continue; ++- fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); +++ const char *biotype = kh_key(ign,i); +++ if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; +++ fprintf(stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), biotype); ++ } ++ } ++ khash_str2int_destroy_free(aux->ignored_biotypes); ++@@ -1326,7 +1341,7 @@ ++ { ++ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; ++ ++- if ( !args->quiet ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); +++ if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); ++ init_gff(args); ++ ++ args->rid = -1; ++@@ -1349,7 +1364,8 @@ ++ if ( args->output_type==FT_TAB_TEXT ) ++ { ++ // significant speedup for plain VCFs ++- bcf_hdr_set_samples(args->hdr,NULL,0); +++ if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) +++ error_errno("[%s] Couldn't build sample filter", __func__); ++ } ++ args->phase = PHASE_DROP_GT; ++ } ++@@ -1360,7 +1376,7 @@ ++ if ( args->output_type==FT_TAB_TEXT ) ++ { ++ args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout; ++- if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); +++ if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno)); ++ ++ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); ++ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); ++@@ -1380,14 +1396,16 @@ ++ else ++ { ++ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); ++- bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); ++- bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); +++ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? 
args->output_fname : "standard output", strerror(errno)); +++ if ( args->n_threads > 0) +++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); +++ if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); +++ bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); ++ if ( args->hdr_nsmpl ) ++ bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); ++- bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); ++ } ++- if ( !args->quiet ) fprintf(stderr,"Calling...\n"); +++ if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n"); ++ } ++ ++ void destroy_data(args_t *args) ++@@ -1487,6 +1505,7 @@ ++ splice->vcf.pos = rec->pos; ++ splice->vcf.rlen = rec->rlen; ++ splice->vcf.ref = rec->d.allele[0]; +++ splice->csq = 0; ++ } ++ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) ++ { ++@@ -1594,7 +1613,7 @@ ++ #endif ++ } ++ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); ++-static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) +++static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) ++ { ++ while ( regitr_overlap(itr) ) ++ { ++@@ -1604,7 +1623,7 @@ ++ csq_t csq; ++ memset(&csq, 0, sizeof(csq_t)); ++ csq.pos = rec->pos; ++- csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; +++ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | type; ++ csq.type.biotype = tr->type; ++ csq.type.strand = tr->strand; ++ csq.type.trid = tr->id; ++@@ -1658,7 +1677,7 @@ ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr ++ { ++- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ if ( ret!=0 ) ++ { ++ regitr_destroy(itr); ++@@ -1696,7 +1715,7 @@ ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr ++ { ++- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ if ( ret!=0 ) ++ { ++ regitr_destroy(itr); ++@@ -1763,14 +1782,105 @@ ++ return SPLICE_INSIDE; ++ } ++ +++int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +++{ +++ static int small_ref_padding_warned = 0; +++ tscript_t *tr = splice->tr; +++ +++ // We know the VCF record overlaps the exon, but does it overlap the start codon? +++ if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; +++ if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0; +++ +++#if XDBG +++ fprintf(stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev"); +++ fprintf(stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt); +++#endif +++ +++ // is there enough ref sequence for the extension? 
All coordinates are 0-based +++ int ref_len = strlen(splice->vcf.ref); +++ int alt_len = strlen(splice->vcf.alt); +++ assert( ref_len > alt_len ); +++ int ndel = ref_len - alt_len; +++ +++ if ( tr->strand==STRAND_REV ) +++ { +++ int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1; // end pos of the VCF REF allele +++ int32_t tr_ref_end = splice->tr->end + N_REF_PAD; // the end pos of accessible cached ref seq +++ if ( vcf_ref_end + ndel > tr_ref_end ) +++ { +++ if ( !small_ref_padding_warned ) +++ { +++ fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); +++ small_ref_padding_warned = 1; +++ } +++ return 0; +++ } +++ +++ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele +++ char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted +++#if XDBG +++ fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); +++#endif +++ int i = 0; +++ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; +++ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced +++ } +++ else +++ { +++ // STRAND_FWD +++ int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion +++ if ( vcf_block_beg < 0 ) return 0; +++ +++#if XDBG +++ fprintf(stderr,"vcf_block_beg: %d\n",vcf_block_beg+1); +++#endif +++ +++ if ( N_REF_PAD + vcf_block_beg < ex_beg ) +++ { +++ if ( !small_ref_padding_warned ) +++ { +++ fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. 
(Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); +++ small_ref_padding_warned = 1; +++ } +++ return 0; +++ } +++ +++ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele +++ char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block +++#if XDBG +++ fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); +++#endif +++ +++ int i = 0; +++ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; +++ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced +++ } +++ +++ return 1; +++} +++ ++ static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) ++ { +++ if ( splice->check_start ) +++ { +++ // check for synonymous start +++ // test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt +++ // test/csq/ENST00000368801.2/start-lost.txt +++ // test/csq/ENST00000318249.2/synonymous-start-lost.txt +++ int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end); +++ if ( is_synonymous ) +++ { +++ splice->csq |= CSQ_START_RETAINED; +++ return SPLICE_OVERLAP; +++ } +++ } +++ ++ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG ++ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base ++ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base ++ ++ #if XDBG ++-fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +++fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); ++ #endif ++ ++ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 ++@@ -1783,7 +1893,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1839,7 +1949,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1874,7 +1984,6 @@ ++ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); ++ return SPLICE_OUTSIDE; ++ } ++- ++ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 ++ { ++ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; ++@@ -1929,7 +2038,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1959,7 +2068,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = 
bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -2008,7 +2117,6 @@ ++ } ++ static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) ++ { ++- splice->csq = 0; ++ splice->vcf.alen = strlen(splice->vcf.alt); ++ ++ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; ++@@ -2038,6 +2146,7 @@ ++ return 0; ++ } ++ +++ ++ // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) ++ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) ++ { ++@@ -2070,7 +2179,7 @@ ++ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; ++ ++ #if XDBG ++-fprintf(stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); +++fprintf(stderr,"\nhap_init: %d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); ++ #endif ++ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); ++ #if XDBG ++@@ -2078,7 +2187,7 @@ ++ #endif ++ ++ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA ++- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq +++ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq ++ { ++ free(splice.kref.s); ++ free(splice.kalt.s); ++@@ -2136,6 +2245,8 @@ ++ if ( len < 0 ) // overlapping variants ++ { ++ free(str.s); +++ free(splice.kref.s); +++ free(splice.kalt.s); ++ return 1; ++ } ++ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); ++@@ -2173,6 +2284,7 @@ ++ if ( !child->csq ) child->csq |= 
CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf ++ } ++ +++ ++ free(splice.kref.s); ++ free(splice.kalt.s); ++ return 0; ++@@ -2206,7 +2318,7 @@ ++ void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) ++ { ++ #if XDBG ++-fprintf(stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); +++fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); ++ #endif ++ char tmp[3], *codon, *end; ++ int i, len, npad; ++@@ -2306,7 +2418,7 @@ ++ #if DBG>1 ++ fprintf(stderr," npad: %d\n",npad); ++ #endif ++-if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); +++ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad); ++ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand ++ ++ if ( npad==2 ) ++@@ -2327,8 +2439,8 @@ ++ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); ++ #if DBG>1 ++ fprintf(stderr,"\t i=%d\n", i); ++- if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]); ++- if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]); +++ if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]); +++ if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]); ++ #endif ++ if ( i==-1 ) ++ { ++@@ -2569,12 +2681,25 @@ ++ kputs(csq->vstr.s, str); ++ } ++ +++void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) +++{ +++ if ( !args->brief_predictions ) +++ kputs(aa->s, str); +++ else +++ { +++ int len = aa->l; +++ if ( aa->s[len-1]=='*' ) len--; +++ kputc(aa->s[0], str); +++ kputs("..", str); +++ kputw(beg+len, str); +++ } +++} +++ ++ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) ++ { ++ int i; ++ tscript_t *tr = hap->tr; ++ int ref_node = tr->strand==STRAND_FWD ? 
ibeg : iend; ++- ++ int icsq = node->ncsq_list++; ++ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); ++ csq_t *csq = &node->csq_list[icsq]; ++@@ -2678,12 +2803,12 @@ ++ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; ++ kputc_('|', &str); ++ kputw(aa_rbeg, &str); ++- kputs(hap->tref.s, &str); +++ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); ++ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) ++ { ++ kputc_('>', &str); ++ kputw(aa_sbeg, &str); ++- kputs(hap->tseq.s, &str); +++ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); ++ } ++ kputc_('|', &str); ++ ++@@ -2961,18 +3086,15 @@ ++ int icsq = 2*csq->idx + ihap; ++ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT ++ { ++- int print_warning = 1; ++- if ( args->quiet ) +++ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) ++ { ++- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; +++ fprintf(stderr, +++ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", +++ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); +++ if ( !args->ncsq_small_warned ) +++ fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); ++ args->ncsq_small_warned = 1; ++ } ++- if ( print_warning ) ++- { ++- fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", ++- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); ++- if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); ++- } ++ break; ++ } ++ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; ++@@ -2984,12 +3106,10 @@ ++ { ++ int i,j; ++ tr_heap_t *heap = args->active_tr; ++- ++ while ( heap->ndat && heap->dat[0]->end<=pos ) ++ { ++ tscript_t *tr = heap->dat[0]; ++ khp_delete(trhp, heap); ++- ++ args->hap->tr = tr; ++ if ( tr->root && tr->root->nchild ) // normal, non-localized calling ++ { ++@@ -3028,7 +3148,7 @@ ++ ++ #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } ++ ++-void vbuf_push(args_t *args, bcf1_t **rec_ptr) +++vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) ++ { ++ int i; ++ ++@@ -3044,6 +3164,7 @@ ++ i = rbuf_append(&args->vcf_rbuf); ++ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); ++ args->vcf_buf[i]->n = 0; +++ args->vcf_buf[i]->keep_until = 0; ++ } ++ vbuf_t *vbuf = args->vcf_buf[i]; ++ vbuf->n++; ++@@ -3063,16 +3184,29 @@ ++ int ret; ++ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); ++ kh_val(args->pos2vbuf,k) = vbuf; +++ +++ return vbuf; ++ } ++ ++-void vbuf_flush(args_t *args) +++void vbuf_flush(args_t *args, uint32_t pos) ++ { ++- if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone ++- ++ int i,j; ++- while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) +++ while ( args->vcf_rbuf.n ) ++ { ++- vbuf_t *vbuf = args->vcf_buf[i]; +++ vbuf_t *vbuf; +++ if ( !args->local_csq && args->active_tr->ndat ) +++ { +++ // check if the first active transcript starts beyond the first buffered VCF record, +++ // cannot output buffered VCF lines 
(args.vbuf) until the active transcripts are gone +++ vbuf = args->vcf_buf[ args->vcf_rbuf.f ]; +++ if ( vbuf->keep_until > pos ) break; +++ assert( vbuf->n ); +++ } +++ +++ i = rbuf_shift(&args->vcf_rbuf); +++ assert( i>=0 ); +++ vbuf = args->vcf_buf[i]; +++ int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1; ++ for (i=0; in; i++) ++ { ++ vrec_t *vrec = vbuf->vrec[i]; ++@@ -3083,7 +3217,10 @@ ++ } ++ if ( !vrec->nvcsq ) ++ { ++- bcf_write(args->out_fh, args->hdr, vrec->line); +++ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); +++ int save_pos = vrec->line->pos; +++ bcf_empty(vrec->line); +++ vrec->line->pos = save_pos; // this is necessary for compound variants ++ continue; ++ } ++ ++@@ -3098,19 +3235,24 @@ ++ if ( args->hdr_nsmpl ) ++ { ++ if ( vrec->nfmt < args->nfmt_bcsq ) ++- for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); +++ for (j=1; jhdr_nsmpl; j++) +++ memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); ++ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); ++ } ++ vrec->nvcsq = 0; ++- bcf_write(args->out_fh, args->hdr, vrec->line); +++ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); +++ int save_pos = vrec->line->pos; +++ bcf_empty(vrec->line); +++ vrec->line->pos = save_pos; ++ } ++- if ( vbuf->n ) +++ if ( pos!=-1 ) ++ { ++- khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); +++ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos); ++ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); ++ } ++ vbuf->n = 0; ++ } +++ if ( args->active_tr->ndat ) return; ++ ++ for (i=0; inrm_tr; i++) ++ { ++@@ -3137,10 +3279,12 @@ 
++ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); ++ if ( pad_beg + pad_end != 2*N_REF_PAD ) ++ { ++- char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); +++ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); ++ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; ++ memcpy(ref+i, tr->ref, len); +++ len += i; ++ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; +++ ref[i+len] = 0; ++ free(tr->ref); ++ tr->ref = ref; ++ } ++@@ -3148,15 +3292,19 @@ ++ ++ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) ++ { ++- char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); ++- char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); ++- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); ++- while ( *ref && *vcf ) ++- { ++- if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) ++- error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); ++- ref++; ++- vcf++; +++ int vbeg = 0; +++ int rbeg = rec->pos - tr->beg + N_REF_PAD; +++ if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } +++ char *ref = tr->ref + rbeg; +++ char *vcf = rec->d.allele[0] + vbeg; +++ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); +++ int i = 0; +++ while ( ref[i] && vcf[i] ) +++ { +++ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) +++ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", +++ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); +++ i++; ++ } ++ } ++ ++@@ -3195,6 +3343,7 @@ ++ ++ for (i=1; in_allele; i++) ++ { +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; ++ ++ csq_t csq; ++@@ -3294,12 +3443,12 @@ ++ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; ++ kputc_('|', &str); ++ kputw(aa_rbeg, &str); ++- kputs(tref->s, &str); +++ kprint_aa_prediction(args,aa_rbeg,tref,&str); ++ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) ++ { ++ kputc_('>', &str); ++ kputw(aa_sbeg, &str); ++- kputs(tseq->s, &str); +++ kprint_aa_prediction(args,aa_sbeg,tseq,&str); ++ } ++ kputc_('|', &str); ++ kputw(rec->pos+1, &str); ++@@ -3330,8 +3479,10 @@ ++ return ret; ++ } ++ ++-int test_cds(args_t *args, bcf1_t *rec) +++int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) ++ { +++ static int overlaps_warned = 0, multiploid_warned = 0; +++ ++ int i, ret = 0, hap_ret; ++ const char *chr = bcf_seqname(args->hdr,rec); ++ // note that the off-by-one extension of rlen is deliberate to account for insertions ++@@ -3341,6 +3492,7 @@ ++ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); ++ tscript_t *tr = cds->tr; ++ if ( !GF_is_coding(tr->type) ) continue; +++ if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; ++ ret = 1; ++ if ( !tr->root ) ++ { ++@@ -3370,10 +3522,17 @@ ++ // overlapping or intron variant, cannot apply ++ if ( hap_ret==1 ) ++ { ++- if ( !args->quiet ) ++- fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) +++ { +++ fprintf(stderr, +++ "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( !overlaps_warned ) +++ 
fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ overlaps_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); ++ } ++ else ret = 1; // prevent reporting as intron in test_tscript ++ hap_destroy(child); ++@@ -3409,10 +3568,17 @@ ++ ngts /= bcf_hdr_nsamples(args->hdr); ++ if ( ngts!=1 && ngts!=2 ) ++ { ++- if ( !args->quiet ) ++- fprintf(stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) +++ { +++ fprintf(stderr, +++ "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( !multiploid_warned ) +++ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ multiploid_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); ++ continue; ++ } ++ for (ismpl=0; ismplsmpl->n; ismpl++) ++@@ -3429,7 +3595,7 @@ ++ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) ++ { ++ if ( args->phase==PHASE_REQUIRE ) ++- error("Unphased heterozygous genotype at %s:%d, sample %s. 
See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); +++ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); ++ if ( args->phase==PHASE_SKIP ) ++ continue; ++ if ( args->phase==PHASE_NON_REF ) ++@@ -3468,12 +3634,18 @@ ++ // overlapping or intron variant, cannot apply ++ if ( hap_ret==1 ) ++ { ++- if ( !args->quiet ) ++- fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", ++- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) +++ { +++ fprintf(stderr, +++ "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ if ( !overlaps_warned ) +++ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ overlaps_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", ++- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", +++ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); ++ } ++ hap_destroy(child); ++ continue; ++@@ -3559,19 +3731,15 @@ ++ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT ++ { ++ int ismpl = args->smpl->idx[i]; ++- int print_warning = 1; ++- if ( args->quiet ) +++ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) ++ { ++- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; +++ fprintf(stderr, +++ "Warning: Too many 
consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", +++ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); +++ if ( !args->ncsq_small_warned ) +++ fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); ++ args->ncsq_small_warned = 1; ++ } ++- if ( print_warning ) ++- { ++- fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", ++- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); ++- if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); ++- } ++- break; ++ } ++ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; ++ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); ++@@ -3594,8 +3762,9 @@ ++ tscript_t *tr = splice.tr = utr->tr; ++ for (i=1; in_allele; i++) ++ { ++- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); ++ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; ++ csq_t csq; ++@@ -3637,6 +3806,7 @@ ++ { ++ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ splice_csq(args, &splice, exon->beg, exon->end); ++ if ( splice.csq ) ret = 1; ++ } ++@@ -3659,8 +3829,9 @@ ++ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); ++ for (i=1; in_allele; i++) ++ { ++- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); ++ if ( 
splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF ++ csq_t csq; ++@@ -3680,22 +3851,151 @@ ++ return ret; ++ } ++ ++-void process(args_t *args, bcf1_t **rec_ptr) +++void test_symbolic_alt(args_t *args, bcf1_t *rec) +++{ +++ static int warned = 0; +++ if ( args->verbosity && (!warned && args->verbosity > 0) ) +++ { +++ fprintf(stderr,"Warning: The support for symbolic ALT insertions is experimental.\n"); +++ warned = 1; +++ } +++ +++ const char *chr = bcf_seqname(args->hdr,rec); +++ +++ // only insertions atm +++ int beg = rec->pos + 1; +++ int end = beg; +++ int csq_class = CSQ_ELONGATION; +++ +++ int hit = 0; +++ if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) +++ { +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); +++ tscript_t *tr = cds->tr; +++ csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ hit = 1; +++ } +++ } +++ if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) +++ { +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); +++ tscript_t *tr = utr->tr; +++ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ hit = 1; +++ } +++ } +++ if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) +++ { +++ splice_t splice; +++ splice_init(&splice, rec); +++ splice.check_acceptor = splice.check_donor = 1; +++ +++ while ( regitr_overlap(args->itr) ) +++ { +++ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); +++ splice.tr = exon->tr; +++ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites +++ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; +++ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1; +++ splice.vcf.alt = rec->d.allele[1]; +++ splice.csq = csq_class; +++ splice_csq(args, &splice, exon->beg, exon->end); +++ if ( splice.csq ) hit = 1; +++ } +++ } +++ if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) +++ { +++ splice_t splice; +++ splice_init(&splice, rec); +++ +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); +++ splice.vcf.alt = rec->d.allele[1]; +++ splice.csq = csq_class; +++ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); +++ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF +++ csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_INTRON : CSQ_NON_CODING) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ } +++ } +++} +++ +++void debug_print_buffers(args_t *args, int pos) +++{ +++ int i,j; +++ fprintf(stderr,"debug_print_buffers at %d\n", pos); +++ fprintf(stderr,"vbufs:\n"); +++ for (i=0; ivcf_rbuf.n; i++) +++ { +++ int k = rbuf_kth(&args->vcf_rbuf, i); +++ vbuf_t *vbuf = args->vcf_buf[k]; +++ +++ fprintf(stderr,"\tvbuf %d:\n", i); +++ for (j=0; jn; j++) +++ { +++ vrec_t *vrec = vbuf->vrec[j]; +++ fprintf(stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq); +++ } +++ } +++ fprintf(stderr,"pos2vbuf:"); +++ khint_t k; +++ for (k = 0; k < kh_end(args->pos2vbuf); ++k) +++ if (kh_exist(args->pos2vbuf, k)) fprintf(stderr," %d",1+(int)kh_key(args->pos2vbuf, k)); +++ fprintf(stderr,"\n"); +++ fprintf(stderr,"active_tr: %d\n", args->active_tr->ndat); +++} +++ +++static void process(args_t *args, bcf1_t **rec_ptr) ++ { ++ if ( !rec_ptr ) ++ { ++ hap_flush(args, REGIDX_MAX); ++- vbuf_flush(args); +++ vbuf_flush(args, REGIDX_MAX); ++ return; ++ } ++ ++ bcf1_t *rec = *rec_ptr; +++ static int32_t prev_rid = -1, prev_pos = -1; +++ if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } +++ if ( prev_pos > rec->pos ) +++ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ ++ int call_csq = 1; ++- if ( !rec->n_allele ) call_csq = 0; // no alternate allele ++- else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele ++- else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc ++- else if ( args->filter ) +++ if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele 
+++ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele +++ else if ( rec->d.allele[1][0]=='<' ) +++ { +++ if ( strncmp("d.allele[1], 4) ) call_csq = 0; // only is supported at the moment +++ } +++ if ( call_csq && args->filter ) ++ { ++ call_csq = filter_test(args->filter, rec, NULL); ++ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1; ++@@ -3704,25 +4004,34 @@ ++ { ++ if ( !args->out_fh ) return; // not a VCF output ++ vbuf_push(args, rec_ptr); ++- vbuf_flush(args); +++ hap_flush(args, rec->pos-1); +++ vbuf_flush(args, rec->pos-1); ++ return; ++ } ++ ++ if ( args->rid != rec->rid ) ++ { ++ hap_flush(args, REGIDX_MAX); ++- vbuf_flush(args); +++ vbuf_flush(args, REGIDX_MAX); ++ } ++ args->rid = rec->rid; ++- vbuf_push(args, rec_ptr); +++ vbuf_t *vbuf = vbuf_push(args, rec_ptr); ++ ++- int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); ++- hit += test_utr(args, rec); ++- hit += test_splice(args, rec); ++- if ( !hit ) test_tscript(args, rec); +++ if ( rec->d.allele[1][0]!='<' ) +++ { +++ int hit = args->local_csq ? 
test_cds_local(args, rec) : test_cds(args, rec, vbuf); +++ hit += test_utr(args, rec); +++ hit += test_splice(args, rec); +++ if ( !hit ) test_tscript(args, rec); +++ } +++ else +++ test_symbolic_alt(args, rec); ++ ++- hap_flush(args, rec->pos-1); ++- vbuf_flush(args); +++ if ( rec->pos > 0 ) +++ { +++ hap_flush(args, rec->pos-1); +++ vbuf_flush(args, rec->pos-1); +++ } ++ ++ return; ++ } ++@@ -3739,6 +4048,7 @@ ++ " -g, --gff-annot gff3 annotation file\n" ++ "\n" ++ "CSQ options:\n" +++ " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" ++ " -c, --custom-tag use this tag instead of the default BCSQ\n" ++ " -l, --local-csq localized predictions, consider only one VCF record at a time\n" ++ " -n, --ncsq maximum number of consequences to consider per site [16]\n" ++@@ -3752,16 +4062,18 @@ ++ " -e, --exclude exclude sites for which the expression is true\n" ++ " --force run even if some sanity checks fail\n" ++ " -i, --include select sites for which the expression is true\n" +++ " --no-version do not append version and command line to the header\n" ++ " -o, --output write output to a file [standard output]\n" ++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" ++ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" ++- " -q, --quiet suppress warning messages. 
Can be given two times for even less messages\n" ++ " -r, --regions restrict to comma-separated list of regions\n" ++ " -R, --regions-file restrict to regions listed in a file\n" ++ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" ++ " -S, --samples-file samples to include\n" ++ " -t, --targets similar to -r but streams rather than index-jumps\n" ++ " -T, --targets-file similar to -R but streams rather than index-jumps\n" +++ " --threads use multithreading with worker threads [0]\n" +++ " -v, --verbose verbosity level 0-2 [1]\n" ++ "\n" ++ "Example:\n" ++ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" ++@@ -3779,12 +4091,16 @@ ++ args->output_type = FT_VCF; ++ args->bcsq_tag = "BCSQ"; ++ args->ncsq_max = 2*16; +++ args->verbosity = 1; +++ args->record_cmd_line = 1; ++ ++ static struct option loptions[] = ++ { ++ {"force",0,0,1}, +++ {"threads",required_argument,NULL,2}, ++ {"help",0,0,'h'}, ++ {"ncsq",1,0,'n'}, +++ {"brief-predictions",0,0,'b'}, ++ {"custom-tag",1,0,'c'}, ++ {"local-csq",0,0,'l'}, ++ {"gff-annot",1,0,'g'}, ++@@ -3795,24 +4111,36 @@ ++ {"output-type",1,NULL,'O'}, ++ {"phase",1,0,'p'}, ++ {"quiet",0,0,'q'}, +++ {"verbose",1,0,'v'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++ {"samples",1,0,'s'}, ++ {"samples-file",1,0,'S'}, ++ {"targets",1,0,'t'}, ++ {"targets-file",1,0,'T'}, +++ {"no-version",no_argument,NULL,3}, ++ {0,0,0,0} ++ }; ++ int c, targets_is_file = 0, regions_is_file = 0; ++- char *targets_list = NULL, *regions_list = NULL; ++- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) +++ char *targets_list = NULL, *regions_list = NULL, *tmp; +++ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 1 : args->force = 1; break; +++ case 2 : +++ args->n_threads = strtol(optarg,&tmp,10); +++ if ( *tmp ) error("Could not parse argument: --threads 
%s\n", optarg); +++ break; +++ case 3 : args->record_cmd_line = 0; break; +++ case 'b': args->brief_predictions = 1; break; ++ case 'l': args->local_csq = 1; break; ++ case 'c': args->bcsq_tag = optarg; break; ++- case 'q': args->quiet++; break; +++ case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; +++ case 'v': +++ args->verbosity = atoi(optarg); +++ if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); +++ break; ++ case 'p': ++ switch (optarg[0]) ++ { ++@@ -3869,8 +4197,9 @@ ++ error("Failed to read the targets: %s\n", targets_list); ++ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) ++ error("Failed to read the regions: %s\n", regions_list); +++ if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); ++ if ( !bcf_sr_add_reader(args->sr, fname) ) ++- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); +++ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ ++ init_data(args); ++@@ -3883,7 +4212,6 @@ ++ destroy_data(args); ++ bcf_sr_destroy(args->sr); ++ free(args); ++- ++ return 0; ++ } ++ ++--- python-pysam.orig/bcftools/csq.c.pysam.c +++++ python-pysam/bcftools/csq.c.pysam.c ++@@ -1,5 +1,8 @@ ++ #include "bcftools.pysam.h" ++ +++//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +++ +++ ++ /* The MIT License ++ ++ Copyright (c) 2016-2018 Genome Research Ltd. 
++@@ -27,6 +30,7 @@ ++ */ ++ /* ++ Things that would be nice to have +++ - dynamic N_REF_PAD ++ - for stop-lost events (also in frameshifts) report the number of truncated aa's ++ - memory could be greatly reduced by indexing gff (but it is quite compact already) ++ - deletions that go beyond transcript boundaries are not checked at sequence level ++@@ -97,6 +101,7 @@ ++ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron ++ synonymous_variant .. DNA sequence variant resulting in no amino acid change ++ stop_retained_variant .. different stop codon +++ start_retained_variant .. start codon retained by indel realignment ++ non_coding_variant .. variant in non-coding sequence, such as RNA gene ++ 5_prime_UTR_variant ++ 3_prime_UTR_variant ++@@ -135,6 +140,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -144,7 +150,6 @@ ++ #include ++ #include ++ #include ++-#include ++ #include ++ #include "bcftools.h" ++ #include "filter.h" ++@@ -210,13 +215,15 @@ ++ #define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string ++ #define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf ++ #define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence +++#define CSQ_ELONGATION (1<<22) // symbolic insertion +++#define CSQ_START_RETAINED (1<<23) ++ ++ // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 ++ #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ ++ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ ++ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ ++- CSQ_UPSTREAM_STOP) ++-#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) +++ CSQ_UPSTREAM_STOP|CSQ_START_RETAINED) +++#define CSQ_START_STOP 
(CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED) ++ ++ #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) ++ #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) ++@@ -246,7 +253,9 @@ ++ "inframe_altering", ++ NULL, ++ NULL, ++- "coding_sequence" +++ "coding_sequence", +++ "feature_elongation", +++ "start_retained" ++ }; ++ ++ ++@@ -341,7 +350,7 @@ ++ typedef struct ++ { ++ char *name; // human readable name, e.g. ORF45 ++- uint8_t iseq; +++ uint32_t iseq; ++ } ++ gf_gene_t; ++ typedef struct ++@@ -394,7 +403,8 @@ ++ { ++ bcf1_t *line; ++ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved ++- uint32_t nfmt:4, nvcsq:28, mvcsq; +++ uint32_t nfmt:4, // the bitmask size (the number of integers per sample) +++ nvcsq:28, mvcsq; ++ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record ++ } ++ vrec_t; ++@@ -410,6 +420,7 @@ ++ { ++ vrec_t **vrec; // buffer of VCF lines with the same position ++ int n, m; +++ uint32_t keep_until; // the maximum transcript end position ++ }; ++ KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) ++ ++@@ -582,9 +593,10 @@ ++ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; ++ char *bcsq_tag; ++ int argc, output_type; ++- int phase, quiet, local_csq; +++ int phase, verbosity, local_csq, record_cmd_line; ++ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ ++ int ncsq_small_warned; +++ int brief_predictions; ++ ++ int rid; // current chromosome ++ tr_heap_t *active_tr; // heap of active transcripts for quick flushing ++@@ -598,6 +610,7 @@ ++ int ncsq_buf, mcsq_buf; ++ id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx ++ int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts +++ int n_threads; // extra compression/decompression threads ++ ++ faidx_t *fai; ++ kstring_t str, str2; ++@@ -673,7 +686,7 @@ ++ aux->seq[aux->nseq] = strdup(chr_beg); ++ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); ++ aux->nseq++; ++- assert( aux->nseq < 256 ); // see gf_gene_t.iseq +++ assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq ++ } ++ chr_end[1] = c; ++ return iseq; ++@@ -888,7 +901,7 @@ ++ int biotype = gff_parse_biotype(ss); ++ if ( biotype <= 0 ) ++ { ++- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); +++ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); ++ return; ++ } ++ ++@@ -914,7 +927,7 @@ ++ int biotype = gff_parse_biotype(ss); ++ if ( biotype <= 0 ) ++ { ++- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); +++ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); ++ return; ++ } ++ ++@@ -980,7 +993,7 @@ ++ if ( !ss ) return -1; // no ID, ignore the line ++ if ( !strncmp("chromosome",ss+3,10) ) return -1; ++ if ( !strncmp("supercontig",ss+3,11) ) return -1; ++- if ( args->quiet<2 ) fprintf(bcftools_stderr,"ignored: %s\n", line); +++ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored: %s\n", line); ++ return -1; ++ } ++ ++@@ -1002,7 +1015,7 @@ ++ // 7. column: strand ++ if ( *ss == '+' ) ftr->strand = STRAND_FWD; ++ else if ( *ss == '-' ) ftr->strand = STRAND_REV; ++- else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } +++ else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } ++ ss += 2; ++ ++ // 8. 
column: phase (codon offset) ++@@ -1010,7 +1023,7 @@ ++ else if ( *ss == '1' ) ftr->phase = 1; ++ else if ( *ss == '2' ) ftr->phase = 2; ++ else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase ++- else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } +++ else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ++ ss += 2; ++ ++ // substring search for "Parent=transcript:ENST00000437963" ++@@ -1124,7 +1137,7 @@ ++ { ++ if ( args->force ) ++ { ++- if ( args->quiet < 2 ) +++ if ( args->verbosity > 0 ) ++ fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); ++ tscript_ok = 0; ++ break; ++@@ -1162,7 +1175,7 @@ ++ { ++ if ( args->force ) ++ { ++- if ( args->quiet < 2 ) +++ if ( args->verbosity > 0 ) ++ fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); ++ tscript_ok = 0; ++ break; ++@@ -1295,7 +1308,7 @@ ++ } ++ tscript_init_cds(args); ++ ++- if ( !args->quiet ) +++ if ( args->verbosity > 0 ) ++ { ++ fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", ++ regidx_nregs(args->idx_tscript), ++@@ -1311,14 +1324,16 @@ ++ free(aux->seq); ++ gff_id_destroy(&aux->gene_ids); ++ ++- if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) +++ if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) ++ { ++ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; ++ fprintf(bcftools_stderr,"Ignored the following biotypes:\n"); ++ for (i = kh_begin(ign); i < kh_end(ign); i++) ++ { ++ if ( !kh_exist(ign,i)) continue; ++- fprintf(bcftools_stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), kh_key(ign,i)); +++ const char *biotype = kh_key(ign,i); +++ if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; +++ fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), biotype); ++ } ++ } ++ khash_str2int_destroy_free(aux->ignored_biotypes); ++@@ -1328,7 +1343,7 @@ ++ { ++ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; ++ ++- if ( !args->quiet ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); +++ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); ++ init_gff(args); ++ ++ args->rid = -1; ++@@ -1351,7 +1366,8 @@ ++ if ( args->output_type==FT_TAB_TEXT ) ++ { ++ // significant speedup for plain VCFs ++- bcf_hdr_set_samples(args->hdr,NULL,0); +++ if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) +++ error_errno("[%s] Couldn't build sample filter", __func__); ++ } ++ args->phase = PHASE_DROP_GT; ++ } ++@@ -1362,7 +1378,7 @@ ++ if ( args->output_type==FT_TAB_TEXT ) ++ { ++ args->out = args->output_fname ? fopen(args->output_fname,"w") : bcftools_stdout; ++- if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); +++ if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno)); ++ ++ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); ++ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); ++@@ -1382,14 +1398,16 @@ ++ else ++ { ++ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); ++- bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); ++- bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? 
"Local" : "Haplotype-aware"); +++ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno)); +++ if ( args->n_threads > 0) +++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); +++ if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); +++ bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); ++ if ( args->hdr_nsmpl ) ++ bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); ++- bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); ++ } ++- if ( !args->quiet ) fprintf(bcftools_stderr,"Calling...\n"); +++ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n"); ++ } ++ ++ void destroy_data(args_t *args) ++@@ -1489,6 +1507,7 @@ ++ splice->vcf.pos = rec->pos; ++ splice->vcf.rlen = rec->rlen; ++ splice->vcf.ref = rec->d.allele[0]; +++ splice->csq = 0; ++ } ++ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) ++ { ++@@ -1596,7 +1615,7 @@ ++ #endif ++ } ++ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); ++-static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) +++static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) ++ { ++ while ( regitr_overlap(itr) ) ++ { ++@@ -1606,7 +1625,7 @@ ++ csq_t csq; ++ memset(&csq, 0, sizeof(csq_t)); ++ csq.pos = rec->pos; ++- csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; +++ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | type; ++ csq.type.biotype = tr->type; ++ csq.type.strand = tr->strand; ++ csq.type.trid = tr->id; ++@@ -1660,7 +1679,7 @@ ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr ++ { ++- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ if ( ret!=0 ) ++ { ++ regitr_destroy(itr); ++@@ -1698,7 +1717,7 @@ ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr ++ { ++- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ if ( ret!=0 ) ++ { ++ regitr_destroy(itr); ++@@ -1765,14 +1784,105 @@ ++ return SPLICE_INSIDE; ++ } ++ +++int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) +++{ +++ static int small_ref_padding_warned = 0; +++ tscript_t *tr = splice->tr; +++ +++ // We know the VCF record overlaps the exon, but does it overlap the start codon? +++ if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; +++ if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0; +++ +++#if XDBG +++ fprintf(bcftools_stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev"); +++ fprintf(bcftools_stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt); +++#endif +++ +++ // is there enough ref sequence for the extension? 
All coordinates are 0-based +++ int ref_len = strlen(splice->vcf.ref); +++ int alt_len = strlen(splice->vcf.alt); +++ assert( ref_len > alt_len ); +++ int ndel = ref_len - alt_len; +++ +++ if ( tr->strand==STRAND_REV ) +++ { +++ int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1; // end pos of the VCF REF allele +++ int32_t tr_ref_end = splice->tr->end + N_REF_PAD; // the end pos of accessible cached ref seq +++ if ( vcf_ref_end + ndel > tr_ref_end ) +++ { +++ if ( !small_ref_padding_warned ) +++ { +++ fprintf(bcftools_stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); +++ small_ref_padding_warned = 1; +++ } +++ return 0; +++ } +++ +++ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele +++ char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted +++#if XDBG +++ fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); +++#endif +++ int i = 0; +++ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; +++ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced +++ } +++ else +++ { +++ // STRAND_FWD +++ int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion +++ if ( vcf_block_beg < 0 ) return 0; +++ +++#if XDBG +++ fprintf(bcftools_stderr,"vcf_block_beg: %d\n",vcf_block_beg+1); +++#endif +++ +++ if ( N_REF_PAD + vcf_block_beg < ex_beg ) +++ { +++ if ( !small_ref_padding_warned ) +++ { +++ fprintf(bcftools_stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. 
(Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); +++ small_ref_padding_warned = 1; +++ } +++ return 0; +++ } +++ +++ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele +++ char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block +++#if XDBG +++ fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); +++#endif +++ +++ int i = 0; +++ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; +++ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced +++ } +++ +++ return 1; +++} +++ ++ static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) ++ { +++ if ( splice->check_start ) +++ { +++ // check for synonymous start +++ // test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt +++ // test/csq/ENST00000368801.2/start-lost.txt +++ // test/csq/ENST00000318249.2/synonymous-start-lost.txt +++ int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end); +++ if ( is_synonymous ) +++ { +++ splice->csq |= CSQ_START_RETAINED; +++ return SPLICE_OVERLAP; +++ } +++ } +++ ++ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG ++ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base ++ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base ++ ++ #if XDBG ++-fprintf(bcftools_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +++fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); ++ #endif ++ ++ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 ++@@ -1785,7 +1895,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1841,7 +1951,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1876,7 +1986,6 @@ ++ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); ++ return SPLICE_OUTSIDE; ++ } ++- ++ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 ++ { ++ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; ++@@ -1931,7 +2040,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -1961,7 +2070,7 @@ ++ regitr_t *itr = regitr_init(NULL); ++ const char *chr = 
bcf_seqname(args->hdr,splice->vcf.rec); ++ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr ++- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); +++ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); ++ regitr_destroy(itr); ++ } ++ if ( !csq ) ++@@ -2010,7 +2119,6 @@ ++ } ++ static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) ++ { ++- splice->csq = 0; ++ splice->vcf.alen = strlen(splice->vcf.alt); ++ ++ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; ++@@ -2040,6 +2148,7 @@ ++ return 0; ++ } ++ +++ ++ // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) ++ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) ++ { ++@@ -2072,7 +2181,7 @@ ++ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; ++ ++ #if XDBG ++-fprintf(bcftools_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); +++fprintf(bcftools_stderr,"\nhap_init: %d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); ++ #endif ++ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); ++ #if XDBG ++@@ -2080,7 +2189,7 @@ ++ #endif ++ ++ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA ++- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq +++ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq ++ { ++ free(splice.kref.s); ++ free(splice.kalt.s); ++@@ -2138,6 +2247,8 @@ ++ if ( len < 0 ) // overlapping variants ++ { ++ free(str.s); +++ free(splice.kref.s); +++ free(splice.kalt.s); ++ return 1; ++ } ++ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); ++@@ -2175,6 +2286,7 @@ ++ if ( !child->csq ) 
child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf ++ } ++ +++ ++ free(splice.kref.s); ++ free(splice.kalt.s); ++ return 0; ++@@ -2208,7 +2320,7 @@ ++ void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) ++ { ++ #if XDBG ++-fprintf(bcftools_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); +++fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); ++ #endif ++ char tmp[3], *codon, *end; ++ int i, len, npad; ++@@ -2308,7 +2420,7 @@ ++ #if DBG>1 ++ fprintf(bcftools_stderr," npad: %d\n",npad); ++ #endif ++-if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); +++ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad); ++ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand ++ ++ if ( npad==2 ) ++@@ -2329,8 +2441,8 @@ ++ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); ++ #if DBG>1 ++ fprintf(bcftools_stderr,"\t i=%d\n", i); ++- if(i==1)fprintf(bcftools_stderr,"[0] %c\n",tmp[2]); ++- if(i==0)fprintf(bcftools_stderr,"[0] %c%c\n",tmp[1],tmp[2]); +++ if(i==1)fprintf(bcftools_stderr,"[0] %c\n",tmp[2]); +++ if(i==0)fprintf(bcftools_stderr,"[0] %c%c\n",tmp[1],tmp[2]); ++ #endif ++ if ( i==-1 ) ++ { ++@@ -2571,12 +2683,25 @@ ++ kputs(csq->vstr.s, str); ++ } ++ +++void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) +++{ +++ if ( !args->brief_predictions ) +++ kputs(aa->s, str); +++ else +++ { +++ int len = aa->l; +++ if ( aa->s[len-1]=='*' ) len--; +++ kputc(aa->s[0], str); +++ kputs("..", str); +++ kputw(beg+len, str); +++ } +++} +++ ++ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) ++ { ++ 
int i; ++ tscript_t *tr = hap->tr; ++ int ref_node = tr->strand==STRAND_FWD ? ibeg : iend; ++- ++ int icsq = node->ncsq_list++; ++ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); ++ csq_t *csq = &node->csq_list[icsq]; ++@@ -2680,12 +2805,12 @@ ++ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; ++ kputc_('|', &str); ++ kputw(aa_rbeg, &str); ++- kputs(hap->tref.s, &str); +++ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); ++ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) ++ { ++ kputc_('>', &str); ++ kputw(aa_sbeg, &str); ++- kputs(hap->tseq.s, &str); +++ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); ++ } ++ kputc_('|', &str); ++ ++@@ -2963,18 +3088,15 @@ ++ int icsq = 2*csq->idx + ihap; ++ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT ++ { ++- int print_warning = 1; ++- if ( args->quiet ) +++ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) ++ { ++- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; +++ fprintf(bcftools_stderr, +++ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", +++ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); +++ if ( !args->ncsq_small_warned ) +++ fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); ++ args->ncsq_small_warned = 1; ++ } ++- if ( print_warning ) ++- { ++- fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", ++- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); ++- if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); ++- } ++ break; ++ } ++ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; ++@@ -2986,12 +3108,10 @@ ++ { ++ int i,j; ++ tr_heap_t *heap = args->active_tr; ++- ++ while ( heap->ndat && heap->dat[0]->end<=pos ) ++ { ++ tscript_t *tr = heap->dat[0]; ++ khp_delete(trhp, heap); ++- ++ args->hap->tr = tr; ++ if ( tr->root && tr->root->nchild ) // normal, non-localized calling ++ { ++@@ -3030,7 +3150,7 @@ ++ ++ #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } ++ ++-void vbuf_push(args_t *args, bcf1_t **rec_ptr) +++vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) ++ { ++ int i; ++ ++@@ -3046,6 +3166,7 @@ ++ i = rbuf_append(&args->vcf_rbuf); ++ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); ++ args->vcf_buf[i]->n = 0; +++ args->vcf_buf[i]->keep_until = 0; ++ } ++ vbuf_t *vbuf = args->vcf_buf[i]; ++ vbuf->n++; ++@@ -3065,16 +3186,29 @@ ++ int ret; ++ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); ++ kh_val(args->pos2vbuf,k) = vbuf; +++ +++ return vbuf; ++ } ++ ++-void vbuf_flush(args_t *args) +++void vbuf_flush(args_t *args, uint32_t pos) ++ { ++- if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone ++- ++ int i,j; ++- while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) +++ while ( args->vcf_rbuf.n ) ++ { ++- vbuf_t *vbuf = args->vcf_buf[i]; +++ vbuf_t *vbuf; +++ if ( !args->local_csq && args->active_tr->ndat ) +++ { +++ // check if the first active transcript starts beyond the first buffered VCF record, +++ // cannot output buffered 
VCF lines (args.vbuf) until the active transcripts are gone +++ vbuf = args->vcf_buf[ args->vcf_rbuf.f ]; +++ if ( vbuf->keep_until > pos ) break; +++ assert( vbuf->n ); +++ } +++ +++ i = rbuf_shift(&args->vcf_rbuf); +++ assert( i>=0 ); +++ vbuf = args->vcf_buf[i]; +++ int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1; ++ for (i=0; in; i++) ++ { ++ vrec_t *vrec = vbuf->vrec[i]; ++@@ -3085,7 +3219,10 @@ ++ } ++ if ( !vrec->nvcsq ) ++ { ++- bcf_write(args->out_fh, args->hdr, vrec->line); +++ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); +++ int save_pos = vrec->line->pos; +++ bcf_empty(vrec->line); +++ vrec->line->pos = save_pos; // this is necessary for compound variants ++ continue; ++ } ++ ++@@ -3100,19 +3237,24 @@ ++ if ( args->hdr_nsmpl ) ++ { ++ if ( vrec->nfmt < args->nfmt_bcsq ) ++- for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); +++ for (j=1; jhdr_nsmpl; j++) +++ memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); ++ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); ++ } ++ vrec->nvcsq = 0; ++- bcf_write(args->out_fh, args->hdr, vrec->line); +++ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); +++ int save_pos = vrec->line->pos; +++ bcf_empty(vrec->line); +++ vrec->line->pos = save_pos; ++ } ++- if ( vbuf->n ) +++ if ( pos!=-1 ) ++ { ++- khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); +++ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos); ++ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); ++ } ++ vbuf->n = 0; ++ } +++ if ( args->active_tr->ndat ) return; ++ ++ for (i=0; inrm_tr; i++) ++ { ++@@ -3139,10 
+3281,12 @@ ++ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); ++ if ( pad_beg + pad_end != 2*N_REF_PAD ) ++ { ++- char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); +++ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); ++ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; ++ memcpy(ref+i, tr->ref, len); +++ len += i; ++ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; +++ ref[i+len] = 0; ++ free(tr->ref); ++ tr->ref = ref; ++ } ++@@ -3150,15 +3294,19 @@ ++ ++ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) ++ { ++- char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); ++- char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); ++- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); ++- while ( *ref && *vcf ) ++- { ++- if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) ++- error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); ++- ref++; ++- vcf++; +++ int vbeg = 0; +++ int rbeg = rec->pos - tr->beg + N_REF_PAD; +++ if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } +++ char *ref = tr->ref + rbeg; +++ char *vcf = rec->d.allele[0] + vbeg; +++ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); +++ int i = 0; +++ while ( ref[i] && vcf[i] ) +++ { +++ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) +++ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", +++ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); +++ i++; ++ } ++ } ++ ++@@ -3197,6 +3345,7 @@ ++ ++ for (i=1; in_allele; i++) ++ { +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; ++ ++ csq_t csq; ++@@ -3296,12 +3445,12 @@ ++ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; ++ kputc_('|', &str); ++ kputw(aa_rbeg, &str); ++- kputs(tref->s, &str); +++ kprint_aa_prediction(args,aa_rbeg,tref,&str); ++ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) ++ { ++ kputc_('>', &str); ++ kputw(aa_sbeg, &str); ++- kputs(tseq->s, &str); +++ kprint_aa_prediction(args,aa_sbeg,tseq,&str); ++ } ++ kputc_('|', &str); ++ kputw(rec->pos+1, &str); ++@@ -3332,8 +3481,10 @@ ++ return ret; ++ } ++ ++-int test_cds(args_t *args, bcf1_t *rec) +++int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) ++ { +++ static int overlaps_warned = 0, multiploid_warned = 0; +++ ++ int i, ret = 0, hap_ret; ++ const char *chr = bcf_seqname(args->hdr,rec); ++ // note that the off-by-one extension of rlen is deliberate to account for insertions ++@@ -3343,6 +3494,7 @@ ++ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); ++ tscript_t *tr = cds->tr; ++ if ( !GF_is_coding(tr->type) ) continue; +++ if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; ++ ret = 1; ++ if ( !tr->root ) ++ { ++@@ -3372,10 +3524,17 @@ ++ // overlapping or intron variant, cannot apply ++ if ( hap_ret==1 ) ++ { ++- if ( !args->quiet ) ++- fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( 
!overlaps_warned ) +++ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ overlaps_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); ++ } ++ else ret = 1; // prevent reporting as intron in test_tscript ++ hap_destroy(child); ++@@ -3411,10 +3570,17 @@ ++ ngts /= bcf_hdr_nsamples(args->hdr); ++ if ( ngts!=1 && ngts!=2 ) ++ { ++- if ( !args->quiet ) ++- fprintf(bcftools_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ if ( !multiploid_warned ) +++ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ multiploid_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); +++ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); ++ continue; ++ } ++ for (ismpl=0; ismplsmpl->n; ismpl++) ++@@ -3431,7 +3597,7 @@ ++ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) ++ { ++ if ( args->phase==PHASE_REQUIRE ) ++- error("Unphased heterozygous genotype at %s:%d, sample %s. 
See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); +++ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); ++ if ( args->phase==PHASE_SKIP ) ++ continue; ++ if ( args->phase==PHASE_NON_REF ) ++@@ -3470,12 +3636,18 @@ ++ // overlapping or intron variant, cannot apply ++ if ( hap_ret==1 ) ++ { ++- if ( !args->quiet ) ++- fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", ++- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", +++ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ if ( !overlaps_warned ) +++ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); +++ overlaps_warned = 1; +++ } ++ if ( args->out ) ++- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", ++- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); +++ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", +++ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); ++ } ++ hap_destroy(child); ++ continue; ++@@ -3561,19 +3733,15 @@ ++ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT ++ { ++ int ismpl = args->smpl->idx[i]; ++- int print_warning = 1; ++- if ( args->quiet ) +++ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) ++ { ++- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; +++ 
fprintf(bcftools_stderr, +++ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", +++ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); +++ if ( !args->ncsq_small_warned ) +++ fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); ++ args->ncsq_small_warned = 1; ++ } ++- if ( print_warning ) ++- { ++- fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", ++- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); ++- if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); ++- } ++- break; ++ } ++ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; ++ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); ++@@ -3596,8 +3764,9 @@ ++ tscript_t *tr = splice.tr = utr->tr; ++ for (i=1; in_allele; i++) ++ { ++- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); ++ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; ++ csq_t csq; ++@@ -3639,6 +3808,7 @@ ++ { ++ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ splice_csq(args, &splice, exon->beg, exon->end); ++ if ( splice.csq ) ret = 1; ++ } ++@@ -3661,8 +3831,9 @@ ++ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); ++ for (i=1; in_allele; i++) ++ { ++- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } +++ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } ++ splice.vcf.alt = rec->d.allele[i]; +++ splice.csq = 0; ++ 
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); ++ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF ++ csq_t csq; ++@@ -3682,22 +3853,151 @@ ++ return ret; ++ } ++ ++-void process(args_t *args, bcf1_t **rec_ptr) +++void test_symbolic_alt(args_t *args, bcf1_t *rec) +++{ +++ static int warned = 0; +++ if ( args->verbosity && (!warned && args->verbosity > 0) ) +++ { +++ fprintf(bcftools_stderr,"Warning: The support for symbolic ALT insertions is experimental.\n"); +++ warned = 1; +++ } +++ +++ const char *chr = bcf_seqname(args->hdr,rec); +++ +++ // only insertions atm +++ int beg = rec->pos + 1; +++ int end = beg; +++ int csq_class = CSQ_ELONGATION; +++ +++ int hit = 0; +++ if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) +++ { +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); +++ tscript_t *tr = cds->tr; +++ csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ hit = 1; +++ } +++ } +++ if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) +++ { +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); +++ tscript_t *tr = utr->tr; +++ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ hit = 1; +++ } +++ } +++ if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) +++ { +++ splice_t splice; +++ splice_init(&splice, rec); +++ splice.check_acceptor = splice.check_donor = 1; +++ +++ while ( regitr_overlap(args->itr) ) +++ { +++ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); +++ splice.tr = exon->tr; +++ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites +++ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; +++ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1; +++ splice.vcf.alt = rec->d.allele[1]; +++ splice.csq = csq_class; +++ splice_csq(args, &splice, exon->beg, exon->end); +++ if ( splice.csq ) hit = 1; +++ } +++ } +++ if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) +++ { +++ splice_t splice; +++ splice_init(&splice, rec); +++ +++ while ( regitr_overlap(args->itr) ) +++ { +++ csq_t csq; +++ memset(&csq, 0, sizeof(csq_t)); +++ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); +++ splice.vcf.alt = rec->d.allele[1]; +++ splice.csq = csq_class; +++ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); +++ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF +++ csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_INTRON : CSQ_NON_CODING) | csq_class; +++ csq.pos = rec->pos; +++ csq.type.biotype = tr->type; +++ csq.type.strand = tr->strand; +++ csq.type.trid = tr->id; +++ csq.type.gene = tr->gene->name; +++ csq_stage(args, &csq, rec); +++ } +++ } +++} +++ +++void debug_print_buffers(args_t *args, int pos) +++{ +++ int i,j; +++ fprintf(bcftools_stderr,"debug_print_buffers at %d\n", pos); +++ fprintf(bcftools_stderr,"vbufs:\n"); +++ for (i=0; ivcf_rbuf.n; i++) +++ { +++ int k = rbuf_kth(&args->vcf_rbuf, i); +++ vbuf_t *vbuf = args->vcf_buf[k]; +++ +++ fprintf(bcftools_stderr,"\tvbuf %d:\n", i); +++ for (j=0; jn; j++) +++ { +++ vrec_t *vrec = vbuf->vrec[j]; +++ fprintf(bcftools_stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq); +++ } +++ } +++ fprintf(bcftools_stderr,"pos2vbuf:"); +++ khint_t k; +++ for (k = 0; k < kh_end(args->pos2vbuf); ++k) +++ if (kh_exist(args->pos2vbuf, k)) fprintf(bcftools_stderr," %d",1+(int)kh_key(args->pos2vbuf, k)); +++ fprintf(bcftools_stderr,"\n"); +++ fprintf(bcftools_stderr,"active_tr: %d\n", args->active_tr->ndat); +++} +++ +++static void process(args_t *args, bcf1_t **rec_ptr) ++ { ++ if ( !rec_ptr ) ++ { ++ hap_flush(args, REGIDX_MAX); ++- vbuf_flush(args); +++ vbuf_flush(args, REGIDX_MAX); ++ return; ++ } ++ ++ bcf1_t *rec = *rec_ptr; +++ static int32_t prev_rid = -1, prev_pos = -1; +++ if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } +++ if ( prev_pos > rec->pos ) +++ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ ++ int call_csq = 1; ++- if ( !rec->n_allele ) call_csq = 0; // no alternate allele ++- else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele ++- else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc ++- else if ( 
args->filter ) +++ if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele +++ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele +++ else if ( rec->d.allele[1][0]=='<' ) +++ { +++ if ( strncmp("d.allele[1], 4) ) call_csq = 0; // only is supported at the moment +++ } +++ if ( call_csq && args->filter ) ++ { ++ call_csq = filter_test(args->filter, rec, NULL); ++ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1; ++@@ -3706,25 +4006,34 @@ ++ { ++ if ( !args->out_fh ) return; // not a VCF output ++ vbuf_push(args, rec_ptr); ++- vbuf_flush(args); +++ hap_flush(args, rec->pos-1); +++ vbuf_flush(args, rec->pos-1); ++ return; ++ } ++ ++ if ( args->rid != rec->rid ) ++ { ++ hap_flush(args, REGIDX_MAX); ++- vbuf_flush(args); +++ vbuf_flush(args, REGIDX_MAX); ++ } ++ args->rid = rec->rid; ++- vbuf_push(args, rec_ptr); +++ vbuf_t *vbuf = vbuf_push(args, rec_ptr); ++ ++- int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); ++- hit += test_utr(args, rec); ++- hit += test_splice(args, rec); ++- if ( !hit ) test_tscript(args, rec); +++ if ( rec->d.allele[1][0]!='<' ) +++ { +++ int hit = args->local_csq ? 
test_cds_local(args, rec) : test_cds(args, rec, vbuf); +++ hit += test_utr(args, rec); +++ hit += test_splice(args, rec); +++ if ( !hit ) test_tscript(args, rec); +++ } +++ else +++ test_symbolic_alt(args, rec); ++ ++- hap_flush(args, rec->pos-1); ++- vbuf_flush(args); +++ if ( rec->pos > 0 ) +++ { +++ hap_flush(args, rec->pos-1); +++ vbuf_flush(args, rec->pos-1); +++ } ++ ++ return; ++ } ++@@ -3741,6 +4050,7 @@ ++ " -g, --gff-annot gff3 annotation file\n" ++ "\n" ++ "CSQ options:\n" +++ " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" ++ " -c, --custom-tag use this tag instead of the default BCSQ\n" ++ " -l, --local-csq localized predictions, consider only one VCF record at a time\n" ++ " -n, --ncsq maximum number of consequences to consider per site [16]\n" ++@@ -3754,16 +4064,18 @@ ++ " -e, --exclude exclude sites for which the expression is true\n" ++ " --force run even if some sanity checks fail\n" ++ " -i, --include select sites for which the expression is true\n" +++ " --no-version do not append version and command line to the header\n" ++ " -o, --output write output to a file [standard output]\n" ++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" ++ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" ++- " -q, --quiet suppress warning messages. 
Can be given two times for even less messages\n" ++ " -r, --regions restrict to comma-separated list of regions\n" ++ " -R, --regions-file restrict to regions listed in a file\n" ++ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" ++ " -S, --samples-file samples to include\n" ++ " -t, --targets similar to -r but streams rather than index-jumps\n" ++ " -T, --targets-file similar to -R but streams rather than index-jumps\n" +++ " --threads use multithreading with worker threads [0]\n" +++ " -v, --verbose verbosity level 0-2 [1]\n" ++ "\n" ++ "Example:\n" ++ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" ++@@ -3781,12 +4093,16 @@ ++ args->output_type = FT_VCF; ++ args->bcsq_tag = "BCSQ"; ++ args->ncsq_max = 2*16; +++ args->verbosity = 1; +++ args->record_cmd_line = 1; ++ ++ static struct option loptions[] = ++ { ++ {"force",0,0,1}, +++ {"threads",required_argument,NULL,2}, ++ {"help",0,0,'h'}, ++ {"ncsq",1,0,'n'}, +++ {"brief-predictions",0,0,'b'}, ++ {"custom-tag",1,0,'c'}, ++ {"local-csq",0,0,'l'}, ++ {"gff-annot",1,0,'g'}, ++@@ -3797,24 +4113,36 @@ ++ {"output-type",1,NULL,'O'}, ++ {"phase",1,0,'p'}, ++ {"quiet",0,0,'q'}, +++ {"verbose",1,0,'v'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++ {"samples",1,0,'s'}, ++ {"samples-file",1,0,'S'}, ++ {"targets",1,0,'t'}, ++ {"targets-file",1,0,'T'}, +++ {"no-version",no_argument,NULL,3}, ++ {0,0,0,0} ++ }; ++ int c, targets_is_file = 0, regions_is_file = 0; ++- char *targets_list = NULL, *regions_list = NULL; ++- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) +++ char *targets_list = NULL, *regions_list = NULL, *tmp; +++ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 1 : args->force = 1; break; +++ case 2 : +++ args->n_threads = strtol(optarg,&tmp,10); +++ if ( *tmp ) error("Could not parse argument: --threads 
%s\n", optarg); +++ break; +++ case 3 : args->record_cmd_line = 0; break; +++ case 'b': args->brief_predictions = 1; break; ++ case 'l': args->local_csq = 1; break; ++ case 'c': args->bcsq_tag = optarg; break; ++- case 'q': args->quiet++; break; +++ case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; +++ case 'v': +++ args->verbosity = atoi(optarg); +++ if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); +++ break; ++ case 'p': ++ switch (optarg[0]) ++ { ++@@ -3871,8 +4199,9 @@ ++ error("Failed to read the targets: %s\n", targets_list); ++ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) ++ error("Failed to read the regions: %s\n", regions_list); +++ if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); ++ if ( !bcf_sr_add_reader(args->sr, fname) ) ++- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); +++ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ ++ init_data(args); ++@@ -3885,7 +4214,6 @@ ++ destroy_data(args); ++ bcf_sr_destroy(args->sr); ++ free(args); ++- ++ return 0; ++ } ++ ++--- python-pysam.orig/bcftools/filter.c +++++ python-pysam/bcftools/filter.c ++@@ -28,7 +28,10 @@ ++ #include ++ #include ++ #include +++#include +++#ifndef _WIN32 ++ #include +++#endif ++ #include ++ #include ++ #include ++@@ -53,8 +56,8 @@ ++ # define __FUNCTION__ __func__ ++ #endif ++ ++-uint64_t bcf_double_missing = 0x7ff0000000000001; ++-uint64_t bcf_double_vector_end = 0x7ff0000000000002; +++static const uint64_t bcf_double_missing = 0x7ff0000000000001; +++static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; ++ static inline void bcf_double_set(double *ptr, uint64_t value) ++ { ++ union { uint64_t i; double d; } u; ++@@ -71,6 
+74,7 @@ ++ #define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) ++ #define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) ++ #define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) +++#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) ++ ++ ++ typedef struct _token_t ++@@ -82,7 +86,7 @@ ++ char *tag; // for debugging and printout only, VCF tag name ++ double threshold; // filtering threshold ++ int is_constant; // the threshold is set ++- int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types +++ int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types ++ int idx; // 0-based index to VCF vectors, ++ // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) ++ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited ++@@ -151,11 +155,14 @@ ++ #define TOK_CNT 26 ++ #define TOK_PERLSUB 27 ++ #define TOK_BINOM 28 ++- ++-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 ++-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p ++-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8}; ++-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp" +++#define TOK_PHRED 29 +++#define TOK_MEDIAN 30 +++#define TOK_STDEV 31 +++ +++// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +++// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s +++static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +++#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" ++ ++ // Return negative values if it is a function with variable number of arguments ++ static int filters_next_token(char **str, int *len) ++@@ -179,12 +186,16 @@ ++ ++ if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } ++ if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } +++ if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } +++ if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; } ++ if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } +++ if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } ++ if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } ++ if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } ++ if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } ++ if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } ++ if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } +++ if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } ++ if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility ++ if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility ++ if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility ++@@ -195,6 +206,7 @@ ++ if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; } ++ if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } ++ if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } +++ if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN ++ ++ if ( tmp[0]=='@' ) // file name ++ { ++@@ -280,28 +292,30 @@ 
++ } ++ ++ ++-/* +++/* ++ Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. ++ ++- Based on jkb's staden code with some adjustements. +++ Based on jkb's staden code with some adjustments. ++ https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 ++ */ ++ char *expand_path(char *path) ++ { ++-#ifdef _WIN32 ++- return strdup(path); // windows expansion: todo ++-#endif ++- ++ kstring_t str = {0,0,0}; ++ ++ if ( path[0] == '~' ) ++ { ++ if ( !path[1] || path[1] == '/' ) ++ { +++#ifdef _WIN32 +++ kputs(getenv("HOMEDRIVE"), &str); +++ kputs(getenv("HOMEPATH"), &str); +++#else ++ // ~ or ~/path ++ kputs(getenv("HOME"), &str); ++ if ( path[1] ) kputs(path+1, &str); +++#endif ++ } +++#ifndef _WIN32 ++ else ++ { ++ // user name: ~pd3/path ++@@ -315,13 +329,18 @@ ++ else kputs(pwentry->pw_dir, &str); ++ kputs(end, &str); ++ } ++- return str.s; +++#endif +++ return ks_release(&str); ++ } ++ if ( path[0] == '$' ) ++ { ++ char *var = getenv(path+1); ++- if ( var ) path = var; +++ if ( var ) { +++ kputs(var, &str); +++ return ks_release(&str); +++ } ++ } +++ ++ return strdup(path); ++ } ++ ++@@ -444,6 +463,8 @@ ++ return; ++ } ++ +++ if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); +++ ++ if ( rtok->tok_type==TOK_EQ ) ++ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 
0 : 1; ++ else ++@@ -499,6 +520,14 @@ ++ return -1; // this shouldn't happen ++ } ++ +++static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok) +++{ +++ tok->str_value.l = 0; +++ kputs(bcf_seqname(flt->hdr,line), &tok->str_value); +++ tok->nvalues = tok->str_value.l; +++ tok->is_str = 1; +++} +++ ++ static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ tok->values[0] = line->pos+1; ++@@ -640,7 +669,7 @@ ++ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int nvals; ++ if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) ++@@ -659,8 +688,10 @@ ++ { ++ if ( !tok->usmpl[i] ) continue; ++ int32_t *ptr = flt->tmpi + i*nsrc1; ++- if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) +++ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing ) ++ bcf_double_set_missing(tok->values[i]); +++ else if ( ptr[tok->idx]==bcf_int32_vector_end ) +++ bcf_double_set_vector_end(tok->values[i]); ++ else ++ tok->values[i] = ptr[tok->idx]; ++ } ++@@ -677,24 +708,31 @@ ++ for (k=0; knidxs && !tok->idxs[k] ) continue; ++- if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) +++ if ( src[k]==bcf_int32_missing ) ++ bcf_double_set_missing(dst[j]); +++ else if ( src[k]==bcf_int32_vector_end ) +++ bcf_double_set_vector_end(dst[j]); ++ else ++ dst[j] = src[k]; ++ j++; ++ } ++- while (j < tok->nval1) +++ if ( j==0 ) ++ { ++ bcf_double_set_missing(dst[j]); ++ j++; ++ } +++ while (j < tok->nval1) +++ { +++ bcf_double_set_vector_end(dst[j]); +++ j++; +++ } ++ } ++ } ++ } 
++ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int nvals; ++ if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) ++@@ -713,8 +751,10 @@ ++ { ++ if ( !tok->usmpl[i] ) continue; ++ float *ptr = flt->tmpf + i*nsrc1; ++- if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) +++ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) ) ++ bcf_double_set_missing(tok->values[i]); +++ else if ( bcf_float_is_vector_end(ptr[tok->idx]) ) +++ bcf_double_set_vector_end(tok->values[i]); ++ else ++ tok->values[i] = ptr[tok->idx]; ++ } ++@@ -731,24 +771,31 @@ ++ for (k=0; knidxs && !tok->idxs[k] ) continue; ++- if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) +++ if ( bcf_float_is_missing(src[k]) ) ++ bcf_double_set_missing(dst[j]); +++ else if ( bcf_float_is_vector_end(src[k]) ) +++ bcf_double_set_vector_end(dst[j]); ++ else ++ dst[j] = src[k]; ++ j++; ++ } ++- while (j < tok->nval1) +++ if ( j==0 ) ++ { ++ bcf_double_set_missing(dst[j]); ++ j++; ++ } +++ while (j < tok->nval1) +++ { +++ bcf_double_set_vector_end(dst[j]); +++ j++; +++ } ++ } ++ } ++ } ++ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int i, ndim = tok->str_value.m; ++ int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); ++@@ -868,7 +915,7 @@ ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; +++ default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break; ++ } ++ #undef BRANCH_INT ++ assert( tok->nsamples == nsmpl ); ++@@ -916,6 +963,19 @@ ++ tok->nvalues = tok->str_value.l; ++ tok->nval1 = blen; ++ } +++static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok) +++{ +++ tok->nvalues = line->n_allele - 1; +++ hts_expand(double,tok->nvalues,tok->mvalues,tok->values); +++ +++ int i, rlen = strlen(line->d.allele[0]); +++ for (i=1; in_allele; i++) +++ { +++ int alen = strlen(line->d.allele[i]); +++ if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]); +++ else tok->values[i-1] = alen - rlen; +++ } +++} ++ static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ tok->str_value.l = 0; ++@@ -1014,10 +1074,16 @@ ++ if ( rtok->pass_samples[i] ) npass++; ++ } ++ ++- assert( rtok->values ); ++- rtok->nvalues = 1; ++- rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); ++- rtok->nsamples = 0; +++ hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); +++ double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 
1.0*npass/line->n_sample : 0); +++ rtok->nval1 = 1; +++ rtok->nvalues = rtok->nsamples; +++ +++ // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats +++ // consider only the passing site AND samples. The values for failed samples is set to -1 so +++ // that it can never conflict with valid expressions. +++ for (i=0; insamples; i++) +++ rtok->values[i] = rtok->pass_samples[i] ? value : -1; ++ ++ return 1; ++ } ++@@ -1103,7 +1169,7 @@ ++ int i, has_value = 0; ++ for (i=0; invalues; i++) ++ { ++- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; ++ has_value = 1; ++ if ( val < tok->values[i] ) val = tok->values[i]; ++ } ++@@ -1123,7 +1189,7 @@ ++ int i, has_value = 0; ++ for (i=0; invalues; i++) ++ { ++- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; ++ has_value = 1; ++ if ( val > tok->values[i] ) val = tok->values[i]; ++ } ++@@ -1142,7 +1208,7 @@ ++ double val = 0; ++ int i, n = 0; ++ for (i=0; invalues; i++) ++- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } +++ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } ++ if ( n ) ++ { ++ rtok->values[0] = val / n; ++@@ -1150,6 +1216,61 @@ ++ } ++ return 1; ++ } +++static int compare_doubles(const void *lhs, const void *rhs) +++{ +++ double arg1 = *(const double*) lhs; +++ double arg2 = *(const double*) rhs; +++ if (arg1 < arg2) return -1; +++ if (arg1 > arg2) return 1; +++ return 0; +++} +++static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ rtok->nvalues = 0; +++ if ( !tok->nvalues ) return 1; +++ int i, n = 0; +++ for (i=0; invalues; i++) +++ { +++ if ( 
bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; +++ if ( n < i ) tok->values[n] = tok->values[i]; +++ n++; +++ } +++ if ( !n ) return 1; +++ if ( n==1 ) rtok->values[0] = tok->values[0]; +++ else +++ { +++ qsort(tok->values, n, sizeof(double), compare_doubles); +++ rtok->values[0] = n % 2 ? tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5; +++ } +++ rtok->nvalues = 1; +++ return 1; +++} +++static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ rtok->nvalues = 0; +++ if ( !tok->nvalues ) return 1; +++ int i, n = 0; +++ for (i=0; invalues; i++) +++ { +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; +++ if ( n < i ) tok->values[n] = tok->values[i]; +++ n++; +++ } +++ if ( !n ) return 1; +++ if ( n==1 ) rtok->values[0] = 0; +++ else +++ { +++ double sdev = 0, avg = 0; +++ for (i=0; ivalues[n]; +++ avg /= n; +++ for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); +++ rtok->values[0] = sqrt(sdev/n); +++ } +++ rtok->nvalues = 1; +++ return 1; +++} ++ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++ rtok->nvalues = 0; ++@@ -1158,7 +1279,7 @@ ++ double val = 0; ++ int i, n = 0; ++ for (i=0; invalues; i++) ++- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } +++ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } ++ if ( n ) ++ { ++ rtok->values[0] = val; ++@@ -1177,17 +1298,28 @@ ++ int i; ++ for (i=0; invalues; i++) ++ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); ++- else rtok->values[i] = fabs(tok->values[i]); +++ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); ++ return 1; ++ } ++ static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++ token_t *tok = stack[nstack - 1]; ++- if ( 
!tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); ++- ++ int i, cnt = 0; ++- for (i=0; insamples; i++) ++- if ( tok->pass_samples[i] ) cnt++; +++ if ( !tok->nsamples ) +++ { +++ if ( tok->is_str ) +++ { +++ if ( tok->str_value.l ) cnt = 1; +++ for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; +++ } +++ else +++ cnt = tok->nvalues; +++ } +++ else +++ { +++ for (i=0; insamples; i++) +++ if ( tok->pass_samples[i] ) cnt++; +++ } ++ ++ rtok->nvalues = 1; ++ rtok->values[0] = cnt; ++@@ -1303,10 +1435,10 @@ ++ } ++ int idx1 = bcf_gt_allele(ptr[0]); ++ int idx2 = bcf_gt_allele(ptr[1]); ++- if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); ++- if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); +++ if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); +++ if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); ++ double *vals = tok->values + tok->nval1*i; ++- if ( bcf_double_is_missing(vals[idx1]) || bcf_double_is_missing(vals[idx2]) ) +++ if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) ) ++ { ++ bcf_double_set_missing(rtok->values[i]); ++ continue; ++@@ -1324,13 +1456,13 @@ ++ // the fields given explicitly: binom(AD[:0],AD[:1]) ++ token_t *tok2 = stack[istack+1]; ++ if ( tok->nval1!=1 || tok2->nval1!=1 ) ++- error("Expected one value per binom() argument, found %d and %d at %s:%d\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),line->pos+1); +++ error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, 
bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); ++ for (i=0; insamples; i++) ++ { ++ if ( !rtok->usmpl[i] ) continue; ++ double *ptr1 = tok->values + tok->nval1*i; ++ double *ptr2 = tok2->values + tok2->nval1*i; ++- if ( bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) +++ if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) ++ { ++ bcf_double_set_missing(rtok->values[i]); ++ continue; ++@@ -1370,7 +1502,7 @@ ++ ptr2 = &tok2->values[0]; ++ } ++ } ++- if ( !ptr1 || !ptr2 || bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) +++ if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) ++ bcf_double_set_missing(rtok->values[0]); ++ else ++ { ++@@ -1381,6 +1513,31 @@ ++ } ++ return rtok->nargs; ++ } +++static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n"); +++ +++ rtok->nsamples = tok->nsamples; +++ rtok->nval1 = tok->nval1; +++ memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); +++ assert(tok->usmpl); +++ if ( !rtok->usmpl ) +++ { +++ rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); +++ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); +++ } +++ rtok->nvalues = tok->nvalues; +++ if ( !tok->nvalues ) return 1; +++ +++ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); +++ int i; +++ for (i=0; invalues; i++) +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); +++ else rtok->values[i] = -4.34294481903*log(tok->values[i]); +++ +++ return 1; +++} ++ inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) ++ { ++ token_t *tok = atok->nvalues > btok->nvalues ? 
atok : btok; ++@@ -1414,7 +1571,7 @@ ++ assert( atok->nsamples==btok->nsamples ); \ ++ for (i=0; invalues; i++) \ ++ { \ ++- if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ ++ { \ ++ bcf_double_set_missing(rtok->values[i]); \ ++ continue; \ ++@@ -1428,11 +1585,11 @@ ++ token_t *xtok = atok->nsamples ? atok : btok; \ ++ token_t *ytok = atok->nsamples ? btok : atok; \ ++ assert( ytok->nvalues==1 ); \ ++- if ( !bcf_double_is_missing(ytok->values[0]) ) \ +++ if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ ++ { \ ++ for (i=0; invalues; i++) \ ++ { \ ++- if ( bcf_double_is_missing(xtok->values[i]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ ++ { \ ++ bcf_double_set_missing(rtok->values[i]); \ ++ continue; \ ++@@ -1566,7 +1723,6 @@ ++ { \ ++ token_t *rtok = _rtok; \ ++ int i, j, k; \ ++- assert( !atok->nsamples || !btok->nsamples ); \ ++ tok_init_samples(atok, btok, rtok); \ ++ if ( !atok->nsamples && !btok->nsamples ) \ ++ { \ ++@@ -1576,7 +1732,7 @@ ++ token_t *tok = atok->nvalues ? atok : btok; \ ++ for (j=0; jnvalues; j++) \ ++ { \ ++- if ( bcf_double_is_missing(tok->values[j]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \ ++ { \ ++ if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ ++ } \ ++@@ -1587,15 +1743,19 @@ ++ { \ ++ for (i=0; invalues; i++) \ ++ { \ ++- int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ +++ int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \ ++ for (j=0; jnvalues; j++) \ ++ { \ ++- int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ +++ int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 
1 : 0); \ ++ if ( nmiss ) \ ++ { \ ++ if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ ++ } \ ++- else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ +++ else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ +++ } \ +++ else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ ++ } \ ++ } \ ++ } \ ++@@ -1617,7 +1777,7 @@ ++ { \ ++ int miss = 0; \ ++ for (j=0; jnvalues; j++) \ ++- miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ +++ miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \ ++ if ( missing_logic[++miss] ) \ ++ { \ ++ for (i=0; insamples; i++) \ ++@@ -1631,10 +1791,36 @@ ++ double *ptr = tok->values + i*tok->nval1; \ ++ int miss = 0; \ ++ for (j=0; jnval1; j++) \ ++- miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ +++ miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \ ++ if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ ++ } \ ++ } \ +++ else if ( atok->nsamples && btok->nsamples ) \ +++ { \ +++ if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \ +++ if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \ +++ for (i=0; insamples; i++) \ +++ { \ +++ if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \ +++ double *aptr = atok->values + i*atok->nval1; \ +++ double *bptr = btok->values + i*btok->nval1; \ +++ for (j=0; jnval1; j++) \ +++ { \ +++ int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 
1 : 0; \ +++ if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ +++ nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \ +++ if ( nmiss ) \ +++ { \ +++ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ } \ +++ } \ ++ else \ ++ { \ ++ token_t *xtok = atok->nsamples ? atok : btok; \ ++@@ -1646,16 +1832,20 @@ ++ double *yptr = ytok->values + i*ytok->nval1; \ ++ for (j=0; jnval1; j++) \ ++ { \ ++- int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ +++ int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ ++ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ ++ for (k=0; knvalues; k++) \ ++ { \ ++- int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ +++ int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 
1 : 0); \ ++ if ( nmiss ) \ ++ { \ ++ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ ++ } \ ++- else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ +++ else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ +++ } \ +++ else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ ++ } \ ++ } \ ++ } \ ++@@ -1874,11 +2064,15 @@ ++ int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; ++ ++ int set_samples = 0; ++- char *colon = rindex(tag_idx, ':'); +++ char *colon = strrchr(tag_idx, ':'); ++ if ( tag_idx[0]=='@' ) // file list with sample names ++ { ++ if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori); ++ char *fname = expand_path(tag_idx+1); +++#ifdef _WIN32 +++ if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..' 
+++ colon = strrchr(fname+2, ':'); +++#endif ++ int nsmpl; ++ char **list = hts_readlist(fname, 1, &nsmpl); ++ if ( !list && colon ) ++@@ -1887,7 +2081,7 @@ ++ tok->idxs = idxs2; ++ tok->nidxs = nidxs2; ++ tok->idx = idx2; ++- colon = rindex(fname, ':'); +++ colon = strrchr(fname, ':'); ++ *colon = 0; ++ list = hts_readlist(fname, 1, &nsmpl); ++ } ++@@ -1995,6 +2189,7 @@ ++ } ++ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) ++ { +++ tok->tag_type = -1; ++ tok->tok_type = TOK_VAL; ++ tok->hdr_id = -1; ++ tok->pass_site = -1; ++@@ -2065,6 +2260,7 @@ ++ tok->comparator = filters_cmp_filter; ++ tok->tag = strdup("FILTER"); ++ filter->max_unpack |= BCF_UN_FLT; +++ tok->tag_type = BCF_HL_FLT; ++ return 0; ++ } ++ else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) ++@@ -2073,6 +2269,12 @@ ++ tok->tag = strdup("ID"); ++ return 0; ++ } +++ else if ( !strncasecmp(str,"CHROM",len) ) +++ { +++ tok->setter = &filters_set_chrom; +++ tok->tag = strdup("CHROM"); +++ return 0; +++ } ++ else if ( !strncasecmp(str,"POS",len) ) ++ { ++ tok->setter = &filters_set_pos; ++@@ -2111,12 +2313,14 @@ ++ } ++ else if ( !strncasecmp(str,"N_MISSING",len) ) ++ { +++ filter->max_unpack |= BCF_UN_FMT; ++ tok->setter = &filters_set_nmissing; ++ tok->tag = strdup("N_MISSING"); ++ return 0; ++ } ++ else if ( !strncasecmp(str,"F_MISSING",len) ) ++ { +++ filter->max_unpack |= BCF_UN_FMT; ++ tok->setter = &filters_set_nmissing; ++ tok->tag = strdup("F_MISSING"); ++ return 0; ++@@ -2154,7 +2358,7 @@ ++ for (i=0; insamples; i++) tok->usmpl[i] = 1; ++ } ++ ++- tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; +++ tok->tag_type = is_fmt ? 
BCF_HL_FMT : BCF_HL_INFO; ++ if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; ++ if ( tok->hdr_id>=0 ) ++ { ++@@ -2264,17 +2468,26 @@ ++ free(tmp.s); ++ return 0; ++ } +++ else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") ) +++ { +++ filter->max_unpack |= BCF_UN_STR; +++ tok->setter = &filters_set_ilen; +++ tok->tag = strdup("ILEN"); +++ free(tmp.s); +++ return 0; +++ } ++ ++ // is it a value? Here we parse as integer/float separately and use strtof ++ // rather than strtod, because the more accurate double representation ++ // would invalidate floating point comparisons like QUAL=59.2, obtained via ++- // htslib/vcf parser +++ // htslib/vcf parser. +++ // Update: use strtod() and force floats only in comparisons ++ char *end; ++ tok->threshold = strtol(tmp.s, &end, 10); // integer? ++ if ( end - tmp.s != strlen(tmp.s) ) ++ { ++ errno = 0; ++- tok->threshold = strtof(tmp.s, &end); // float? +++ tok->threshold = strtod(tmp.s, &end); // float? ++ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); ++ } ++ tok->is_constant = 1; ++@@ -2455,7 +2668,7 @@ ++ if ( ret==-1 ) error("Missing quotes in: %s\n", str); ++ ++ // fprintf(stderr,"token=[%c] .. 
[%s] %d\n", TOKEN_STRING[ret], tmp, len); ++- // int i; for (i=0; ihdr_id = -1; ++ tok->pass_site = -1; ++ tok->threshold = -1.0; ++- if ( !strncasecmp(tmp-len,"N_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("N_PASS"); } ++- else if ( !strncasecmp(tmp-len,"F_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("F_PASS"); } +++ if ( !strncasecmp(tmp-len,"N_PASS",6) ) +++ { +++ filter->max_unpack |= BCF_UN_FMT; +++ tok->func = func_npass; +++ tok->tag = strdup("N_PASS"); +++ } +++ else if ( !strncasecmp(tmp-len,"F_PASS",6) ) +++ { +++ filter->max_unpack |= BCF_UN_FMT; +++ tok->func = func_npass; +++ tok->tag = strdup("F_PASS"); +++ } ++ else error("The function \"%s\" is not supported\n", tmp-len); ++ continue; ++ } ++@@ -2607,7 +2830,8 @@ ++ // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be ++ // just before or after the FILTER token and they must be followed with a comparison operator. ++ // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. ++- // Additionally, treat "." as missing value rather than a string in numeric equalities. +++ // Additionally, treat "." as missing value rather than a string in numeric equalities; that +++ // @file is only used with ID; etc. ++ // This code is fragile: improve me. ++ int i; ++ for (i=0; istr); ++ +++ if ( out[i].hash ) +++ { +++ int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1; +++ if ( out[j].comparator!=filters_cmp_id ) +++ error("Error: could not parse the expression. 
Note that the \"@file_name\" syntax can be currently used with ID column only.\n"); +++ } ++ if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC ) ++ out[i].func = vector_logic_or; ++ if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC ) ++@@ -2629,7 +2859,7 @@ ++ int set_missing = 0; ++ if ( out[k].hdr_id>0 ) ++ { ++- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); +++ int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id); ++ if ( type==BCF_HT_INT ) set_missing = 1; ++ else if ( type==BCF_HT_REAL ) set_missing = 1; ++ } ++@@ -2655,7 +2885,7 @@ ++ } ++ if ( out[i].tok_type!=TOK_VAL ) continue; ++ if ( !out[i].tag ) continue; ++- if ( !strcmp(out[i].tag,"TYPE") ) +++ if ( out[i].setter==filters_set_type ) ++ { ++ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); ++ int itok, ival; ++@@ -2669,6 +2899,7 @@ ++ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } +++ else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } ++ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); ++ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; ++@@ -2703,7 +2934,7 @@ ++ else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r ++ continue; ++ } ++- if ( !strcmp(out[i].tag,"FILTER") ) +++ if ( out[i].tag_type==BCF_HL_FLT ) ++ { ++ if ( i+1==nout ) error("Could not parse the expression: 
%s\n", filter->str); ++ int itok = i, ival; ++@@ -2732,13 +2963,17 @@ ++ filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; ++ for (i=0; ifilters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } ++@@ -2893,7 +3130,6 @@ ++ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) ++ else ++ error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); ++- ++ } ++ filter->flt_stack[nstack-2] = &filter->filters[i]; ++ nstack--; ++--- python-pysam.orig/bcftools/filter.c.pysam.c +++++ python-pysam/bcftools/filter.c.pysam.c ++@@ -30,7 +30,10 @@ ++ #include ++ #include ++ #include +++#include +++#ifndef _WIN32 ++ #include +++#endif ++ #include ++ #include ++ #include ++@@ -55,8 +58,8 @@ ++ # define __FUNCTION__ __func__ ++ #endif ++ ++-uint64_t bcf_double_missing = 0x7ff0000000000001; ++-uint64_t bcf_double_vector_end = 0x7ff0000000000002; +++static const uint64_t bcf_double_missing = 0x7ff0000000000001; +++static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; ++ static inline void bcf_double_set(double *ptr, uint64_t value) ++ { ++ union { uint64_t i; double d; } u; ++@@ -73,6 +76,7 @@ ++ #define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) ++ #define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) ++ #define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) +++#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) ++ ++ ++ typedef struct _token_t ++@@ -84,7 +88,7 @@ ++ char *tag; // for debugging and printout only, VCF tag name ++ double threshold; // filtering threshold ++ int is_constant; // the threshold is set ++- int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types +++ int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types ++ int idx; // 0-based index to VCF 
vectors, ++ // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) ++ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited ++@@ -153,11 +157,14 @@ ++ #define TOK_CNT 26 ++ #define TOK_PERLSUB 27 ++ #define TOK_BINOM 28 ++- ++-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 ++-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p ++-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8}; ++-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp" +++#define TOK_PHRED 29 +++#define TOK_MEDIAN 30 +++#define TOK_STDEV 31 +++ +++// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +++// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s +++static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +++#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" ++ ++ // Return negative values if it is a function with variable number of arguments ++ static int filters_next_token(char **str, int *len) ++@@ -181,12 +188,16 @@ ++ ++ if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } ++ if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } +++ if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } +++ if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; } ++ if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } +++ if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } ++ if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } ++ if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } ++ if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } ++ if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } ++ if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } +++ if ( 
!strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } ++ if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility ++ if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility ++ if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility ++@@ -197,6 +208,7 @@ ++ if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; } ++ if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } ++ if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } +++ if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN ++ ++ if ( tmp[0]=='@' ) // file name ++ { ++@@ -282,28 +294,30 @@ ++ } ++ ++ ++-/* +++/* ++ Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. ++ ++- Based on jkb's staden code with some adjustements. +++ Based on jkb's staden code with some adjustments. 
++ https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 ++ */ ++ char *expand_path(char *path) ++ { ++-#ifdef _WIN32 ++- return strdup(path); // windows expansion: todo ++-#endif ++- ++ kstring_t str = {0,0,0}; ++ ++ if ( path[0] == '~' ) ++ { ++ if ( !path[1] || path[1] == '/' ) ++ { +++#ifdef _WIN32 +++ kputs(getenv("HOMEDRIVE"), &str); +++ kputs(getenv("HOMEPATH"), &str); +++#else ++ // ~ or ~/path ++ kputs(getenv("HOME"), &str); ++ if ( path[1] ) kputs(path+1, &str); +++#endif ++ } +++#ifndef _WIN32 ++ else ++ { ++ // user name: ~pd3/path ++@@ -317,13 +331,18 @@ ++ else kputs(pwentry->pw_dir, &str); ++ kputs(end, &str); ++ } ++- return str.s; +++#endif +++ return ks_release(&str); ++ } ++ if ( path[0] == '$' ) ++ { ++ char *var = getenv(path+1); ++- if ( var ) path = var; +++ if ( var ) { +++ kputs(var, &str); +++ return ks_release(&str); +++ } ++ } +++ ++ return strdup(path); ++ } ++ ++@@ -446,6 +465,8 @@ ++ return; ++ } ++ +++ if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); +++ ++ if ( rtok->tok_type==TOK_EQ ) ++ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; ++ else ++@@ -501,6 +522,14 @@ ++ return -1; // this shouldn't happen ++ } ++ +++static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok) +++{ +++ tok->str_value.l = 0; +++ kputs(bcf_seqname(flt->hdr,line), &tok->str_value); +++ tok->nvalues = tok->str_value.l; +++ tok->is_str = 1; +++} +++ ++ static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ tok->values[0] = line->pos+1; ++@@ -642,7 +671,7 @@ ++ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int nvals; ++ if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) ++@@ -661,8 +690,10 @@ ++ { ++ if ( !tok->usmpl[i] ) continue; ++ int32_t *ptr = flt->tmpi + i*nsrc1; ++- if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) +++ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing ) ++ bcf_double_set_missing(tok->values[i]); +++ else if ( ptr[tok->idx]==bcf_int32_vector_end ) +++ bcf_double_set_vector_end(tok->values[i]); ++ else ++ tok->values[i] = ptr[tok->idx]; ++ } ++@@ -679,24 +710,31 @@ ++ for (k=0; knidxs && !tok->idxs[k] ) continue; ++- if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) +++ if ( src[k]==bcf_int32_missing ) ++ bcf_double_set_missing(dst[j]); +++ else if ( src[k]==bcf_int32_vector_end ) +++ bcf_double_set_vector_end(dst[j]); ++ else ++ dst[j] = src[k]; ++ j++; ++ } ++- while (j < tok->nval1) +++ if ( j==0 ) ++ { ++ bcf_double_set_missing(dst[j]); ++ j++; ++ } +++ while (j < tok->nval1) +++ { +++ bcf_double_set_vector_end(dst[j]); +++ j++; +++ } ++ } ++ } ++ } ++ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int nvals; ++ if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) ++@@ -715,8 +753,10 @@ ++ { ++ if ( !tok->usmpl[i] ) continue; ++ float *ptr = flt->tmpf + i*nsrc1; ++- if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) +++ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) ) ++ bcf_double_set_missing(tok->values[i]); +++ else if ( bcf_float_is_vector_end(ptr[tok->idx]) ) +++ bcf_double_set_vector_end(tok->values[i]); ++ else ++ tok->values[i] = ptr[tok->idx]; ++ } ++@@ -733,24 +773,31 @@ ++ for (k=0; knidxs && !tok->idxs[k] ) continue; ++- if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) +++ if ( bcf_float_is_missing(src[k]) ) ++ bcf_double_set_missing(dst[j]); +++ else if ( bcf_float_is_vector_end(src[k]) ) +++ bcf_double_set_vector_end(dst[j]); ++ else ++ dst[j] = src[k]; ++ j++; ++ } ++- while (j < tok->nval1) +++ if ( j==0 ) ++ { ++ bcf_double_set_missing(dst[j]); ++ j++; ++ } +++ while (j < tok->nval1) +++ { +++ bcf_double_set_vector_end(dst[j]); +++ j++; +++ } ++ } ++ } ++ } ++ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ if ( line->n_sample != tok->nsamples ) ++- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); +++ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); ++ ++ int i, ndim = tok->str_value.m; ++ int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); ++@@ -870,7 +917,7 @@ ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; +++ default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break; ++ } ++ #undef BRANCH_INT ++ assert( tok->nsamples == nsmpl ); ++@@ -918,6 +965,19 @@ ++ tok->nvalues = tok->str_value.l; ++ tok->nval1 = blen; ++ } +++static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok) +++{ +++ tok->nvalues = line->n_allele - 1; +++ hts_expand(double,tok->nvalues,tok->mvalues,tok->values); +++ +++ int i, rlen = strlen(line->d.allele[0]); +++ for (i=1; in_allele; i++) +++ { +++ int alen = strlen(line->d.allele[i]); +++ if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]); +++ else tok->values[i-1] = alen - rlen; +++ } +++} ++ static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) ++ { ++ tok->str_value.l = 0; ++@@ -1016,10 +1076,16 @@ ++ if ( rtok->pass_samples[i] ) npass++; ++ } ++ ++- assert( rtok->values ); ++- rtok->nvalues = 1; ++- rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); ++- rtok->nsamples = 0; +++ hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); +++ double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 
1.0*npass/line->n_sample : 0); +++ rtok->nval1 = 1; +++ rtok->nvalues = rtok->nsamples; +++ +++ // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats +++ // consider only the passing site AND samples. The values for failed samples is set to -1 so +++ // that it can never conflict with valid expressions. +++ for (i=0; insamples; i++) +++ rtok->values[i] = rtok->pass_samples[i] ? value : -1; ++ ++ return 1; ++ } ++@@ -1105,7 +1171,7 @@ ++ int i, has_value = 0; ++ for (i=0; invalues; i++) ++ { ++- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; ++ has_value = 1; ++ if ( val < tok->values[i] ) val = tok->values[i]; ++ } ++@@ -1125,7 +1191,7 @@ ++ int i, has_value = 0; ++ for (i=0; invalues; i++) ++ { ++- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; ++ has_value = 1; ++ if ( val > tok->values[i] ) val = tok->values[i]; ++ } ++@@ -1144,7 +1210,7 @@ ++ double val = 0; ++ int i, n = 0; ++ for (i=0; invalues; i++) ++- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } +++ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } ++ if ( n ) ++ { ++ rtok->values[0] = val / n; ++@@ -1152,6 +1218,61 @@ ++ } ++ return 1; ++ } +++static int compare_doubles(const void *lhs, const void *rhs) +++{ +++ double arg1 = *(const double*) lhs; +++ double arg2 = *(const double*) rhs; +++ if (arg1 < arg2) return -1; +++ if (arg1 > arg2) return 1; +++ return 0; +++} +++static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ rtok->nvalues = 0; +++ if ( !tok->nvalues ) return 1; +++ int i, n = 0; +++ for (i=0; invalues; i++) +++ { +++ if ( 
bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; +++ if ( n < i ) tok->values[n] = tok->values[i]; +++ n++; +++ } +++ if ( !n ) return 1; +++ if ( n==1 ) rtok->values[0] = tok->values[0]; +++ else +++ { +++ qsort(tok->values, n, sizeof(double), compare_doubles); +++ rtok->values[0] = n % 2 ? tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5; +++ } +++ rtok->nvalues = 1; +++ return 1; +++} +++static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ rtok->nvalues = 0; +++ if ( !tok->nvalues ) return 1; +++ int i, n = 0; +++ for (i=0; invalues; i++) +++ { +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; +++ if ( n < i ) tok->values[n] = tok->values[i]; +++ n++; +++ } +++ if ( !n ) return 1; +++ if ( n==1 ) rtok->values[0] = 0; +++ else +++ { +++ double sdev = 0, avg = 0; +++ for (i=0; ivalues[n]; +++ avg /= n; +++ for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); +++ rtok->values[0] = sqrt(sdev/n); +++ } +++ rtok->nvalues = 1; +++ return 1; +++} ++ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++ rtok->nvalues = 0; ++@@ -1160,7 +1281,7 @@ ++ double val = 0; ++ int i, n = 0; ++ for (i=0; invalues; i++) ++- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } +++ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } ++ if ( n ) ++ { ++ rtok->values[0] = val; ++@@ -1179,17 +1300,28 @@ ++ int i; ++ for (i=0; invalues; i++) ++ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); ++- else rtok->values[i] = fabs(tok->values[i]); +++ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); ++ return 1; ++ } ++ static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++ token_t *tok = stack[nstack - 1]; ++- if ( 
!tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); ++- ++ int i, cnt = 0; ++- for (i=0; insamples; i++) ++- if ( tok->pass_samples[i] ) cnt++; +++ if ( !tok->nsamples ) +++ { +++ if ( tok->is_str ) +++ { +++ if ( tok->str_value.l ) cnt = 1; +++ for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; +++ } +++ else +++ cnt = tok->nvalues; +++ } +++ else +++ { +++ for (i=0; insamples; i++) +++ if ( tok->pass_samples[i] ) cnt++; +++ } ++ ++ rtok->nvalues = 1; ++ rtok->values[0] = cnt; ++@@ -1305,10 +1437,10 @@ ++ } ++ int idx1 = bcf_gt_allele(ptr[0]); ++ int idx2 = bcf_gt_allele(ptr[1]); ++- if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); ++- if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); +++ if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); +++ if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); ++ double *vals = tok->values + tok->nval1*i; ++- if ( bcf_double_is_missing(vals[idx1]) || bcf_double_is_missing(vals[idx2]) ) +++ if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) ) ++ { ++ bcf_double_set_missing(rtok->values[i]); ++ continue; ++@@ -1326,13 +1458,13 @@ ++ // the fields given explicitly: binom(AD[:0],AD[:1]) ++ token_t *tok2 = stack[istack+1]; ++ if ( tok->nval1!=1 || tok2->nval1!=1 ) ++- error("Expected one value per binom() argument, found %d and %d at %s:%d\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),line->pos+1); +++ error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, 
bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); ++ for (i=0; insamples; i++) ++ { ++ if ( !rtok->usmpl[i] ) continue; ++ double *ptr1 = tok->values + tok->nval1*i; ++ double *ptr2 = tok2->values + tok2->nval1*i; ++- if ( bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) +++ if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) ++ { ++ bcf_double_set_missing(rtok->values[i]); ++ continue; ++@@ -1372,7 +1504,7 @@ ++ ptr2 = &tok2->values[0]; ++ } ++ } ++- if ( !ptr1 || !ptr2 || bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) +++ if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) ++ bcf_double_set_missing(rtok->values[0]); ++ else ++ { ++@@ -1383,6 +1515,31 @@ ++ } ++ return rtok->nargs; ++ } +++static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +++{ +++ token_t *tok = stack[nstack - 1]; +++ if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n"); +++ +++ rtok->nsamples = tok->nsamples; +++ rtok->nval1 = tok->nval1; +++ memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); +++ assert(tok->usmpl); +++ if ( !rtok->usmpl ) +++ { +++ rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); +++ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); +++ } +++ rtok->nvalues = tok->nvalues; +++ if ( !tok->nvalues ) return 1; +++ +++ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); +++ int i; +++ for (i=0; invalues; i++) +++ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); +++ else rtok->values[i] = -4.34294481903*log(tok->values[i]); +++ +++ return 1; +++} ++ inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) ++ { ++ token_t *tok = atok->nvalues > btok->nvalues ? 
atok : btok; ++@@ -1416,7 +1573,7 @@ ++ assert( atok->nsamples==btok->nsamples ); \ ++ for (i=0; invalues; i++) \ ++ { \ ++- if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ ++ { \ ++ bcf_double_set_missing(rtok->values[i]); \ ++ continue; \ ++@@ -1430,11 +1587,11 @@ ++ token_t *xtok = atok->nsamples ? atok : btok; \ ++ token_t *ytok = atok->nsamples ? btok : atok; \ ++ assert( ytok->nvalues==1 ); \ ++- if ( !bcf_double_is_missing(ytok->values[0]) ) \ +++ if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ ++ { \ ++ for (i=0; invalues; i++) \ ++ { \ ++- if ( bcf_double_is_missing(xtok->values[i]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ ++ { \ ++ bcf_double_set_missing(rtok->values[i]); \ ++ continue; \ ++@@ -1568,7 +1725,6 @@ ++ { \ ++ token_t *rtok = _rtok; \ ++ int i, j, k; \ ++- assert( !atok->nsamples || !btok->nsamples ); \ ++ tok_init_samples(atok, btok, rtok); \ ++ if ( !atok->nsamples && !btok->nsamples ) \ ++ { \ ++@@ -1578,7 +1734,7 @@ ++ token_t *tok = atok->nvalues ? atok : btok; \ ++ for (j=0; jnvalues; j++) \ ++ { \ ++- if ( bcf_double_is_missing(tok->values[j]) ) \ +++ if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \ ++ { \ ++ if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ ++ } \ ++@@ -1589,15 +1745,19 @@ ++ { \ ++ for (i=0; invalues; i++) \ ++ { \ ++- int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ +++ int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \ ++ for (j=0; jnvalues; j++) \ ++ { \ ++- int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ +++ int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 
1 : 0); \ ++ if ( nmiss ) \ ++ { \ ++ if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ ++ } \ ++- else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ +++ else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ +++ } \ +++ else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ ++ } \ ++ } \ ++ } \ ++@@ -1619,7 +1779,7 @@ ++ { \ ++ int miss = 0; \ ++ for (j=0; jnvalues; j++) \ ++- miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ +++ miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \ ++ if ( missing_logic[++miss] ) \ ++ { \ ++ for (i=0; insamples; i++) \ ++@@ -1633,10 +1793,36 @@ ++ double *ptr = tok->values + i*tok->nval1; \ ++ int miss = 0; \ ++ for (j=0; jnval1; j++) \ ++- miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ +++ miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \ ++ if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ ++ } \ ++ } \ +++ else if ( atok->nsamples && btok->nsamples ) \ +++ { \ +++ if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \ +++ if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \ +++ for (i=0; insamples; i++) \ +++ { \ +++ if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \ +++ double *aptr = atok->values + i*atok->nval1; \ +++ double *bptr = btok->values + i*btok->nval1; \ +++ for (j=0; jnval1; j++) \ +++ { \ +++ int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 
1 : 0; \ +++ if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ +++ nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \ +++ if ( nmiss ) \ +++ { \ +++ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ +++ } \ +++ } \ +++ } \ ++ else \ ++ { \ ++ token_t *xtok = atok->nsamples ? atok : btok; \ ++@@ -1648,16 +1834,20 @@ ++ double *yptr = ytok->values + i*ytok->nval1; \ ++ for (j=0; jnval1; j++) \ ++ { \ ++- int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ +++ int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ ++ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ ++ for (k=0; knvalues; k++) \ ++ { \ ++- int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ +++ int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 
1 : 0); \ ++ if ( nmiss ) \ ++ { \ ++ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ ++ } \ ++- else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ +++ else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ +++ { \ +++ if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ +++ } \ +++ else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ ++ } \ ++ } \ ++ } \ ++@@ -1876,11 +2066,15 @@ ++ int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; ++ ++ int set_samples = 0; ++- char *colon = rindex(tag_idx, ':'); +++ char *colon = strrchr(tag_idx, ':'); ++ if ( tag_idx[0]=='@' ) // file list with sample names ++ { ++ if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori); ++ char *fname = expand_path(tag_idx+1); +++#ifdef _WIN32 +++ if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..' 
+++ colon = strrchr(fname+2, ':'); +++#endif ++ int nsmpl; ++ char **list = hts_readlist(fname, 1, &nsmpl); ++ if ( !list && colon ) ++@@ -1889,7 +2083,7 @@ ++ tok->idxs = idxs2; ++ tok->nidxs = nidxs2; ++ tok->idx = idx2; ++- colon = rindex(fname, ':'); +++ colon = strrchr(fname, ':'); ++ *colon = 0; ++ list = hts_readlist(fname, 1, &nsmpl); ++ } ++@@ -1997,6 +2191,7 @@ ++ } ++ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) ++ { +++ tok->tag_type = -1; ++ tok->tok_type = TOK_VAL; ++ tok->hdr_id = -1; ++ tok->pass_site = -1; ++@@ -2067,6 +2262,7 @@ ++ tok->comparator = filters_cmp_filter; ++ tok->tag = strdup("FILTER"); ++ filter->max_unpack |= BCF_UN_FLT; +++ tok->tag_type = BCF_HL_FLT; ++ return 0; ++ } ++ else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) ++@@ -2075,6 +2271,12 @@ ++ tok->tag = strdup("ID"); ++ return 0; ++ } +++ else if ( !strncasecmp(str,"CHROM",len) ) +++ { +++ tok->setter = &filters_set_chrom; +++ tok->tag = strdup("CHROM"); +++ return 0; +++ } ++ else if ( !strncasecmp(str,"POS",len) ) ++ { ++ tok->setter = &filters_set_pos; ++@@ -2113,12 +2315,14 @@ ++ } ++ else if ( !strncasecmp(str,"N_MISSING",len) ) ++ { +++ filter->max_unpack |= BCF_UN_FMT; ++ tok->setter = &filters_set_nmissing; ++ tok->tag = strdup("N_MISSING"); ++ return 0; ++ } ++ else if ( !strncasecmp(str,"F_MISSING",len) ) ++ { +++ filter->max_unpack |= BCF_UN_FMT; ++ tok->setter = &filters_set_nmissing; ++ tok->tag = strdup("F_MISSING"); ++ return 0; ++@@ -2156,7 +2360,7 @@ ++ for (i=0; insamples; i++) tok->usmpl[i] = 1; ++ } ++ ++- tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; +++ tok->tag_type = is_fmt ? 
BCF_HL_FMT : BCF_HL_INFO; ++ if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; ++ if ( tok->hdr_id>=0 ) ++ { ++@@ -2266,17 +2470,26 @@ ++ free(tmp.s); ++ return 0; ++ } +++ else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") ) +++ { +++ filter->max_unpack |= BCF_UN_STR; +++ tok->setter = &filters_set_ilen; +++ tok->tag = strdup("ILEN"); +++ free(tmp.s); +++ return 0; +++ } ++ ++ // is it a value? Here we parse as integer/float separately and use strtof ++ // rather than strtod, because the more accurate double representation ++ // would invalidate floating point comparisons like QUAL=59.2, obtained via ++- // htslib/vcf parser +++ // htslib/vcf parser. +++ // Update: use strtod() and force floats only in comparisons ++ char *end; ++ tok->threshold = strtol(tmp.s, &end, 10); // integer? ++ if ( end - tmp.s != strlen(tmp.s) ) ++ { ++ errno = 0; ++- tok->threshold = strtof(tmp.s, &end); // float? +++ tok->threshold = strtod(tmp.s, &end); // float? ++ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); ++ } ++ tok->is_constant = 1; ++@@ -2457,7 +2670,7 @@ ++ if ( ret==-1 ) error("Missing quotes in: %s\n", str); ++ ++ // fprintf(bcftools_stderr,"token=[%c] .. 
[%s] %d\n", TOKEN_STRING[ret], tmp, len); ++- // int i; for (i=0; ihdr_id = -1; ++ tok->pass_site = -1; ++ tok->threshold = -1.0; ++- if ( !strncasecmp(tmp-len,"N_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("N_PASS"); } ++- else if ( !strncasecmp(tmp-len,"F_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("F_PASS"); } +++ if ( !strncasecmp(tmp-len,"N_PASS",6) ) +++ { +++ filter->max_unpack |= BCF_UN_FMT; +++ tok->func = func_npass; +++ tok->tag = strdup("N_PASS"); +++ } +++ else if ( !strncasecmp(tmp-len,"F_PASS",6) ) +++ { +++ filter->max_unpack |= BCF_UN_FMT; +++ tok->func = func_npass; +++ tok->tag = strdup("F_PASS"); +++ } ++ else error("The function \"%s\" is not supported\n", tmp-len); ++ continue; ++ } ++@@ -2609,7 +2832,8 @@ ++ // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be ++ // just before or after the FILTER token and they must be followed with a comparison operator. ++ // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. ++- // Additionally, treat "." as missing value rather than a string in numeric equalities. +++ // Additionally, treat "." as missing value rather than a string in numeric equalities; that +++ // @file is only used with ID; etc. ++ // This code is fragile: improve me. ++ int i; ++ for (i=0; istr); ++ +++ if ( out[i].hash ) +++ { +++ int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1; +++ if ( out[j].comparator!=filters_cmp_id ) +++ error("Error: could not parse the expression. 
Note that the \"@file_name\" syntax can be currently used with ID column only.\n"); +++ } ++ if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC ) ++ out[i].func = vector_logic_or; ++ if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC ) ++@@ -2631,7 +2861,7 @@ ++ int set_missing = 0; ++ if ( out[k].hdr_id>0 ) ++ { ++- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); +++ int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id); ++ if ( type==BCF_HT_INT ) set_missing = 1; ++ else if ( type==BCF_HT_REAL ) set_missing = 1; ++ } ++@@ -2657,7 +2887,7 @@ ++ } ++ if ( out[i].tok_type!=TOK_VAL ) continue; ++ if ( !out[i].tag ) continue; ++- if ( !strcmp(out[i].tag,"TYPE") ) +++ if ( out[i].setter==filters_set_type ) ++ { ++ if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); ++ int itok, ival; ++@@ -2671,6 +2901,7 @@ ++ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } +++ else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } ++ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } ++ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); ++ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; ++@@ -2705,7 +2936,7 @@ ++ else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r ++ continue; ++ } ++- if ( !strcmp(out[i].tag,"FILTER") ) +++ if ( out[i].tag_type==BCF_HL_FLT ) ++ { ++ if ( i+1==nout ) error("Could not parse the expression: 
%s\n", filter->str); ++ int itok = i, ival; ++@@ -2734,13 +2965,17 @@ ++ filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; ++ for (i=0; ifilters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } ++@@ -2895,7 +3132,6 @@ ++ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) ++ else ++ error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); ++- ++ } ++ filter->flt_stack[nstack-2] = &filter->filters[i]; ++ nstack--; ++--- python-pysam.orig/bcftools/gvcf.c +++++ python-pysam/bcftools/gvcf.c ++@@ -156,7 +156,7 @@ ++ if ( gvcf->npl>0 ) ++ bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl); ++ bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl); ++- bcf_write1(fh, hdr, gvcf->line); +++ if ( bcf_write1(fh, hdr, gvcf->line)!=0 ) error("[%s] Error: failed to write the record\n", __func__); ++ gvcf->prev_range = 0; ++ gvcf->rid = -1; ++ gvcf->npl = 0; ++--- python-pysam.orig/bcftools/gvcf.c.pysam.c +++++ python-pysam/bcftools/gvcf.c.pysam.c ++@@ -158,7 +158,7 @@ ++ if ( gvcf->npl>0 ) ++ bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl); ++ bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl); ++- bcf_write1(fh, hdr, gvcf->line); +++ if ( bcf_write1(fh, hdr, gvcf->line)!=0 ) error("[%s] Error: failed to write the record\n", __func__); ++ gvcf->prev_range = 0; ++ gvcf->rid = -1; ++ gvcf->npl = 0; ++--- /dev/null +++++ python-pysam/bcftools/hex.h ++@@ -0,0 +1,95 @@ +++// VariantKey +++// +++// hex.h +++// +++// @category Libraries +++// @author Nicola Asuni +++// @copyright 2017-2018 GENOMICS plc +++// @license MIT (see LICENSE) +++// @link https://github.com/genomicsplc/variantkey +++// +++// LICENSE +++// +++// Copyright (c) 2017-2018 GENOMICS plc +++// +++// Permission is hereby granted, free of charge, to any person obtaining a copy +++// of this software and associated documentation 
files (the "Software"), to deal +++// in the Software without restriction, including without limitation the rights +++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++// copies of the Software, and to permit persons to whom the Software is +++// furnished to do so, subject to the following conditions: +++// +++// The above copyright notice and this permission notice shall be included in +++// all copies or substantial portions of the Software. +++// +++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++// THE SOFTWARE. +++ +++/** +++ * @file hex.h +++ * @brief Utility functions to manipulate strings. +++ * +++ * Collection of utility functions to manipulate strings. +++ */ +++ +++#ifndef ASTRING_H +++#define ASTRING_H +++ +++#include +++#include +++ +++/** @brief Returns uint64_t hexadecimal string (16 characters). +++ * +++ * @param n Number to parse +++ * @param str String buffer to be returned (it must be sized 17 bytes at least). +++ * +++ * @return Upon successful return, these function returns the number of characters processed +++ * (excluding the null byte used to end output to strings). +++ * If the buffer size is not sufficient, then the return value is the number of characters required for +++ * buffer string, including the terminating null byte. +++ */ +++static inline size_t hex_uint64_t(uint64_t n, char *str) +++{ +++ return sprintf(str, "%016" PRIx64, n); +++} +++ +++/** @brief Parses a 16 chars hexadecimal string and returns the code. 
+++ * +++ * @param s Hexadecimal string to parse (it must contain 16 hexadecimal characters). +++ * +++ * @return uint64_t unsigned integer number. +++ */ +++static inline uint64_t parse_hex_uint64_t(const char *s) +++{ +++ uint64_t v = 0; +++ uint8_t b; +++ size_t i; +++ for (i = 0; i < 16; i++) +++ { +++ b = s[i]; +++ if (b >= 'a') +++ { +++ b -= ('a' - 10); // a-f +++ } +++ else +++ { +++ if (b >= 'A') +++ { +++ b -= ('A' - 10); // A-F +++ } +++ else +++ { +++ b -= '0'; // 0-9 +++ } +++ } +++ v = ((v << 4) | b); +++ } +++ return v; +++} +++ +++#endif // ASTRING_H ++--- python-pysam.orig/bcftools/htslib-1.9/LICENSE +++++ /dev/null ++@@ -1,69 +0,0 @@ ++-[Files in this distribution outwith the cram/ subdirectory are distributed ++-according to the terms of the following MIT/Expat license.] ++- ++-The MIT/Expat License ++- ++-Copyright (C) 2012-2018 Genome Research Ltd. ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. 
++- ++- ++-[Files within the cram/ subdirectory in this distribution are distributed ++-according to the terms of the following Modified 3-Clause BSD license.] ++- ++-The Modified-BSD License ++- ++-Copyright (C) 2012-2018 Genome Research Ltd. ++- ++-Redistribution and use in source and binary forms, with or without ++-modification, are permitted provided that the following conditions are met: ++- ++-1. Redistributions of source code must retain the above copyright notice, ++- this list of conditions and the following disclaimer. ++- ++-2. Redistributions in binary form must reproduce the above copyright notice, ++- this list of conditions and the following disclaimer in the documentation ++- and/or other materials provided with the distribution. ++- ++-3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute ++- nor the names of its contributors may be used to endorse or promote products ++- derived from this software without specific prior written permission. ++- ++-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" ++-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE ++-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++- ++- ++-[The use of a range of years within a copyright notice in this distribution ++-should be interpreted as being equivalent to a list of years including the ++-first and last year specified and all consecutive years between them. ++- ++-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, ++-2011-2012" should be interpreted as being identical to a notice that reads ++-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice ++-that reads "Copyright (C) 2005-2012" should be interpreted as being identical ++-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, ++-2011, 2012".] ++--- python-pysam.orig/bcftools/htslib-1.9/README +++++ /dev/null ++@@ -1,5 +0,0 @@ ++-HTSlib is an implementation of a unified C library for accessing common file ++-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing ++-data. It is the core library used by samtools and bcftools. ++- ++-See INSTALL for building and installation instructions. ++--- python-pysam.orig/bcftools/main.c +++++ python-pysam/bcftools/main.c ++@@ -53,7 +53,9 @@ ++ #if USE_GPL ++ int main_polysomy(int argc, char *argv[]); ++ #endif +++#ifdef ENABLE_BCF_PLUGINS ++ int main_plugin(int argc, char *argv[]); +++#endif ++ int main_consensus(int argc, char *argv[]); ++ int main_csq(int argc, char *argv[]); ++ int bam_mpileup(int argc, char *argv[]); ++@@ -110,15 +112,12 @@ ++ .alias = "norm", ++ .help = "left-align and normalize indels" ++ }, +++#ifdef ENABLE_BCF_PLUGINS ++ { .func = main_plugin, ++ .alias = "plugin", ++-#ifdef ENABLE_BCF_PLUGINS ++ .help = "user-defined plugins" ++-#else ++- /* Do not advertise when plugins disabled. 
*/ ++- .help = "-user-defined plugins" ++-#endif ++ }, +++#endif ++ { .func = main_vcfquery, ++ .alias = "query", ++ .help = "transform VCF/BCF into user-defined formats" ++@@ -235,12 +234,24 @@ ++ fprintf(fp,"\n"); ++ } ++ +++// This is a tricky one, but on Windows the filename wildcard expansion is done by +++// the application and not by the shell, as traditionally it never had a "shell". +++// Even now, DOS and Powershell do not do this expansion (but bash does). +++// +++// This means that Mingw/Msys implements code before main() that takes e.g. "*" and +++// expands it up to a list of matching filenames. This in turn breaks things like +++// specifying "*" as a region (all the unmapped reads). We take a hard line here - +++// filename expansion is the task of the shell, not our application! +++#ifdef _WIN32 +++int _CRT_glob = 0; +++#endif +++ ++ int main(int argc, char *argv[]) ++ { ++ if (argc < 2) { usage(stderr); return 1; } ++ ++ if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { ++- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2018 Genome Research Ltd.\n", bcftools_version(), hts_version()); +++ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); ++ #if USE_GPL ++ printf("License GPLv3+: GNU GPL version 3 or later \n"); ++ #else ++--- python-pysam.orig/bcftools/main.c.pysam.c +++++ python-pysam/bcftools/main.c.pysam.c ++@@ -55,7 +55,9 @@ ++ #if USE_GPL ++ int main_polysomy(int argc, char *argv[]); ++ #endif +++#ifdef ENABLE_BCF_PLUGINS ++ int main_plugin(int argc, char *argv[]); +++#endif ++ int main_consensus(int argc, char *argv[]); ++ int main_csq(int argc, char *argv[]); ++ int bam_mpileup(int argc, char *argv[]); ++@@ -112,15 +114,12 @@ ++ .alias = "norm", ++ .help = "left-align and normalize indels" ++ }, +++#ifdef ENABLE_BCF_PLUGINS ++ { .func = main_plugin, ++ .alias = "plugin", ++-#ifdef ENABLE_BCF_PLUGINS ++ .help = 
"user-defined plugins" ++-#else ++- /* Do not advertise when plugins disabled. */ ++- .help = "-user-defined plugins" ++-#endif ++ }, +++#endif ++ { .func = main_vcfquery, ++ .alias = "query", ++ .help = "transform VCF/BCF into user-defined formats" ++@@ -237,12 +236,24 @@ ++ fprintf(fp,"\n"); ++ } ++ +++// This is a tricky one, but on Windows the filename wildcard expansion is done by +++// the application and not by the shell, as traditionally it never had a "shell". +++// Even now, DOS and Powershell do not do this expansion (but bash does). +++// +++// This means that Mingw/Msys implements code before main() that takes e.g. "*" and +++// expands it up to a list of matching filenames. This in turn breaks things like +++// specifying "*" as a region (all the unmapped reads). We take a hard line here - +++// filename expansion is the task of the shell, not our application! +++#ifdef _WIN32 +++int _CRT_glob = 0; +++#endif +++ ++ int bcftools_main(int argc, char *argv[]) ++ { ++ if (argc < 2) { usage(bcftools_stderr); return 1; } ++ ++ if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { ++- fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2018 Genome Research Ltd.\n", bcftools_version(), hts_version()); +++ fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); ++ #if USE_GPL ++ fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); ++ #else ++--- python-pysam.orig/bcftools/mcall.c +++++ python-pysam/bcftools/mcall.c ++@@ -23,7 +23,9 @@ ++ THE SOFTWARE. */ ++ ++ #include +++#include ++ #include +++#include ++ #include "call.h" ++ ++ // Using priors for GTs does not seem to be mathematically justified. Although ++@@ -36,9 +38,6 @@ ++ // genotypes is reported instead. 
++ #define FLAT_PDG_FOR_MISSING 0 ++ ++-// Estimate QS (combined quality and allele frequencies) from PLs ++-#define QS_FROM_PDG 0 ++- ++ ++ void qcall_init(call_t *call) { return; } ++ void qcall_destroy(call_t *call) { return; } ++@@ -244,12 +243,84 @@ ++ free(call->trio[j][i]); ++ } ++ +++static void init_sample_groups(call_t *call) +++{ +++ int i, nsmpl = bcf_hdr_nsamples(call->hdr); +++ if ( !call->sample_groups ) +++ { +++ // standard pooled calling, all samples in the same group +++ grp_t *grps = &call->smpl_grp; +++ grps->ngrp = 1; +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); +++ } +++ else if ( !strcmp("-",call->sample_groups) ) +++ { +++ // single-sample calling, each sample creates its own group +++ grp_t *grps = &call->smpl_grp; +++ grps->ngrp = nsmpl; +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); +++ for (i=0; ismpl2grp[i] = i; +++ } +++ else +++ { +++ int nlines; +++ char **lines = hts_readlist(call->sample_groups, 1, &nlines); +++ if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); +++ +++ uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); +++ void *grp2idx = khash_str2int_init(); +++ +++ grp_t *grps = &call->smpl_grp; +++ for (i=0; isample_groups,lines[i]); +++ *ptr = 0; +++ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); +++ if ( ismpl<0 ) continue; +++ if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); +++ if ( !khash_str2int_has_key(grp2idx,ptr+1) ) +++ { +++ khash_str2int_inc(grp2idx, ptr+1); +++ grps->ngrp++; +++ } +++ int igrp; +++ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) +++ smpl2grp1[ismpl] = igrp+1; +++ else +++ error("This should not happen, fixme: %s\n",ptr+1); +++ } +++ khash_str2int_destroy(grp2idx); +++ +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ 
grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); +++ for (i=0; ihdr->samples[i],call->sample_groups); +++ grps->smpl2grp[i] = smpl2grp1[i] - 1; +++ } +++ free(smpl2grp1); +++ for (i=0; ismpl_grp; +++ for (i=0; ingrp; i++) +++ free(grps->grp[i].qsum); +++ free(grps->grp); +++ free(grps->smpl2grp); +++} +++ ++ void mcall_init(call_t *call) ++ { ++ call_init_pl2p(call); ++ ++- call->nqsum = 5; ++- call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if ncessary ++ call->nals_map = 5; ++ call->als_map = (int*) malloc(sizeof(int)*call->nals_map); ++ call->npl_map = 5*(5+1)/2; // will be expanded later if necessary ++@@ -302,26 +373,28 @@ ++ call->theta = log(call->theta); ++ } ++ ++- return; +++ init_sample_groups(call); ++ } ++ ++ void mcall_destroy(call_t *call) ++ { +++ destroy_sample_groups(call); ++ if (call->vcmp) vcmp_destroy(call->vcmp); ++ free(call->itmp); ++ mcall_destroy_trios(call); ++ free(call->GPs); +++ free(call->ADs); ++ free(call->GLs); ++ free(call->GQs); ++ free(call->anno16); ++ free(call->PLs); ++- free(call->qsum); ++ free(call->als_map); ++ free(call->pl_map); ++ free(call->gts); free(call->cgts); free(call->ugts); ++ free(call->pdg); ++ free(call->als); ++ free(call->ac); +++ free(call->qsum); ++ return; ++ } ++ ++@@ -431,40 +504,6 @@ ++ } ++ } ++ ++-/* ++- Allele frequency estimated as: ++- #A = \sum_i (2*P_AA + P_AB) ++- F_A = #A / ( #A + #B ) ++- where i runs across all samples ++-*/ ++-void estimate_qsum(call_t *call, bcf1_t *rec) ++-{ ++- double *pdg = call->pdg; ++- int ngts = rec->n_allele*(rec->n_allele+1)/2; ++- int i,nsmpl = bcf_hdr_nsamples(call->hdr); ++- ++- hts_expand(float,rec->n_allele,call->nqsum,call->qsum); ++- for (i=0; in_allele; i++) call->qsum[i] = 0; ++- ++- for (i=0; in_allele; a++) ++- { ++- for (b=0; b<=a; b++) ++- { ++- call->qsum[a] += pdg[k]; ++- call->qsum[b] += pdg[k]; ++- k++; ++- } ++- } ++- pdg += ngts; ++- } ++- float sum = 0; ++- for (i=0; in_allele; i++) sum += call->qsum[i]; 
++- if ( sum ) for (i=0; in_allele; i++) call->qsum[i] /= sum; ++-} ++- ++ // Create mapping between old and new (trimmed) alleles ++ void init_allele_trimming_maps(call_t *call, int als, int nals) ++ { ++@@ -581,6 +620,7 @@ ++ // at most tri-allelic sites are considered. Returns the number of alleles. ++ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) ++ { +++ int j; ++ int ia,ib,ic; // iterators over up to three alleles ++ int max_als=0; // most likely combination of alleles ++ double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles ++@@ -606,32 +646,46 @@ ++ UPDATE_MAX_LKs(1<0 && lk_tot_set); ++ } ++ +++ grp_t *grps = &call->smpl_grp; +++ ++ // Two alleles ++ if ( nals>1 ) ++ { ++ for (ia=0; iaqsum[ia]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; ++ int iaa = (ia+1)*(ia+2)/2-1; ++ for (ib=0; ibqsum[ib]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; ++ double lk_tot = 0; ++ int lk_tot_set = 0; ++- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); ++- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); ++- double fa2 = fa*fa; ++- double fb2 = fb*fb; ++- double fab = 2*fa*fb; +++ int ia_cov = 0, ib_cov = 0; +++ for (j=0; jngrp; j++) +++ { +++ grp1_t *grp = &grps->grp[j]; +++ if ( grp->qsum[ia] ) ia_cov = 1; +++ if ( grp->qsum[ib] ) ib_cov = 1; +++ if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } +++ grp->dp = 1; +++ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); +++ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); +++ grp->fa2 = grp->fa*grp->fa; +++ grp->fb2 = grp->fb*grp->fb; +++ grp->fab = 2*grp->fa*grp->fb; +++ } +++ if ( !ia_cov || !ib_cov ) continue; ++ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; ++ double *pdg = call->pdg; ++ for (isample=0; isamplegrp[grps->smpl2grp[isample]]; +++ if ( !grp->dp ) continue; ++ double val = 0; ++ if ( !call->ploidy || 
call->ploidy[isample]==2 ) ++- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; +++ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; ++ else if ( call->ploidy && call->ploidy[isample]==1 ) ++- val = fa*pdg[iaa] + fb*pdg[ibb]; +++ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; ++ if ( val ) { lk_tot += log(val); lk_tot_set = 1; } ++ pdg += ngts; ++ } ++@@ -647,35 +701,48 @@ ++ { ++ for (ia=0; iaqsum[ia]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; ++ int iaa = (ia+1)*(ia+2)/2-1; ++ for (ib=0; ibqsum[ib]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; ++ int ibb = (ib+1)*(ib+2)/2-1; ++ int iab = iaa - ia + ib; ++ for (ic=0; icqsum[ic]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; ++ double lk_tot = 0; ++ int lk_tot_set = 1; ++- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fa2 = fa*fa; ++- double fb2 = fb*fb; ++- double fc2 = fc*fc; ++- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; +++ int ia_cov = 0, ib_cov = 0, ic_cov = 0; +++ for (j=0; jngrp; j++) +++ { +++ grp1_t *grp = &grps->grp[j]; +++ if ( grp->qsum[ia] ) ia_cov = 1; +++ if ( grp->qsum[ib] ) ib_cov = 1; +++ if ( grp->qsum[ic] ) ic_cov = 1; +++ if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } +++ grp->dp = 1; +++ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fa2 = grp->fa*grp->fa; +++ grp->fb2 = grp->fb*grp->fb; +++ grp->fc2 = grp->fc*grp->fc; +++ grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; +++ } +++ if ( !ia_cov || !ib_cov || !ic_cov ) continue; ++ 
int isample, icc = (ic+1)*(ic+2)/2-1; ++ int iac = iaa - ia + ic, ibc = ibb - ib + ic; ++ double *pdg = call->pdg; ++ for (isample=0; isamplegrp[grps->smpl2grp[isample]]; +++ if ( !grp->dp ) continue; ++ double val = 0; ++ if ( !call->ploidy || call->ploidy[isample]==2 ) ++- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; +++ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; ++ else if ( call->ploidy && call->ploidy[isample]==1 ) ++- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; +++ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; ++ if ( val ) { lk_tot += log(val); lk_tot_set = 1; } ++ pdg += ngts; ++ } ++@@ -788,12 +855,13 @@ ++ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; ++ ++ // Non-zero depth, determine the most likely genotype +++ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; ++ double best_lk = 0; ++ for (ia=0; iaqsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; +++ double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; ++ #if USE_PRIOR_FOR_GTS ++ if ( ia!=0 ) lk *= prior; ++ #endif ++@@ -816,7 +884,7 @@ ++ { ++ if ( !(out_als & 1<qsum[ia]*call->qsum[ib]; +++ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; ++ #if USE_PRIOR_FOR_GTS ++ if ( ia!=0 ) lk *= prior; ++ if ( ib!=0 ) lk *= prior; ++@@ -940,6 +1008,7 @@ ++ ++ for (i=0; ismpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; ++ double sum_lk = 0; ++ double best_lk = 0; ++ for (ia=0; iaals_map[ia],call->als_map[ia]); ++- double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; +++ double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; ++ sum_lk += lk; ++ gls[idx] = lk; ++ if ( best_lk < lk ) ++@@ -966,7 +1035,7 @@ ++ if ( !(out_als & 1<als_map[ia],call->als_map[ib]); ++- double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib]; +++ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; ++ sum_lk += lk; ++ gls[idx] = lk; ++ if ( best_lk < lk ) ++@@ -1272,28 +1341,37 @@ ++ // ++ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) ++ { ++- bcf_sr_regions_t *tgt = call->srs->targets; ++- if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); ++- hts_expand(char*,tgt->nals+1,call->nals,call->als); +++ assert( call->tgt_als->n ); +++ if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); +++ hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); ++ ++ int has_new = 0; ++ ++ int i, j, nals = 1; ++ for (i=1; inals_map; i++) call->als_map[i] = -1; ++ ++- if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 ) ++- error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]); +++ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 ) +++ error("The reference alleles are not compatible at %s:%d .. 
%s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]); ++ ++ // create mapping from new to old alleles ++- call->als[0] = tgt->als[0]; +++ call->als[0] = call->tgt_als->allele[0]; ++ call->als_map[0] = 0; ++ ++- for (i=1; inals; i++) +++ for (i=1; itgt_als->n; i++) ++ { ++- call->als[nals] = tgt->als[i]; ++- j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); +++ call->als[nals] = call->tgt_als->allele[i]; +++ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); ++ ++- if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } +++ if ( j+1==*unseen ) +++ { +++ fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); +++ int k; +++ for (k=0; kn_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]); +++ fprintf(stderr,"\tTAB="); +++ for (k=0; ktgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); +++ fprintf(stderr,"\n"); +++ return -1; +++ } ++ ++ if ( j>=0 ) ++ { ++@@ -1364,11 +1442,51 @@ ++ bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); ++ ++ // update QS ++- float qsum[5]; ++- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); +++ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); +++ hts_expand(float,nals,call->nqsum,call->qsum); ++ for (i=0; ials_map[i]qsum[call->als_map[i]] : 0; ++- bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); +++ call->qsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; +++ bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); +++ +++ // update any Number=R tags +++ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point +++ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs; +++ for (i=0; in_fmt; i++) +++ 
{ +++ bcf_fmt_t *fmt = &rec->d.fmt[i]; +++ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); +++ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag +++ +++ // NB:works only for BCF_HT_INT and BCF_HT_REAL +++ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); +++ assert( type==BCF_HT_INT || type==BCF_HT_REAL ); +++ assert( sizeof(float)==sizeof(int32_t) ); +++ +++ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); +++ int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); +++ if (nret<=0) continue; +++ int nsmpl = bcf_hdr_nsamples(call->hdr); +++ int size1 = sizeof(float); +++ hts_expand(float, nsmpl * nals, ntmp_new, tmp_new); +++ for (j=0; jn; +++ uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1; +++ for (k=0; kals_map[k]; +++ memcpy(dst,src,size1); +++ } +++ } +++ nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type); +++ assert( nret==0 ); +++ } +++ call->PLs = (int32_t*) tmp_new; +++ call->mPLs = ntmp_new; +++ call->itmp = (int32_t*) tmp_ori; +++ call->n_itmp = ntmp_ori; +++ ++ ++ if ( *unseen ) *unseen = nals-1; ++ return 0; ++@@ -1383,7 +1501,7 @@ ++ */ ++ int mcall(call_t *call, bcf1_t *rec) ++ { ++- int i, unseen = call->unseen; +++ int i,j, unseen = call->unseen; ++ ++ // Force alleles when calling genotypes given alleles was requested ++ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; ++@@ -1404,61 +1522,83 @@ ++ hts_expand(double, call->nPLs, call->npdg, call->pdg); ++ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); ++ ++- #if QS_FROM_PDG ++- estimate_qsum(call, rec); ++- #else ++- // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. ++- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); +++ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. 
+++ if ( call->smpl_grp.ngrp == 1 ) +++ { +++ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); ++ if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); ++ if ( nqs < nals ) ++ { ++ // Some of the listed alleles do not have the corresponding QS field. This is ++- // typically ref-only site with X in ALT. +++ // typically ref-only site with <*> in ALT. +++ hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); +++ for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; +++ } +++ } +++ else +++ { +++ for (j=0; jsmpl_grp.ngrp; j++) +++ { +++ hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); +++ memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); +++ } ++ ++- hts_expand(float,nals,call->nqsum,call->qsum); ++- for (i=nqs; iqsum[i] = 0; +++ int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); +++ if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); +++ nad /= bcf_hdr_nsamples(call->hdr); +++ hts_expand(float,nals,call->nqsum,call->qsum); +++ float qsum = 0; +++ for (i=0; ihdr); i++) +++ { +++ int32_t *ptr = call->ADs + i*nad; +++ for (j=0; jqsum[j] = 0; +++ else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } +++ } +++ for (; jqsum[j] = 0; +++ if ( qsum ) +++ for (j=0; jqsum[j] /= qsum; +++ +++ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; +++ for (j=0; jqsum[j] += call->qsum[j]; ++ } +++ } ++ ++- // If available, take into account reference panel AFs ++- if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) +++ // If available, take into account reference panel AFs +++ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) +++ { +++ int an = call->ac[0]; +++ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, 
&call->nac)==nals-1 ) ++ { ++- int an = call->ac[0]; ++- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) +++ int ac0 = an; // number of alleles in the reference population +++ for (i=0; iac[i]==bcf_int32_vector_end ) break; ++- if ( call->ac[i]==bcf_int32_missing ) continue; ++- ac0 -= call->ac[i]; ++- call->qsum[i+1] += call->ac[i]*0.5; ++- } ++- if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); ++- call->qsum[0] += ac0*0.5; ++- for (i=0; iqsum[i] /= nsmpl + 0.5*an; +++ if ( call->ac[i]==bcf_int32_vector_end ) break; +++ if ( call->ac[i]==bcf_int32_missing ) continue; +++ ac0 -= call->ac[i]; +++ for (j=0; jsmpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; +++ } +++ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); +++ for (j=0; jsmpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[0] += ac0*0.5; +++ for (i=0; ismpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; ++ } ++ } +++ } ++ +++ for (j=0; jsmpl_grp.ngrp; j++) +++ { ++ float qsum_tot = 0; ++- for (i=0; iqsum[i]; ++- ++- // Is this still necessary?? ++- // ++- // if (0&& !call->qsum[0] ) ++- // { ++- // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, ++- // // an equivalent of a single reference read. 
++- // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) ++- // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); ++- // if ( call->itmp[0] ) ++- // { ++- // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; ++- // qsum_tot += call->qsum[0]; ++- // } ++- // } ++- ++- if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; ++- #endif +++ for (i=0; ismpl_grp.grp[j].qsum[i]; +++ if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; +++ } ++ ++ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag ++ ++@@ -1466,7 +1606,7 @@ ++ int out_als, nout; ++ if ( nals > 8*sizeof(out_als) ) ++ { ++- fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); ++ return 0; ++ } ++ nout = mcall_find_best_alleles(call, nals, &out_als); ++@@ -1510,7 +1650,7 @@ ++ { ++ if ( nout>4 ) ++ { ++- fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); ++ return 0; ++ } ++ mcall_call_trio_genotypes(call, rec, nals,nout,out_als); ++--- python-pysam.orig/bcftools/mcall.c.pysam.c +++++ python-pysam/bcftools/mcall.c.pysam.c ++@@ -25,7 +25,9 @@ ++ THE SOFTWARE. */ ++ ++ #include +++#include ++ #include +++#include ++ #include "call.h" ++ ++ // Using priors for GTs does not seem to be mathematically justified. Although ++@@ -38,9 +40,6 @@ ++ // genotypes is reported instead. 
++ #define FLAT_PDG_FOR_MISSING 0 ++ ++-// Estimate QS (combined quality and allele frequencies) from PLs ++-#define QS_FROM_PDG 0 ++- ++ ++ void qcall_init(call_t *call) { return; } ++ void qcall_destroy(call_t *call) { return; } ++@@ -246,12 +245,84 @@ ++ free(call->trio[j][i]); ++ } ++ +++static void init_sample_groups(call_t *call) +++{ +++ int i, nsmpl = bcf_hdr_nsamples(call->hdr); +++ if ( !call->sample_groups ) +++ { +++ // standard pooled calling, all samples in the same group +++ grp_t *grps = &call->smpl_grp; +++ grps->ngrp = 1; +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); +++ } +++ else if ( !strcmp("-",call->sample_groups) ) +++ { +++ // single-sample calling, each sample creates its own group +++ grp_t *grps = &call->smpl_grp; +++ grps->ngrp = nsmpl; +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); +++ for (i=0; ismpl2grp[i] = i; +++ } +++ else +++ { +++ int nlines; +++ char **lines = hts_readlist(call->sample_groups, 1, &nlines); +++ if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); +++ +++ uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); +++ void *grp2idx = khash_str2int_init(); +++ +++ grp_t *grps = &call->smpl_grp; +++ for (i=0; isample_groups,lines[i]); +++ *ptr = 0; +++ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); +++ if ( ismpl<0 ) continue; +++ if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); +++ if ( !khash_str2int_has_key(grp2idx,ptr+1) ) +++ { +++ khash_str2int_inc(grp2idx, ptr+1); +++ grps->ngrp++; +++ } +++ int igrp; +++ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) +++ smpl2grp1[ismpl] = igrp+1; +++ else +++ error("This should not happen, fixme: %s\n",ptr+1); +++ } +++ khash_str2int_destroy(grp2idx); +++ +++ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); +++ 
grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); +++ for (i=0; ihdr->samples[i],call->sample_groups); +++ grps->smpl2grp[i] = smpl2grp1[i] - 1; +++ } +++ free(smpl2grp1); +++ for (i=0; ismpl_grp; +++ for (i=0; ingrp; i++) +++ free(grps->grp[i].qsum); +++ free(grps->grp); +++ free(grps->smpl2grp); +++} +++ ++ void mcall_init(call_t *call) ++ { ++ call_init_pl2p(call); ++ ++- call->nqsum = 5; ++- call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if ncessary ++ call->nals_map = 5; ++ call->als_map = (int*) malloc(sizeof(int)*call->nals_map); ++ call->npl_map = 5*(5+1)/2; // will be expanded later if necessary ++@@ -304,26 +375,28 @@ ++ call->theta = log(call->theta); ++ } ++ ++- return; +++ init_sample_groups(call); ++ } ++ ++ void mcall_destroy(call_t *call) ++ { +++ destroy_sample_groups(call); ++ if (call->vcmp) vcmp_destroy(call->vcmp); ++ free(call->itmp); ++ mcall_destroy_trios(call); ++ free(call->GPs); +++ free(call->ADs); ++ free(call->GLs); ++ free(call->GQs); ++ free(call->anno16); ++ free(call->PLs); ++- free(call->qsum); ++ free(call->als_map); ++ free(call->pl_map); ++ free(call->gts); free(call->cgts); free(call->ugts); ++ free(call->pdg); ++ free(call->als); ++ free(call->ac); +++ free(call->qsum); ++ return; ++ } ++ ++@@ -433,40 +506,6 @@ ++ } ++ } ++ ++-/* ++- Allele frequency estimated as: ++- #A = \sum_i (2*P_AA + P_AB) ++- F_A = #A / ( #A + #B ) ++- where i runs across all samples ++-*/ ++-void estimate_qsum(call_t *call, bcf1_t *rec) ++-{ ++- double *pdg = call->pdg; ++- int ngts = rec->n_allele*(rec->n_allele+1)/2; ++- int i,nsmpl = bcf_hdr_nsamples(call->hdr); ++- ++- hts_expand(float,rec->n_allele,call->nqsum,call->qsum); ++- for (i=0; in_allele; i++) call->qsum[i] = 0; ++- ++- for (i=0; in_allele; a++) ++- { ++- for (b=0; b<=a; b++) ++- { ++- call->qsum[a] += pdg[k]; ++- call->qsum[b] += pdg[k]; ++- k++; ++- } ++- } ++- pdg += ngts; ++- } ++- float sum = 0; ++- for (i=0; in_allele; i++) sum += call->qsum[i]; 
++- if ( sum ) for (i=0; in_allele; i++) call->qsum[i] /= sum; ++-} ++- ++ // Create mapping between old and new (trimmed) alleles ++ void init_allele_trimming_maps(call_t *call, int als, int nals) ++ { ++@@ -583,6 +622,7 @@ ++ // at most tri-allelic sites are considered. Returns the number of alleles. ++ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) ++ { +++ int j; ++ int ia,ib,ic; // iterators over up to three alleles ++ int max_als=0; // most likely combination of alleles ++ double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles ++@@ -608,32 +648,46 @@ ++ UPDATE_MAX_LKs(1<0 && lk_tot_set); ++ } ++ +++ grp_t *grps = &call->smpl_grp; +++ ++ // Two alleles ++ if ( nals>1 ) ++ { ++ for (ia=0; iaqsum[ia]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; ++ int iaa = (ia+1)*(ia+2)/2-1; ++ for (ib=0; ibqsum[ib]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; ++ double lk_tot = 0; ++ int lk_tot_set = 0; ++- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); ++- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); ++- double fa2 = fa*fa; ++- double fb2 = fb*fb; ++- double fab = 2*fa*fb; +++ int ia_cov = 0, ib_cov = 0; +++ for (j=0; jngrp; j++) +++ { +++ grp1_t *grp = &grps->grp[j]; +++ if ( grp->qsum[ia] ) ia_cov = 1; +++ if ( grp->qsum[ib] ) ib_cov = 1; +++ if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } +++ grp->dp = 1; +++ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); +++ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); +++ grp->fa2 = grp->fa*grp->fa; +++ grp->fb2 = grp->fb*grp->fb; +++ grp->fab = 2*grp->fa*grp->fb; +++ } +++ if ( !ia_cov || !ib_cov ) continue; ++ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; ++ double *pdg = call->pdg; ++ for (isample=0; isamplegrp[grps->smpl2grp[isample]]; +++ if ( !grp->dp ) continue; ++ double val = 0; ++ if ( !call->ploidy || 
call->ploidy[isample]==2 ) ++- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; +++ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; ++ else if ( call->ploidy && call->ploidy[isample]==1 ) ++- val = fa*pdg[iaa] + fb*pdg[ibb]; +++ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; ++ if ( val ) { lk_tot += log(val); lk_tot_set = 1; } ++ pdg += ngts; ++ } ++@@ -649,35 +703,48 @@ ++ { ++ for (ia=0; iaqsum[ia]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; ++ int iaa = (ia+1)*(ia+2)/2-1; ++ for (ib=0; ibqsum[ib]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; ++ int ibb = (ib+1)*(ib+2)/2-1; ++ int iab = iaa - ia + ib; ++ for (ic=0; icqsum[ic]==0 ) continue; +++ if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; ++ double lk_tot = 0; ++ int lk_tot_set = 1; ++- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); ++- double fa2 = fa*fa; ++- double fb2 = fb*fb; ++- double fc2 = fc*fc; ++- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; +++ int ia_cov = 0, ib_cov = 0, ic_cov = 0; +++ for (j=0; jngrp; j++) +++ { +++ grp1_t *grp = &grps->grp[j]; +++ if ( grp->qsum[ia] ) ia_cov = 1; +++ if ( grp->qsum[ib] ) ib_cov = 1; +++ if ( grp->qsum[ic] ) ic_cov = 1; +++ if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } +++ grp->dp = 1; +++ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); +++ grp->fa2 = grp->fa*grp->fa; +++ grp->fb2 = grp->fb*grp->fb; +++ grp->fc2 = grp->fc*grp->fc; +++ grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; +++ } +++ if ( !ia_cov || !ib_cov || !ic_cov ) continue; ++ 
int isample, icc = (ic+1)*(ic+2)/2-1; ++ int iac = iaa - ia + ic, ibc = ibb - ib + ic; ++ double *pdg = call->pdg; ++ for (isample=0; isamplegrp[grps->smpl2grp[isample]]; +++ if ( !grp->dp ) continue; ++ double val = 0; ++ if ( !call->ploidy || call->ploidy[isample]==2 ) ++- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; +++ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; ++ else if ( call->ploidy && call->ploidy[isample]==1 ) ++- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; +++ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; ++ if ( val ) { lk_tot += log(val); lk_tot_set = 1; } ++ pdg += ngts; ++ } ++@@ -790,12 +857,13 @@ ++ gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; ++ ++ // Non-zero depth, determine the most likely genotype +++ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; ++ double best_lk = 0; ++ for (ia=0; iaqsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; +++ double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; ++ #if USE_PRIOR_FOR_GTS ++ if ( ia!=0 ) lk *= prior; ++ #endif ++@@ -818,7 +886,7 @@ ++ { ++ if ( !(out_als & 1<qsum[ia]*call->qsum[ib]; +++ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; ++ #if USE_PRIOR_FOR_GTS ++ if ( ia!=0 ) lk *= prior; ++ if ( ib!=0 ) lk *= prior; ++@@ -942,6 +1010,7 @@ ++ ++ for (i=0; ismpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; ++ double sum_lk = 0; ++ double best_lk = 0; ++ for (ia=0; iaals_map[ia],call->als_map[ia]); ++- double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; +++ double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; ++ sum_lk += lk; ++ gls[idx] = lk; ++ if ( best_lk < lk ) ++@@ -968,7 +1037,7 @@ ++ if ( !(out_als & 1<als_map[ia],call->als_map[ib]); ++- double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib]; +++ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; ++ sum_lk += lk; ++ gls[idx] = lk; ++ if ( best_lk < lk ) ++@@ -1274,28 +1343,37 @@ ++ // ++ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) ++ { ++- bcf_sr_regions_t *tgt = call->srs->targets; ++- if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); ++- hts_expand(char*,tgt->nals+1,call->nals,call->als); +++ assert( call->tgt_als->n ); +++ if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); +++ hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); ++ ++ int has_new = 0; ++ ++ int i, j, nals = 1; ++ for (i=1; inals_map; i++) call->als_map[i] = -1; ++ ++- if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 ) ++- error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]); +++ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 ) +++ error("The reference alleles are not compatible at %s:%d .. 
%s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]); ++ ++ // create mapping from new to old alleles ++- call->als[0] = tgt->als[0]; +++ call->als[0] = call->tgt_als->allele[0]; ++ call->als_map[0] = 0; ++ ++- for (i=1; inals; i++) +++ for (i=1; itgt_als->n; i++) ++ { ++- call->als[nals] = tgt->als[i]; ++- j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); +++ call->als[nals] = call->tgt_als->allele[i]; +++ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); ++ ++- if ( j+1==*unseen ) { fprintf(bcftools_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } +++ if ( j+1==*unseen ) +++ { +++ fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); +++ int k; +++ for (k=0; kn_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]); +++ fprintf(bcftools_stderr,"\tTAB="); +++ for (k=0; ktgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); +++ fprintf(bcftools_stderr,"\n"); +++ return -1; +++ } ++ ++ if ( j>=0 ) ++ { ++@@ -1366,11 +1444,51 @@ ++ bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); ++ ++ // update QS ++- float qsum[5]; ++- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); +++ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); +++ hts_expand(float,nals,call->nqsum,call->qsum); ++ for (i=0; ials_map[i]qsum[call->als_map[i]] : 0; ++- bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); +++ call->qsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; +++ bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); +++ +++ // update any Number=R tags +++ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point +++ int ntmp_ori = call->n_itmp, 
ntmp_new = call->mPLs; +++ for (i=0; in_fmt; i++) +++ { +++ bcf_fmt_t *fmt = &rec->d.fmt[i]; +++ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); +++ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag +++ +++ // NB:works only for BCF_HT_INT and BCF_HT_REAL +++ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); +++ assert( type==BCF_HT_INT || type==BCF_HT_REAL ); +++ assert( sizeof(float)==sizeof(int32_t) ); +++ +++ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); +++ int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); +++ if (nret<=0) continue; +++ int nsmpl = bcf_hdr_nsamples(call->hdr); +++ int size1 = sizeof(float); +++ hts_expand(float, nsmpl * nals, ntmp_new, tmp_new); +++ for (j=0; jn; +++ uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1; +++ for (k=0; kals_map[k]; +++ memcpy(dst,src,size1); +++ } +++ } +++ nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type); +++ assert( nret==0 ); +++ } +++ call->PLs = (int32_t*) tmp_new; +++ call->mPLs = ntmp_new; +++ call->itmp = (int32_t*) tmp_ori; +++ call->n_itmp = ntmp_ori; +++ ++ ++ if ( *unseen ) *unseen = nals-1; ++ return 0; ++@@ -1385,7 +1503,7 @@ ++ */ ++ int mcall(call_t *call, bcf1_t *rec) ++ { ++- int i, unseen = call->unseen; +++ int i,j, unseen = call->unseen; ++ ++ // Force alleles when calling genotypes given alleles was requested ++ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; ++@@ -1406,61 +1524,83 @@ ++ hts_expand(double, call->nPLs, call->npdg, call->pdg); ++ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); ++ ++- #if QS_FROM_PDG ++- estimate_qsum(call, rec); ++- #else ++- // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. ++- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); +++ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. 
+++ if ( call->smpl_grp.ngrp == 1 ) +++ { +++ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); ++ if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); ++ if ( nqs < nals ) ++ { ++ // Some of the listed alleles do not have the corresponding QS field. This is ++- // typically ref-only site with X in ALT. +++ // typically ref-only site with <*> in ALT. +++ hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); +++ for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; +++ } +++ } +++ else +++ { +++ for (j=0; jsmpl_grp.ngrp; j++) +++ { +++ hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); +++ memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); +++ } ++ ++- hts_expand(float,nals,call->nqsum,call->qsum); ++- for (i=nqs; iqsum[i] = 0; +++ int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); +++ if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); +++ nad /= bcf_hdr_nsamples(call->hdr); +++ hts_expand(float,nals,call->nqsum,call->qsum); +++ float qsum = 0; +++ for (i=0; ihdr); i++) +++ { +++ int32_t *ptr = call->ADs + i*nad; +++ for (j=0; jqsum[j] = 0; +++ else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } +++ } +++ for (; jqsum[j] = 0; +++ if ( qsum ) +++ for (j=0; jqsum[j] /= qsum; +++ +++ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; +++ for (j=0; jqsum[j] += call->qsum[j]; ++ } +++ } ++ ++- // If available, take into account reference panel AFs ++- if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) +++ // If available, take into account reference panel AFs +++ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) +++ { +++ int an = call->ac[0]; +++ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, 
&call->nac)==nals-1 ) ++ { ++- int an = call->ac[0]; ++- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) +++ int ac0 = an; // number of alleles in the reference population +++ for (i=0; iac[i]==bcf_int32_vector_end ) break; ++- if ( call->ac[i]==bcf_int32_missing ) continue; ++- ac0 -= call->ac[i]; ++- call->qsum[i+1] += call->ac[i]*0.5; ++- } ++- if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); ++- call->qsum[0] += ac0*0.5; ++- for (i=0; iqsum[i] /= nsmpl + 0.5*an; +++ if ( call->ac[i]==bcf_int32_vector_end ) break; +++ if ( call->ac[i]==bcf_int32_missing ) continue; +++ ac0 -= call->ac[i]; +++ for (j=0; jsmpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; +++ } +++ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); +++ for (j=0; jsmpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[0] += ac0*0.5; +++ for (i=0; ismpl_grp.ngrp; j++) +++ call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; ++ } ++ } +++ } ++ +++ for (j=0; jsmpl_grp.ngrp; j++) +++ { ++ float qsum_tot = 0; ++- for (i=0; iqsum[i]; ++- ++- // Is this still necessary?? ++- // ++- // if (0&& !call->qsum[0] ) ++- // { ++- // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, ++- // // an equivalent of a single reference read. 
++- // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) ++- // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); ++- // if ( call->itmp[0] ) ++- // { ++- // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; ++- // qsum_tot += call->qsum[0]; ++- // } ++- // } ++- ++- if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; ++- #endif +++ for (i=0; ismpl_grp.grp[j].qsum[i]; +++ if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; +++ } ++ ++ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag ++ ++@@ -1468,7 +1608,7 @@ ++ int out_als, nout; ++ if ( nals > 8*sizeof(out_als) ) ++ { ++- fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); ++ return 0; ++ } ++ nout = mcall_find_best_alleles(call, nals, &out_als); ++@@ -1512,7 +1652,7 @@ ++ { ++ if ( nout>4 ) ++ { ++- fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); ++ return 0; ++ } ++ mcall_call_trio_genotypes(call, rec, nals,nout,out_als); ++--- python-pysam.orig/bcftools/mpileup.c +++++ python-pysam/bcftools/mpileup.c ++@@ -1,6 +1,6 @@ ++ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools ++ ++- Copyright (C) 2008-2017 Genome Research Ltd. +++ Copyright (C) 2008-2018 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. 
++ ++ Author: Heng Li ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -222,8 +223,8 @@ ++ if (ma->conf->fai && b->core.tid >= 0) { ++ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); ++ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence ++- fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", ++- __func__, b->core.pos, ref_len, b->core.tid); +++ fprintf(stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n", +++ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); ++ continue; ++ } ++ } else { ++@@ -246,13 +247,28 @@ ++ ++ // Called once per new bam added to the pileup. ++ // We cache sample information here so we don't have to keep recomputing this ++-// on each and every pileup column. +++// on each and every pileup column. If FMT/SCR annotation is requested, a flag +++// is set to indicate the presence of a soft clip. ++ // ++ // Cd is an arbitrary block of data we can write into, which ends up in ++-// the pileup structures. We stash the sample ID there. ++-static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { +++// the pileup structures. We stash the sample ID there: +++// has_soft_clip .. cd->i & 1 +++// sample_id .. 
cd->i >> 1 +++static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) +++{ ++ mplp_aux_t *ma = (mplp_aux_t *)data; ++- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); +++ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; +++ if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) +++ { +++ int i; +++ for (i=0; icore.n_cigar; i++) +++ { +++ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; +++ if ( cig!=BAM_CSOFT_CLIP ) continue; +++ cd->i |= 1; +++ break; +++ } +++ } ++ return 0; ++ } ++ ++@@ -265,7 +281,7 @@ ++ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position ++ { ++ const bam_pileup1_t *p = plp[i] + j; ++- int id = p->cd.i; +++ int id = PLP_SAMPLE_ID(p->cd.i); ++ if (m->n_plp[id] == m->m_plp[id]) ++ { ++ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; ++@@ -280,7 +296,7 @@ ++ { ++ if ( !conf->gvcf ) ++ { ++- if ( rec ) bcf_write1(fp, hdr, rec); +++ if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); ++ return; ++ } ++ ++@@ -298,7 +314,7 @@ ++ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; ++ } ++ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); ++- if ( rec ) bcf_write1(fp,hdr,rec); +++ if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); ++ } ++ ++ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) ++@@ -310,7 +326,7 @@ ++ ++ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) ++ { ++- if ( end && (posend) ) continue; +++ if ( posend ) continue; ++ if ( conf->bed && tid >= 0 ) ++ { ++ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); ++@@ -521,11 +537,13 @@ ++ ++ bcf_hdr_append(conf->bcf_hdr,"##ALT="); 
++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_VDB ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_RPB ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++@@ -553,17 +571,21 @@ ++ if ( conf->fmt_flag&B2B_FMT_SP ) ++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_AD ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_ADF ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_ADR ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_INFO_AD ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ if ( conf->fmt_flag&B2B_INFO_ADF ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_SCR ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_FMT_SCR ) +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_INFO_ADR ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ if ( conf->gvcf ) ++ gvcf_update_header(conf->gvcf, conf->bcf_hdr); ++ ++@@ -571,7 +593,7 @@ ++ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); ++ for (i=0; ibcf_hdr, smpl[i]); ++- bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); +++ if ( bcf_hdr_write(conf->bcf_fp, 
conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); ++ ++ conf->bca = bcf_call_init(-1., conf->min_baseQ); ++ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); ++@@ -579,6 +601,7 @@ ++ conf->bca->min_frac = conf->min_frac; ++ conf->bca->min_support = conf->min_support; ++ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; +++ conf->bca->fmt_flag = conf->fmt_flag; ++ ++ conf->bc.bcf_hdr = conf->bcf_hdr; ++ conf->bc.n = nsmpl; ++@@ -599,11 +622,14 @@ ++ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; ++ } ++ } +++ if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) +++ conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); ++ } ++ ++ // init mpileup ++ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); ++ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); +++ fprintf(stderr, "[%s] maximum number of reads per input file set to -d %d\n", __func__, conf->max_depth); ++ if ( (double)conf->max_depth * conf->nfiles > 1<<20) ++ fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); ++ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) ++@@ -623,7 +649,7 @@ ++ if ( ireg++ > 0 ) ++ { ++ conf->buf.l = 0; ++- ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); +++ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); ++ ++ for (i=0; infiles; i++) ++ { ++@@ -647,7 +673,7 @@ ++ while ( regitr_loop(conf->reg_itr) ); ++ } ++ else ++- mpileup_reg(conf,0,0); +++ mpileup_reg(conf,0,UINT32_MAX); ++ ++ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); ++ ++@@ -656,13 +682,14 @@ ++ bcf_destroy1(conf->bcf_rec); ++ if (conf->bcf_fp) ++ { ++- hts_close(conf->bcf_fp); +++ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,conf->output_fname); ++ bcf_hdr_destroy(conf->bcf_hdr); ++ bcf_call_destroy(conf->bca); ++ free(conf->bc.PL); ++ free(conf->bc.DP4); ++ free(conf->bc.ADR); ++ free(conf->bc.ADF); +++ free(conf->bc.SCR); ++ free(conf->bc.fmt_arr); ++ free(conf->bcr); ++ } ++@@ -738,7 +765,7 @@ ++ files = (char**) realloc(files,nfiles*sizeof(char*)); ++ files[nfiles-1] = strdup(buf); ++ } ++- fclose(fh); +++ if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,file_list); ++ if ( !nfiles ) ++ { ++ fprintf(stderr,"No files read from %s\n", file_list); ++@@ -765,6 +792,8 @@ ++ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD; ++ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; ++ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; +++ else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; +++ else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; ++ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; ++ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; ++ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; ++@@ -779,6 +808,9 @@ ++ return flag; ++ } ++ +++// todo: make it possible to turn off some annotations or change the defaults, +++// specifically RPB, VDB, MWU, SGB tests. It would be good to do some +++// benchmarking first to see if it's worth it. ++ static void list_annotations(FILE *fp) ++ { ++ fprintf(fp, ++@@ -790,12 +822,14 @@ ++ " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" ++ " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" ++ " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" +++" FORMAT/SCR .. 
Number of soft-clipped reads (Number=1,Type=Integer)\n" ++ "\n" ++ "INFO annotation tags available:\n" ++ "\n" ++ " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n" ++ " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n" ++ " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n" +++" INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" ++ "\n"); ++ } ++ ++@@ -818,7 +852,7 @@ ++ " -b, --bam-list FILE list of input BAM filenames, one per line\n" ++ " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" ++ " -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" ++-" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); +++" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); ++ fprintf(fp, ++ " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" ++ " -f, --fasta-ref FILE faidx indexed reference sequence file\n" ++@@ -850,7 +884,7 @@ ++ " -o, --output FILE write output to FILE [standard output]\n" ++ " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" ++ " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" ++-" --threads INT number of extra output compression threads [0]\n" +++" --threads INT use multithreading with INT worker threads [0]\n" ++ "\n" ++ "SNP/INDEL genotype likelihoods options:\n" ++ " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); ++@@ -870,6 +904,10 @@ ++ " -P, --platforms STR comma separated list of platforms for indels [all]\n" ++ "\n" ++ "Notes: Assuming diploid individuals.\n" +++"\n" +++"Example:\n" +++" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" +++" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" ++ "\n"); ++ ++ free(tmp_require); ++@@ -897,6 +935,7 @@ ++ mplp.record_cmd_line = 1; ++ mplp.n_threads = 0; 
++ mplp.bsmpl = bam_smpl_init(); +++ mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() ++ ++ static const struct option lopts[] = ++ { ++@@ -1049,7 +1088,7 @@ ++ ++ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) ++ { ++- fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); +++ fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); ++ mplp.fmt_flag |= B2B_FMT_DP; ++ } ++ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) ++--- python-pysam.orig/bcftools/mpileup.c.pysam.c +++++ python-pysam/bcftools/mpileup.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools ++ ++- Copyright (C) 2008-2017 Genome Research Ltd. +++ Copyright (C) 2008-2018 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -33,6 +33,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -224,8 +225,8 @@ ++ if (ma->conf->fai && b->core.tid >= 0) { ++ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); ++ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence ++- fprintf(bcftools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", ++- __func__, b->core.pos, ref_len, b->core.tid); +++ fprintf(bcftools_stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n", +++ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); ++ continue; ++ } ++ } else { ++@@ -248,13 +249,28 @@ ++ ++ // Called once per new bam added to the pileup. ++ // We cache sample information here so we don't have to keep recomputing this ++-// on each and every pileup column. +++// on each and every pileup column. If FMT/SCR annotation is requested, a flag +++// is set to indicate the presence of a soft clip. 
++ // ++ // Cd is an arbitrary block of data we can write into, which ends up in ++-// the pileup structures. We stash the sample ID there. ++-static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { +++// the pileup structures. We stash the sample ID there: +++// has_soft_clip .. cd->i & 1 +++// sample_id .. cd->i >> 1 +++static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) +++{ ++ mplp_aux_t *ma = (mplp_aux_t *)data; ++- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); +++ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; +++ if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) +++ { +++ int i; +++ for (i=0; icore.n_cigar; i++) +++ { +++ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; +++ if ( cig!=BAM_CSOFT_CLIP ) continue; +++ cd->i |= 1; +++ break; +++ } +++ } ++ return 0; ++ } ++ ++@@ -267,7 +283,7 @@ ++ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position ++ { ++ const bam_pileup1_t *p = plp[i] + j; ++- int id = p->cd.i; +++ int id = PLP_SAMPLE_ID(p->cd.i); ++ if (m->n_plp[id] == m->m_plp[id]) ++ { ++ m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; ++@@ -282,7 +298,7 @@ ++ { ++ if ( !conf->gvcf ) ++ { ++- if ( rec ) bcf_write1(fp, hdr, rec); +++ if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); ++ return; ++ } ++ ++@@ -300,7 +316,7 @@ ++ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; ++ } ++ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); ++- if ( rec ) bcf_write1(fp,hdr,rec); +++ if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); ++ } ++ ++ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) ++@@ -312,7 +328,7 @@ ++ ++ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) ++ { ++- if ( end && (posend) ) continue; +++ if ( posend ) continue; ++ if ( conf->bed && tid >= 0 ) ++ { ++ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); ++@@ -523,11 +539,13 @@ ++ ++ bcf_hdr_append(conf->bcf_hdr,"##ALT="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_VDB ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_RPB ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++@@ -555,17 +573,21 @@ ++ if ( conf->fmt_flag&B2B_FMT_SP ) ++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_AD ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ 
bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_ADF ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_FMT_ADR ) ++- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_INFO_AD ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ if ( conf->fmt_flag&B2B_INFO_ADF ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_INFO_SCR ) +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ if ( conf->fmt_flag&B2B_FMT_SCR ) +++ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); ++ if ( conf->fmt_flag&B2B_INFO_ADR ) ++- bcf_hdr_append(conf->bcf_hdr,"##INFO="); +++ bcf_hdr_append(conf->bcf_hdr,"##INFO="); ++ if ( conf->gvcf ) ++ gvcf_update_header(conf->gvcf, conf->bcf_hdr); ++ ++@@ -573,7 +595,7 @@ ++ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); ++ for (i=0; ibcf_hdr, smpl[i]); ++- bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); +++ if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); ++ ++ conf->bca = bcf_call_init(-1., conf->min_baseQ); ++ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); ++@@ -581,6 +603,7 @@ ++ conf->bca->min_frac = conf->min_frac; ++ conf->bca->min_support = conf->min_support; ++ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; +++ conf->bca->fmt_flag = conf->fmt_flag; ++ ++ conf->bc.bcf_hdr = conf->bcf_hdr; ++ conf->bc.n = nsmpl; ++@@ -601,11 +624,14 @@ ++ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; ++ } ++ } +++ if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) +++ conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); ++ } ++ ++ // init mpileup ++ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, 
(void**)conf->mplp_data); ++ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); +++ fprintf(bcftools_stderr, "[%s] maximum number of reads per input file set to -d %d\n", __func__, conf->max_depth); ++ if ( (double)conf->max_depth * conf->nfiles > 1<<20) ++ fprintf(bcftools_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); ++ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) ++@@ -625,7 +651,7 @@ ++ if ( ireg++ > 0 ) ++ { ++ conf->buf.l = 0; ++- ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); +++ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); ++ ++ for (i=0; infiles; i++) ++ { ++@@ -649,7 +675,7 @@ ++ while ( regitr_loop(conf->reg_itr) ); ++ } ++ else ++- mpileup_reg(conf,0,0); +++ mpileup_reg(conf,0,UINT32_MAX); ++ ++ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); ++ ++@@ -658,13 +684,14 @@ ++ bcf_destroy1(conf->bcf_rec); ++ if (conf->bcf_fp) ++ { ++- hts_close(conf->bcf_fp); +++ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); ++ bcf_hdr_destroy(conf->bcf_hdr); ++ bcf_call_destroy(conf->bca); ++ free(conf->bc.PL); ++ free(conf->bc.DP4); ++ free(conf->bc.ADR); ++ free(conf->bc.ADF); +++ free(conf->bc.SCR); ++ free(conf->bc.fmt_arr); ++ free(conf->bcr); ++ } ++@@ -740,7 +767,7 @@ ++ files = (char**) realloc(files,nfiles*sizeof(char*)); ++ files[nfiles-1] = strdup(buf); ++ } ++- fclose(fh); +++ if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,file_list); ++ if ( !nfiles ) ++ { ++ fprintf(bcftools_stderr,"No files read from %s\n", file_list); ++@@ -767,6 +794,8 @@ ++ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD; ++ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; ++ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; +++ else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; +++ else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; ++ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; ++ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; ++ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; ++@@ -781,6 +810,9 @@ ++ return flag; ++ } ++ +++// todo: make it possible to turn off some annotations or change the defaults, +++// specifically RPB, VDB, MWU, SGB tests. It would be good to do some +++// benchmarking first to see if it's worth it. ++ static void list_annotations(FILE *fp) ++ { ++ fprintf(fp, ++@@ -792,12 +824,14 @@ ++ " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" ++ " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" ++ " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" +++" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" ++ "\n" ++ "INFO annotation tags available:\n" ++ "\n" ++ " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n" ++ " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n" ++ " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n" +++" INFO/SCR .. 
Number of soft-clipped reads (Number=1,Type=Integer)\n" ++ "\n"); ++ } ++ ++@@ -820,7 +854,7 @@ ++ " -b, --bam-list FILE list of input BAM filenames, one per line\n" ++ " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" ++ " -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" ++-" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); +++" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); ++ fprintf(fp, ++ " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" ++ " -f, --fasta-ref FILE faidx indexed reference sequence file\n" ++@@ -852,7 +886,7 @@ ++ " -o, --output FILE write output to FILE [standard output]\n" ++ " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" ++ " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" ++-" --threads INT number of extra output compression threads [0]\n" +++" --threads INT use multithreading with INT worker threads [0]\n" ++ "\n" ++ "SNP/INDEL genotype likelihoods options:\n" ++ " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); ++@@ -872,6 +906,10 @@ ++ " -P, --platforms STR comma separated list of platforms for indels [all]\n" ++ "\n" ++ "Notes: Assuming diploid individuals.\n" +++"\n" +++"Example:\n" +++" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" +++" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" ++ "\n"); ++ ++ free(tmp_require); ++@@ -899,6 +937,7 @@ ++ mplp.record_cmd_line = 1; ++ mplp.n_threads = 0; ++ mplp.bsmpl = bam_smpl_init(); +++ mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() ++ ++ static const struct option lopts[] = ++ { ++@@ -1051,7 +1090,7 @@ ++ ++ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) ++ { ++- fprintf(bcftools_stderr,"[warning] The -t DP option is required with --gvcf, 
switching on.\n"); +++ fprintf(bcftools_stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); ++ mplp.fmt_flag |= B2B_FMT_DP; ++ } ++ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) ++--- python-pysam.orig/bcftools/plugins/GTisec.c +++++ python-pysam/bcftools/plugins/GTisec.c ++@@ -320,7 +320,7 @@ ++ int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele) ++ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) ++ { ++- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); +++ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); ++ } ++ ++ gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ++--- python-pysam.orig/bcftools/plugins/GTisec.c.pysam.c +++++ python-pysam/bcftools/plugins/GTisec.c.pysam.c ++@@ -322,7 +322,7 @@ ++ int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele) ++ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) ++ { ++- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); +++ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); ++ } ++ ++ gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ++--- python-pysam.orig/bcftools/plugins/GTsubset.c +++++ python-pysam/bcftools/plugins/GTsubset.c ++@@ -163,7 +163,7 @@ ++ args.ngt_arr = 0; /*! 
hold the number of current GT array entries */ ++ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) ++ { ++- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); +++ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); ++ } ++ ++ gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ++--- python-pysam.orig/bcftools/plugins/GTsubset.c.pysam.c +++++ python-pysam/bcftools/plugins/GTsubset.c.pysam.c ++@@ -165,7 +165,7 @@ ++ args.ngt_arr = 0; /*! hold the number of current GT array entries */ ++ if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) ++ { ++- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); +++ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); ++ } ++ ++ gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ++--- python-pysam.orig/bcftools/plugins/ad-bias.c +++++ python-pysam/bcftools/plugins/ad-bias.c ++@@ -26,6 +26,7 @@ ++ ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -55,6 +56,7 @@ ++ convert_t *convert; ++ kstring_t str; ++ uint64_t nsite,ncmp; +++ int variant_type; ++ } ++ args_t; ++ ++@@ -75,11 +77,12 @@ ++ " run \"bcftools plugin\" for a list of common options\n" ++ "\n" ++ "Plugin options:\n" ++- " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" ++- " -d, --min-dp Minimum required depth [0]\n" ++- " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" ++- " -s, --samples List of sample pairs, one tab-delimited pair per line\n" ++- " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" +++ " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" +++ " -d, --min-dp Minimum 
required depth [0]\n" +++ " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" +++ " -s, --samples List of sample pairs, one tab-delimited pair per line\n" +++ " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" +++ " -v, --variant-type Consider only variants of this type. (By default all variants are considered.)\n" ++ "\n" ++ "Example:\n" ++ " bcftools +ad-bias file.bcf -- -t 1e-3 -s samples.txt\n" ++@@ -117,7 +120,7 @@ ++ ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++@@ -134,11 +137,12 @@ ++ {"format",required_argument,NULL,'f'}, ++ {"samples",required_argument,NULL,'s'}, ++ {"threshold",required_argument,NULL,'t'}, +++ {"variant-type",required_argument,NULL,'v'}, ++ {NULL,0,NULL,0} ++ }; ++ int c; ++ char *tmp; ++- while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:v:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++@@ -155,6 +159,11 @@ ++ if ( *tmp ) error("Could not parse: -t %s\n", optarg); ++ break; ++ case 's': fname = optarg; break; +++ case 'v': +++ if ( !strcasecmp(optarg,"snp") || !strcasecmp(optarg,"snps") ) args.variant_type = VCF_SNP; +++ else if ( !strcasecmp(optarg,"indel") || !strcasecmp(optarg,"indels") ) args.variant_type = VCF_INDEL; +++ else error("Error: Variant type \"%s\" is not supported\n",optarg); +++ break; ++ case 'f': format = optarg; break; ++ case 'h': ++ case '?': ++@@ -168,14 +177,29 @@ ++ printf("# The command line was:\tbcftools +ad-bias %s", argv[0]); ++ for (c=1; cn_allele < 2 ) return NULL; +++ ++ int nad = bcf_get_format_int32(args.hdr, rec, "AD", &args.ad_arr, &args.mad_arr); ++ if ( nad<0 ) return NULL; ++ nad /= bcf_hdr_nsamples(args.hdr); ++@@ -183,30 +207,78 @@ ++ if ( args.convert ) convert_line(args.convert, rec, &args.str); ++ 
args.nsite++; ++ ++- int i; +++ int i,j; ++ for (i=0; ismpl; ++ int32_t *bptr = args.ad_arr + nad*pair->ctrl; ++ ++- if ( aptr[0]==bcf_int32_missing ) continue; ++- if ( bptr[0]==bcf_int32_missing ) continue; ++- if ( aptr[0]+aptr[1] < args.min_dp ) continue; ++- if ( bptr[0]+bptr[1] < args.min_dp ) continue; ++- if ( aptr[1] < args.min_alt_dp && bptr[1] < args.min_alt_dp ) continue; +++ // Find the two most frequent alleles +++ int nbig=-1,nsmall=-1,ibig=-1,ismall=-1; +++ for (j=0; jd.allele[ibig])!=strlen(rec->d.allele[ismall]) ) continue; +++ if ( args.variant_type==VCF_INDEL && strlen(rec->d.allele[ibig])==strlen(rec->d.allele[ismall]) ) continue; +++ } +++ +++ int iref,ialt,nalt; +++ if ( ibig > ismall ) ialt = ibig, iref = ismall, nalt = nbig; +++ else ialt = ismall, iref = ibig, nalt = nsmall; +++ +++ if ( nalt < args.min_alt_dp ) continue; ++ ++ args.ncmp++; ++ ++- int n11 = aptr[0], n12 = aptr[1]; ++- int n21 = bptr[0], n22 = bptr[1]; +++ int n11 = aptr[iref], n12 = aptr[ialt]; +++ int n21 = bptr[iref], n22 = bptr[ialt]; ++ double left, right, fisher; ++ kt_fisher_exact(n11,n12,n21,n22, &left,&right,&fisher); ++ if ( fisher >= args.th ) continue; ++ ++- printf("FT\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%e", +++ printf("FT\t%s\t%s\t%s\t%"PRId64"\t%s\t%s\t%d\t%d\t%d\t%d\t%e", ++ pair->smpl_name,pair->ctrl_name, ++- bcf_hdr_id2name(args.hdr,rec->rid), rec->pos+1, +++ bcf_hdr_id2name(args.hdr,rec->rid), (int64_t) rec->pos+1, +++ rec->d.allele[iref],rec->d.allele[ialt], ++ n11,n12,n21,n22, fisher ++ ); ++ if ( args.convert ) printf("\t%s", args.str.s); ++--- python-pysam.orig/bcftools/plugins/ad-bias.c.pysam.c +++++ python-pysam/bcftools/plugins/ad-bias.c.pysam.c ++@@ -28,6 +28,7 @@ ++ ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -57,6 +58,7 @@ ++ convert_t *convert; ++ kstring_t str; ++ uint64_t nsite,ncmp; +++ int variant_type; ++ } ++ args_t; ++ ++@@ -77,11 +79,12 @@ ++ " run \"bcftools plugin\" for a list of common options\n" ++ 
"\n" ++ "Plugin options:\n" ++- " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" ++- " -d, --min-dp Minimum required depth [0]\n" ++- " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" ++- " -s, --samples List of sample pairs, one tab-delimited pair per line\n" ++- " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" +++ " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" +++ " -d, --min-dp Minimum required depth [0]\n" +++ " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" +++ " -s, --samples List of sample pairs, one tab-delimited pair per line\n" +++ " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" +++ " -v, --variant-type Consider only variants of this type. (By default all variants are considered.)\n" ++ "\n" ++ "Example:\n" ++ " bcftools +ad-bias file.bcf -- -t 1e-3 -s samples.txt\n" ++@@ -119,7 +122,7 @@ ++ ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++@@ -136,11 +139,12 @@ ++ {"format",required_argument,NULL,'f'}, ++ {"samples",required_argument,NULL,'s'}, ++ {"threshold",required_argument,NULL,'t'}, +++ {"variant-type",required_argument,NULL,'v'}, ++ {NULL,0,NULL,0} ++ }; ++ int c; ++ char *tmp; ++- while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:v:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++@@ -157,6 +161,11 @@ ++ if ( *tmp ) error("Could not parse: -t %s\n", optarg); ++ break; ++ case 's': fname = optarg; break; +++ case 'v': +++ if ( !strcasecmp(optarg,"snp") || !strcasecmp(optarg,"snps") ) args.variant_type = VCF_SNP; +++ else if ( !strcasecmp(optarg,"indel") || !strcasecmp(optarg,"indels") ) args.variant_type = VCF_INDEL; +++ else error("Error: Variant type \"%s\" is not supported\n",optarg); +++ break; ++ case 'f': format = optarg; break; ++ case 'h': ++ case '?': ++@@ -170,14 +179,29 @@ ++ fprintf(bcftools_stdout, "# The command line was:\tbcftools +ad-bias %s", argv[0]); ++ for (c=1; cn_allele < 2 ) return NULL; +++ ++ int nad = bcf_get_format_int32(args.hdr, rec, "AD", &args.ad_arr, &args.mad_arr); ++ if ( nad<0 ) return NULL; ++ nad /= bcf_hdr_nsamples(args.hdr); ++@@ -185,30 +209,78 @@ ++ if ( args.convert ) convert_line(args.convert, rec, &args.str); ++ args.nsite++; ++ ++- int i; +++ int i,j; ++ for (i=0; ismpl; ++ int32_t *bptr = args.ad_arr + nad*pair->ctrl; ++ ++- if ( aptr[0]==bcf_int32_missing ) continue; ++- if ( bptr[0]==bcf_int32_missing ) continue; ++- if ( aptr[0]+aptr[1] < args.min_dp ) continue; ++- if ( bptr[0]+bptr[1] < args.min_dp ) continue; ++- if ( aptr[1] < args.min_alt_dp && bptr[1] < args.min_alt_dp ) continue; +++ // Find the two most frequent alleles +++ int nbig=-1,nsmall=-1,ibig=-1,ismall=-1; +++ for (j=0; jd.allele[ibig])!=strlen(rec->d.allele[ismall]) ) continue; +++ if ( 
args.variant_type==VCF_INDEL && strlen(rec->d.allele[ibig])==strlen(rec->d.allele[ismall]) ) continue; +++ } +++ +++ int iref,ialt,nalt; +++ if ( ibig > ismall ) ialt = ibig, iref = ismall, nalt = nbig; +++ else ialt = ismall, iref = ibig, nalt = nsmall; +++ +++ if ( nalt < args.min_alt_dp ) continue; ++ ++ args.ncmp++; ++ ++- int n11 = aptr[0], n12 = aptr[1]; ++- int n21 = bptr[0], n22 = bptr[1]; +++ int n11 = aptr[iref], n12 = aptr[ialt]; +++ int n21 = bptr[iref], n22 = bptr[ialt]; ++ double left, right, fisher; ++ kt_fisher_exact(n11,n12,n21,n22, &left,&right,&fisher); ++ if ( fisher >= args.th ) continue; ++ ++- fprintf(bcftools_stdout, "FT\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%e", +++ fprintf(bcftools_stdout, "FT\t%s\t%s\t%s\t%"PRId64"\t%s\t%s\t%d\t%d\t%d\t%d\t%e", ++ pair->smpl_name,pair->ctrl_name, ++- bcf_hdr_id2name(args.hdr,rec->rid), rec->pos+1, +++ bcf_hdr_id2name(args.hdr,rec->rid), (int64_t) rec->pos+1, +++ rec->d.allele[iref],rec->d.allele[ialt], ++ n11,n12,n21,n22, fisher ++ ); ++ if ( args.convert ) fprintf(bcftools_stdout, "\t%s", args.str.s); ++--- /dev/null +++++ python-pysam/bcftools/plugins/add-variantkey.c ++@@ -0,0 +1,86 @@ +++/* plugins/add-variantkey.c -- add VariantKey INFO field. +++ +++ Copyright (C) 2017-2018 GENOMICS plc. +++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. 
+++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../variantkey.h" +++ +++bcf_hdr_t *in_hdr, *out_hdr; +++ +++const char *about(void) +++{ +++ return "Add VariantKey INFO fields VKX and RSX.\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Add VKX and RSX columns.\n" +++ "Usage: bcftools +add-variantkey [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +add-variantkey in.vcf\n" +++ "\n"; +++} +++ +++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ in_hdr = in; +++ out_hdr = out; +++ bcf_hdr_append(out_hdr, "##INFO="); +++ bcf_hdr_append(out_hdr, "##INFO="); +++ return 0; +++} +++ +++bcf1_t *process(bcf1_t *rec) +++{ +++ uint64_t vk = variantkey( +++ in_hdr->id[BCF_DT_CTG][rec->rid].key, +++ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), +++ rec->pos, +++ rec->d.allele[0], +++ strlen(rec->d.allele[0]), +++ rec->d.allele[1], +++ strlen(rec->d.allele[1])); +++ char vs[17]; +++ variantkey_hex(vk, vs); +++ bcf_update_info_string(out_hdr, rec, "VKX", vs); +++ char rsid[9]; +++ char *ptr = rec->d.id; +++ ptr += 2; // remove 'rs' +++ sprintf(rsid, "%08" PRIx32, (uint32_t)strtoul(ptr, NULL, 10)); +++ bcf_update_info_string(out_hdr, rec, "RSX", rsid); +++ return rec; +++} +++ +++void destroy(void) +++{ +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/add-variantkey.c.pysam.c ++@@ -0,0 +1,88 @@ +++#include 
"bcftools.pysam.h" +++ +++/* plugins/add-variantkey.c -- add VariantKey INFO field. +++ +++ Copyright (C) 2017-2018 GENOMICS plc. +++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. 
*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../variantkey.h" +++ +++bcf_hdr_t *in_hdr, *out_hdr; +++ +++const char *about(void) +++{ +++ return "Add VariantKey INFO fields VKX and RSX.\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Add VKX and RSX columns.\n" +++ "Usage: bcftools +add-variantkey [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +add-variantkey in.vcf\n" +++ "\n"; +++} +++ +++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ in_hdr = in; +++ out_hdr = out; +++ bcf_hdr_append(out_hdr, "##INFO="); +++ bcf_hdr_append(out_hdr, "##INFO="); +++ return 0; +++} +++ +++bcf1_t *process(bcf1_t *rec) +++{ +++ uint64_t vk = variantkey( +++ in_hdr->id[BCF_DT_CTG][rec->rid].key, +++ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), +++ rec->pos, +++ rec->d.allele[0], +++ strlen(rec->d.allele[0]), +++ rec->d.allele[1], +++ strlen(rec->d.allele[1])); +++ char vs[17]; +++ variantkey_hex(vk, vs); +++ bcf_update_info_string(out_hdr, rec, "VKX", vs); +++ char rsid[9]; +++ char *ptr = rec->d.id; +++ ptr += 2; // remove 'rs' +++ sprintf(rsid, "%08" PRIx32, (uint32_t)strtoul(ptr, NULL, 10)); +++ bcf_update_info_string(out_hdr, rec, "RSX", rsid); +++ return rec; +++} +++ +++void destroy(void) +++{ +++} ++--- python-pysam.orig/bcftools/plugins/af-dist.c +++++ python-pysam/bcftools/plugins/af-dist.c ++@@ -170,12 +170,12 @@ ++ if ( dosage==1 ) ++ { ++ args->prob_dist[iRA]++; ++- if ( list_RA ) printf("GT\t%s\t%d\t%s\t1\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pRA); +++ if ( list_RA ) printf("GT\t%s\t%"PRId64"\t%s\t1\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pRA); ++ } ++ else if ( dosage==2 ) ++ { ++ args->prob_dist[iAA]++; ++- if ( list_AA ) printf("GT\t%s\t%d\t%s\t2\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pAA); +++ if ( list_AA ) 
printf("GT\t%s\t%"PRId64"\t%s\t2\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pAA); ++ } ++ } ++ ++--- python-pysam.orig/bcftools/plugins/af-dist.c.pysam.c +++++ python-pysam/bcftools/plugins/af-dist.c.pysam.c ++@@ -172,12 +172,12 @@ ++ if ( dosage==1 ) ++ { ++ args->prob_dist[iRA]++; ++- if ( list_RA ) fprintf(bcftools_stdout, "GT\t%s\t%d\t%s\t1\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pRA); +++ if ( list_RA ) fprintf(bcftools_stdout, "GT\t%s\t%"PRId64"\t%s\t1\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pRA); ++ } ++ else if ( dosage==2 ) ++ { ++ args->prob_dist[iAA]++; ++- if ( list_AA ) fprintf(bcftools_stdout, "GT\t%s\t%d\t%s\t2\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pAA); +++ if ( list_AA ) fprintf(bcftools_stdout, "GT\t%s\t%"PRId64"\t%s\t2\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pAA); ++ } ++ } ++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/allele-length.c ++@@ -0,0 +1,113 @@ +++/* plugins/allele-length.c -- Calculate stats about the length of alleles +++ +++ Copyright (C) 2017-2018 GENOMICS plc. +++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++#include +++#include +++#include +++ +++#define MAXLEN 512 +++ +++static uint64_t numvar; +++static uint64_t numxvar; +++static uint64_t reflen[MAXLEN]; +++static uint64_t altlen[MAXLEN]; +++static uint64_t refaltlen[MAXLEN]; +++static uint64_t xrefaltlen[MAXLEN]; +++ +++const char *about(void) +++{ +++ return "Count the frequency of the length of REF, ALT and REF+ALT\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Count the frequency of the length of alleles.\n" +++ "Usage: bcftools +allele-length [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +allele-length in.vcf\n" +++ "\n"; +++} +++ +++// return 1 if the string contains characters other than standard ACGT base letters, 0 otherwise +++int contain_non_base(const char *str) +++{ +++ int c; +++ while ((c = *str++)) +++ { +++ if ((c != 'A') && (c != 'a') && (c != 'C') && (c != 'c') && (c != 'G') && (c != 'g') && (c != 'T') && (c != 't')) +++ { +++ return 1; +++ } +++ } +++ return 0; +++} +++ +++// Called once at startup, allows to initialize local variables. +++// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. +++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ numvar = 0; +++ int i = 0; +++ for(i = 0; i < MAXLEN; i++) { +++ reflen[i] = 0; +++ altlen[i] = 0; +++ refaltlen[i] = 0; +++ xrefaltlen[i] = 0; +++ } +++ return 1; +++} +++ +++// Called for each VCF record. Return rec to output the line or NULL to suppress output. 
+++bcf1_t *process(bcf1_t *rec) +++{ +++ int rl = strlen(rec->d.allele[0]); +++ int al = strlen(rec->d.allele[1]); +++ reflen[rl] += 1; +++ altlen[al] += 1; +++ refaltlen[(rl + al)] += 1; +++ if ((contain_non_base(rec->d.allele[0])) || (contain_non_base(rec->d.allele[1]))) +++ { +++ xrefaltlen[(rl + al)] += 1; +++ numxvar++; +++ } +++ numvar++; +++ return NULL; +++} +++ +++// Print final output +++void destroy(void) +++{ +++ int i = 0; +++ printf("LENGTH\tREF\tALT\tREF+ALT\tREF+ALT WITH NON-BASE NUCLEOTIDES\n"); +++ for(i = 0; i < MAXLEN; i++) { +++ printf("%d\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, reflen[i], altlen[i], refaltlen[i], xrefaltlen[i]); +++ } +++ printf("\t\t\t%"PRIu64"\t%"PRIu64"\n", numvar, numxvar); +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/allele-length.c.pysam.c ++@@ -0,0 +1,115 @@ +++#include "bcftools.pysam.h" +++ +++/* plugins/allele-length.c -- Calculate stats about the length of alleles +++ +++ Copyright (C) 2017-2018 GENOMICS plc. +++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++#include +++#include +++#include +++ +++#define MAXLEN 512 +++ +++static uint64_t numvar; +++static uint64_t numxvar; +++static uint64_t reflen[MAXLEN]; +++static uint64_t altlen[MAXLEN]; +++static uint64_t refaltlen[MAXLEN]; +++static uint64_t xrefaltlen[MAXLEN]; +++ +++const char *about(void) +++{ +++ return "Count the frequency of the length of REF, ALT and REF+ALT\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Count the frequency of the length of alleles.\n" +++ "Usage: bcftools +allele-length [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +allele-length in.vcf\n" +++ "\n"; +++} +++ +++// return 1 if the string contains characters other than standard ACGT base letters, 0 otherwise +++int contain_non_base(const char *str) +++{ +++ int c; +++ while ((c = *str++)) +++ { +++ if ((c != 'A') && (c != 'a') && (c != 'C') && (c != 'c') && (c != 'G') && (c != 'g') && (c != 'T') && (c != 't')) +++ { +++ return 1; +++ } +++ } +++ return 0; +++} +++ +++// Called once at startup, allows to initialize local variables. +++// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. +++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ numvar = 0; +++ int i = 0; +++ for(i = 0; i < MAXLEN; i++) { +++ reflen[i] = 0; +++ altlen[i] = 0; +++ refaltlen[i] = 0; +++ xrefaltlen[i] = 0; +++ } +++ return 1; +++} +++ +++// Called for each VCF record. Return rec to output the line or NULL to suppress output. 
+++bcf1_t *process(bcf1_t *rec) +++{ +++ int rl = strlen(rec->d.allele[0]); +++ int al = strlen(rec->d.allele[1]); +++ reflen[rl] += 1; +++ altlen[al] += 1; +++ refaltlen[(rl + al)] += 1; +++ if ((contain_non_base(rec->d.allele[0])) || (contain_non_base(rec->d.allele[1]))) +++ { +++ xrefaltlen[(rl + al)] += 1; +++ numxvar++; +++ } +++ numvar++; +++ return NULL; +++} +++ +++// Print final output +++void destroy(void) +++{ +++ int i = 0; +++ fprintf(bcftools_stdout, "LENGTH\tREF\tALT\tREF+ALT\tREF+ALT WITH NON-BASE NUCLEOTIDES\n"); +++ for(i = 0; i < MAXLEN; i++) { +++ fprintf(bcftools_stdout, "%d\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, reflen[i], altlen[i], refaltlen[i], xrefaltlen[i]); +++ } +++ fprintf(bcftools_stdout, "\t\t\t%"PRIu64"\t%"PRIu64"\n", numvar, numxvar); +++} ++--- python-pysam.orig/bcftools/plugins/check-ploidy.c +++++ python-pysam/bcftools/plugins/check-ploidy.c ++@@ -101,7 +101,7 @@ ++ if ( !fmt_gt ) return NULL; // no GT tag ++ ++ if ( args->ndat != rec->n_sample ) ++- error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat); +++ error("Incorrect number of samples at %s:%"PRId64" .. 
found %d, expected %d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_sample,args->ndat); ++ ++ if ( args->rid!=rec->rid && args->rid!=-1 ) ++ { ++@@ -143,7 +143,7 @@ ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break; +++ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); break; ++ } ++ #undef BRANCH_INT ++ ++--- python-pysam.orig/bcftools/plugins/check-ploidy.c.pysam.c +++++ python-pysam/bcftools/plugins/check-ploidy.c.pysam.c ++@@ -103,7 +103,7 @@ ++ if ( !fmt_gt ) return NULL; // no GT tag ++ ++ if ( args->ndat != rec->n_sample ) ++- error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat); +++ error("Incorrect number of samples at %s:%"PRId64" .. 
found %d, expected %d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_sample,args->ndat); ++ ++ if ( args->rid!=rec->rid && args->rid!=-1 ) ++ { ++@@ -145,7 +145,7 @@ ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break; +++ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); break; ++ } ++ #undef BRANCH_INT ++ ++--- python-pysam.orig/bcftools/plugins/check-sparsity.c +++++ python-pysam/bcftools/plugins/check-sparsity.c ++@@ -129,7 +129,7 @@ ++ if ( args->itr ) hts_itr_destroy(args->itr); ++ if ( args->tbx ) tbx_destroy(args->tbx); ++ if ( args->idx ) hts_idx_destroy(args->idx); ++- hts_close(args->fp); +++ if ( hts_close(args->fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); ++ } ++ ++ static void report(args_t *args, const char *reg) ++@@ -247,7 +247,7 @@ ++ args->min_sites = strtol(optarg,&tmp,10); ++ if ( *tmp ) error("Could not parse: -n %s\n", optarg); ++ break; ++- case 'R': args->region_is_file = 1; +++ case 'R': args->region_is_file = 1; // fall-through ++ case 'r': args->region = optarg; break; ++ case 'h': ++ case '?': ++--- python-pysam.orig/bcftools/plugins/check-sparsity.c.pysam.c +++++ python-pysam/bcftools/plugins/check-sparsity.c.pysam.c ++@@ -131,7 +131,7 @@ ++ if ( args->itr ) hts_itr_destroy(args->itr); ++ if ( args->tbx ) tbx_destroy(args->tbx); ++ if ( args->idx ) hts_idx_destroy(args->idx); ++- hts_close(args->fp); +++ if ( hts_close(args->fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fname); ++ } ++ ++ static void report(args_t *args, const char *reg) ++@@ -249,7 +249,7 @@ ++ args->min_sites = strtol(optarg,&tmp,10); ++ if ( *tmp ) error("Could not parse: -n %s\n", optarg); ++ break; ++- case 'R': args->region_is_file = 1; +++ case 'R': args->region_is_file = 1; // fall-through ++ case 'r': args->region = optarg; break; ++ case 'h': ++ case '?': ++--- python-pysam.orig/bcftools/plugins/contrast.c +++++ python-pysam/bcftools/plugins/contrast.c ++@@ -27,12 +27,15 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" ++ #include "filter.h" ++@@ -42,21 +45,29 @@ ++ #define FLT_INCLUDE 1 ++ #define FLT_EXCLUDE 2 ++ +++#define PRINT_PASSOC (1<<0) +++#define PRINT_FASSOC (1<<1) +++#define PRINT_NASSOC (1<<2) +++#define PRINT_NOVELAL (1<<3) +++#define PRINT_NOVELGT (1<<4) +++ ++ typedef struct ++ { ++- int argc, filter_logic, regions_is_file, targets_is_file, output_type; ++- char **argv, *output_fname, *fname, *regions, *targets, *filter_str; ++- char *bg_samples_str, *novel_samples_str; ++- int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl; +++ int argc, filter_logic, regions_is_file, targets_is_file, output_type, force_samples; +++ uint32_t annots; +++ char **argv, *output_fname, *fname, *regions, *targets, *filter_str, *annots_str; +++ char *control_samples_str, *case_samples_str, *max_AC_str; +++ int *control_smpl, *case_smpl, ncontrol_smpl, ncase_smpl; ++ filter_t *filter; ++ bcf_srs_t *sr; ++ bcf_hdr_t *hdr, *hdr_out; ++ htsFile *out_fh; ++ int32_t *gts; ++ int mgts; ++- uint32_t *bg_gts; ++- int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt; ++- kstring_t novel_als_smpl, novel_gts_smpl; +++ uint32_t *control_gts; +++ int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; +++ kstring_t case_als_smpl, case_gts_smpl; +++ int max_AC, nals[4]; 
// nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region ++ } ++ args_t; ++ ++@@ -71,30 +82,110 @@ ++ { ++ return ++ "\n" ++- "About: Finds novel alleles and genotypes in two groups of samples. Adds\n" ++- " an annotation which lists samples with a novel allele (INFO/NOVELAL)\n" ++- " or a novel genotype (INFO/NOVELGT)\n" +++ "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" +++ " genotypes in two groups of samples. Adds the following INFO annotations:\n" +++ " - PASSOC .. Fisher's exact test probability of genotypic association (REF vs non-REF allele)\n" +++ " - FASSOC .. proportion of non-REF allele in controls and cases\n" +++ " - NASSOC .. number of control-ref, control-alt, case-ref and case-alt alleles\n" +++ " - NOVELAL .. lists samples with a novel allele not observed in the control group\n" +++ " - NOVELGT .. lists samples with a novel genotype not observed in the control group\n" ++ "Usage: bcftools +contrast [Plugin Options]\n" ++ "Plugin options:\n" ++- " -0, --bg-samples list of background samples\n" ++- " -1, --novel-samples list of samples where novel allele or genotype are expected\n" ++- " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" ++- " -i, --include EXPR include sites and samples for which the expression is true\n" ++- " -o, --output FILE output file name [stdout]\n" ++- " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" ++- " -r, --regions REG restrict to comma-separated list of regions\n" ++- " -R, --regions-file FILE restrict to regions listed in a file\n" ++- " -t, --targets REG similar to -r but streams rather than index-jumps\n" ++- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ " -a, --annots list of annotations to output [PASSOC,FASSOC,NOVELAL]\n" +++ " -0, --control-samples file or comma-separated list of control (background) 
samples\n" +++ " -1, --case-samples file or comma-separated list of samples where novel allele or genotype is expected\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -f, --max-allele-freq NUM calculate enrichment of rare alleles. Floating point numbers between 0 and 1 are\n" +++ " interpreted as ALT allele frequencies, integers as ALT allele counts\n" +++ " --force-samples continue even if some samples listed in the -0,-1 files are missing from the VCF\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -o, --output FILE output file name [stdout]\n" +++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" ++ "\n" ++ "Example:\n" ++ " # Test if any of the samples a,b is different from the samples c,d,e\n" ++ " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n" +++ "\n" +++ " # Same as above, but read samples from a file. In case of a name collision, the sample name\n" +++ " # has precedence: the existence of a file with a list of samples is not checked unless no such\n" +++ " # sample exists in the VCF. Use a full path (e.g. 
\"./string\" instead of \"string\") to avoid\n" +++ " # name clashes\n" +++ " bcftools +contrast -0 samples0.txt -1 samples1.txt file.bcf\n" +++ "\n" +++ " # The same as above but checks for enrichment of rare alleles, AF<0.001 in this example, in a region\n" +++ " bcftools +contrast -r 20:1000-2000 -f 0.001 -0 samples0.txt -1 samples1.txt file.bcf\n" ++ "\n"; ++ } ++ +++static int cmp_int(const void *a, const void *b) +++{ +++ if ( *((int*)a) < *((int*)b) ) return -1; +++ if ( *((int*)a) > *((int*)b) ) return 1; // a > b must compare greater, otherwise qsort gets an inconsistent ordering +++ return 0; +++} +++static void read_sample_list_or_file(bcf_hdr_t *hdr, const char *str, int **smpl, int *nsmpl, int force_samples) +++{ +++ char **str_list = NULL; +++ int i,j, *list, nlist = 0, is_file, nskipped = 0; +++ +++ for (is_file=0; is_file<=1; is_file++) +++ { +++ if ( str_list ) +++ { +++ for (i=0; i= 0 ) continue; +++ if ( is_file ) +++ { +++ if ( !force_samples ) error("The sample \"%s\" is not present in the VCF. Use --force-samples to proceed anyway.\n", str_list[i]); +++ j--; +++ nskipped++; +++ continue; +++ } +++ break; +++ } +++ if ( i==nlist ) break; +++ } +++ for (i=0; i1?"s":"",nskipped,str,nskipped>1?"are":"is"); +++ free(str_list); +++ qsort(list,nlist,sizeof(*list),cmp_int); +++ *smpl = list; +++ *nsmpl = nlist; +++} +++ ++ static void init_data(args_t *args) ++ { +++ int ntmp, i; +++ char **tmp = hts_readlist(args->annots_str, 0, &ntmp); +++ for (i=0; iannots |= PRINT_PASSOC; +++ else if ( !strcasecmp("FASSOC",tmp[i]) ) args->annots |= PRINT_FASSOC; +++ else if ( !strcasecmp("NASSOC",tmp[i]) ) args->annots |= PRINT_NASSOC; +++ else if ( !strcasecmp("NOVELAL",tmp[i]) ) args->annots |= PRINT_NOVELAL; +++ else if ( !strcasecmp("NOVELGT",tmp[i]) ) args->annots |= PRINT_NOVELGT; +++ else error("The annotation is not recognised: %s\n", tmp[i]); +++ free(tmp[i]); +++ } +++ free(tmp); +++ ++ args->sr = bcf_sr_init(); ++ if ( args->regions ) ++ { ++@@ -105,47 +196,51 @@ ++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: 
%s\n", bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ args->hdr_out = bcf_hdr_dup(args->hdr); ++- bcf_hdr_append(args->hdr_out, "##INFO="); ++- bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_PASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_FASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NOVELAL ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NOVELGT ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); ++ ++ if ( args->filter_str ) ++ args->filter = filter_init(args->hdr, args->filter_str); ++ ++- int i; ++- char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl); ++- args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl); ++- for (i=0; inbg_smpl; i++) ++- { ++- args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); ++- if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); ++- free(smpl[i]); ++- } ++- free(smpl); ++- ++- smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl); ++- args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl); ++- for (i=0; innovel_smpl; i++) ++- { ++- args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); ++- if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); ++- free(smpl[i]); ++- } ++- free(smpl); +++ read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples); +++ read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples); ++ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- 
bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ if ( args->max_AC_str ) +++ { +++ char *tmp; +++ args->max_AC = strtol(args->max_AC_str, &tmp, 10); +++ if ( tmp==args->max_AC_str || *tmp ) +++ { +++ double val = strtod(args->max_AC_str, &tmp); +++ if ( tmp==args->max_AC_str || *tmp ) error("Could not parse the argument: -f, --max-allele-freq %s\n", args->max_AC_str); +++ if ( val<0 || val>1 ) error("Expected integer or float from the range [0,1]: -f, --max-allele-freq %s\n", args->max_AC_str); +++ args->max_AC = val * bcf_hdr_nsamples(args->hdr); +++ if ( !args->max_AC ) args->max_AC = 1; +++ } +++ } ++ } ++ static void destroy_data(args_t *args) ++ { ++ bcf_hdr_destroy(args->hdr_out); ++- hts_close(args->out_fh); ++- free(args->novel_als_smpl.s); ++- free(args->novel_gts_smpl.s); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); +++ free(args->case_als_smpl.s); +++ free(args->case_gts_smpl.s); ++ free(args->gts); ++- free(args->bg_gts); ++- free(args->bg_smpl); ++- free(args->novel_smpl); +++ free(args->control_gts); +++ free(args->control_smpl); +++ free(args->case_smpl); ++ if ( args->filter ) filter_destroy(args->filter); ++ bcf_sr_destroy(args->sr); ++ free(args); ++@@ -191,13 +286,14 @@ ++ ngts /= rec->n_sample; ++ if ( ngts>2 ) error("todo: ploidy=%d\n", ngts); ++ ++- args->nbg_gts = 0; ++- uint32_t bg_als = 0; +++ args->ncontrol_gts = 0; +++ uint32_t control_als = 0; +++ int32_t nals[4] = {0,0,0,0}; // ctrl-ref, ctrl-alt, case-ref, case-alt ++ int i,j; ++- for (i=0; inbg_smpl; i++) +++ for (i=0; incontrol_smpl; i++) ++ { ++ uint32_t gt = 0; ++- int32_t *ptr = args->gts + args->bg_smpl[i]*ngts; +++ int32_t *ptr = args->gts + args->control_smpl[i]*ngts; ++ for (j=0; j32) at %s:%d, skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping the site.\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ args->nskipped++; ++ return -1; ++ } ++- bg_als |= 1<bg_gts, &args->nbg_gts, &args->mbg_gts); +++ if ( args->annots & PRINT_NOVELGT ) +++ binary_insert(gt, &args->control_gts, &args->ncontrol_gts, &args->mcontrol_gts); ++ } ++- if ( !bg_als ) +++ if ( !control_als ) ++ { ++ // all are missing ++ args->nskipped++; ++ return -1; ++ } ++ ++- args->novel_als_smpl.l = 0; ++- args->novel_gts_smpl.l = 0; +++ args->case_als_smpl.l = 0; +++ args->case_gts_smpl.l = 0; ++ ++ int has_gt = 0; ++- for (i=0; innovel_smpl; i++) +++ for (i=0; incase_smpl; i++) ++ { ++- int novel_al = 0; +++ int case_al = 0; ++ uint32_t gt = 0; ++- int32_t *ptr = args->gts + args->novel_smpl[i]*ngts; +++ int32_t *ptr = args->gts + args->case_smpl[i]*ngts; ++ for (j=0; j32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ args->nskipped++; ++ return -1; ++ } ++- if ( !(bg_als & (1<hdr->samples[ args->novel_smpl[i] ]; ++- if ( novel_al ) +++ char *smpl = args->hdr->samples[ args->case_smpl[i] ]; +++ if ( case_al ) ++ { ++- if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl); ++- kputs(smpl, &args->novel_als_smpl); +++ if ( args->annots & PRINT_NOVELAL ) +++ { +++ if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); +++ kputs(smpl, &args->case_als_smpl); +++ } ++ } ++- else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) ) +++ else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) ) ++ { ++- if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl); ++- kputs(smpl, &args->novel_gts_smpl); +++ if ( args->case_gts_smpl.l ) kputc(',', &args->case_gts_smpl); +++ kputs(smpl, &args->case_gts_smpl); ++ } ++ } ++ if ( !has_gt ) ++@@ -273,15 +377,54 @@ ++ args->nskipped++; ++ return -1; ++ } ++- if ( args->novel_als_smpl.l ) +++ +++ if ( args->max_AC ) ++ { ++- bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s); ++- args->nnovel_al++; +++ if ( nals[0]+nals[2] > nals[1]+nals[3] ) +++ { +++ if ( nals[1]+nals[3] <= args->max_AC ) +++ for (i=0; i<4; i++) args->nals[i] += nals[i]; +++ } +++ else +++ { +++ if ( nals[0]+nals[2] <= args->max_AC ) +++ { +++ args->nals[0] += nals[1]; +++ args->nals[1] += nals[0]; +++ args->nals[2] += nals[3]; +++ args->nals[3] += nals[2]; +++ } +++ } +++ } +++ +++ float vals[2]; +++ if ( args->annots & PRINT_PASSOC ) +++ { +++ double left, right, fisher; +++ kt_fisher_exact(nals[0],nals[1],nals[2],nals[3], &left,&right,&fisher); +++ vals[0] = fisher; +++ bcf_update_info_float(args->hdr_out, rec, "PASSOC", vals, 1); ++ } ++- if ( args->novel_gts_smpl.l ) +++ if ( args->annots & PRINT_FASSOC ) ++ { ++- bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s); ++- 
args->nnovel_gt++; +++ if ( nals[0]+nals[1] ) vals[0] = (float)nals[1]/(nals[0]+nals[1]); +++ else bcf_float_set_missing(vals[0]); +++ if ( nals[2]+nals[3] ) vals[1] = (float)nals[3]/(nals[2]+nals[3]); +++ else bcf_float_set_missing(vals[1]); +++ bcf_update_info_float(args->hdr_out, rec, "FASSOC", vals, 2); +++ } +++ if ( args->annots & PRINT_NASSOC ) +++ bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); +++ +++ if ( args->case_als_smpl.l ) +++ { +++ bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); +++ args->ncase_al++; +++ } +++ if ( args->case_gts_smpl.l ) +++ { +++ bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); +++ args->ncase_gt++; ++ } ++ args->ntested++; ++ return 0; ++@@ -292,10 +435,16 @@ ++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); ++ args->argc = argc; args->argv = argv; ++ args->output_fname = "-"; +++ args->annots_str = "PASSOC,FASSOC"; ++ static struct option loptions[] = ++ { ++- {"bg-samples",required_argument,0,'0'}, ++- {"novel-samples",required_argument,0,'1'}, +++ {"max-allele-freq",required_argument,0,'f'}, +++ {"annots",required_argument,0,'a'}, +++ {"force-samples",no_argument,0,1}, +++ {"bg-samples",required_argument,0,'0'}, // renamed to --control-samples, leaving it in for backward compatibility +++ {"control-samples",required_argument,0,'0'}, +++ {"novel-samples",required_argument,0,'1'}, // renamed to --case-samples, leaving it in for backward compatibility +++ {"case-samples",required_argument,0,'1'}, ++ {"include",required_argument,0,'i'}, ++ {"exclude",required_argument,0,'e'}, ++ {"output",required_argument,NULL,'o'}, ++@@ -307,12 +456,15 @@ ++ {NULL,0,NULL,0} ++ }; ++ int c; ++- while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++- case '0': args->bg_samples_str = optarg; break; ++- case '1': 
args->novel_samples_str = optarg; break; +++ case 1 : args->force_samples = 1; break; +++ case 'f': args->max_AC_str = optarg; break; +++ case 'a': args->annots_str = optarg; break; +++ case '0': args->control_samples_str = optarg; break; +++ case '1': args->case_samples_str = optarg; break; ++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 't': args->targets = optarg; break; ++@@ -354,10 +506,18 @@ ++ if ( !pass ) continue; ++ } ++ process_record(args, rec); ++- bcf_write(args->out_fh, args->hdr_out, rec); +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ ++- fprintf(stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt); +++ fprintf(stderr,"Total/processed/skipped/case_allele/case_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->ncase_al, args->ncase_gt); +++ if ( args->max_AC ) +++ { +++ double val1, val2, fisher; +++ kt_fisher_exact(args->nals[0],args->nals[1],args->nals[2],args->nals[3], &val1,&val2,&fisher); +++ val1 = args->nals[0]+args->nals[1] ? (float)args->nals[1]/(args->nals[0]+args->nals[1]) : 0; +++ val2 = args->nals[2]+args->nals[3] ? 
(float)args->nals[3]/(args->nals[2]+args->nals[3]) : 0; +++ fprintf(stderr,"max_AC/PASSOC/FASSOC/NASSOC:\t%d\t%e\t%f,%f\t%d,%d,%d,%d\n",args->max_AC,fisher,val1,val2,args->nals[0],args->nals[1],args->nals[2],args->nals[3]); +++ } ++ destroy_data(args); ++ ++ return 0; ++--- python-pysam.orig/bcftools/plugins/contrast.c.pysam.c +++++ python-pysam/bcftools/plugins/contrast.c.pysam.c ++@@ -29,12 +29,15 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" ++ #include "filter.h" ++@@ -44,21 +47,29 @@ ++ #define FLT_INCLUDE 1 ++ #define FLT_EXCLUDE 2 ++ +++#define PRINT_PASSOC (1<<0) +++#define PRINT_FASSOC (1<<1) +++#define PRINT_NASSOC (1<<2) +++#define PRINT_NOVELAL (1<<3) +++#define PRINT_NOVELGT (1<<4) +++ ++ typedef struct ++ { ++- int argc, filter_logic, regions_is_file, targets_is_file, output_type; ++- char **argv, *output_fname, *fname, *regions, *targets, *filter_str; ++- char *bg_samples_str, *novel_samples_str; ++- int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl; +++ int argc, filter_logic, regions_is_file, targets_is_file, output_type, force_samples; +++ uint32_t annots; +++ char **argv, *output_fname, *fname, *regions, *targets, *filter_str, *annots_str; +++ char *control_samples_str, *case_samples_str, *max_AC_str; +++ int *control_smpl, *case_smpl, ncontrol_smpl, ncase_smpl; ++ filter_t *filter; ++ bcf_srs_t *sr; ++ bcf_hdr_t *hdr, *hdr_out; ++ htsFile *out_fh; ++ int32_t *gts; ++ int mgts; ++- uint32_t *bg_gts; ++- int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt; ++- kstring_t novel_als_smpl, novel_gts_smpl; +++ uint32_t *control_gts; +++ int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; +++ kstring_t case_als_smpl, case_gts_smpl; +++ int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region ++ } ++ args_t; 
++ ++@@ -73,30 +84,110 @@ ++ { ++ return ++ "\n" ++- "About: Finds novel alleles and genotypes in two groups of samples. Adds\n" ++- " an annotation which lists samples with a novel allele (INFO/NOVELAL)\n" ++- " or a novel genotype (INFO/NOVELGT)\n" +++ "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" +++ " genotypes in two groups of samples. Adds the following INFO annotations:\n" +++ " - PASSOC .. Fisher's exact test probability of genotypic association (REF vs non-REF allele)\n" +++ " - FASSOC .. proportion of non-REF allele in controls and cases\n" +++ " - NASSOC .. number of control-ref, control-alt, case-ref and case-alt alleles\n" +++ " - NOVELAL .. lists samples with a novel allele not observed in the control group\n" +++ " - NOVELGT .. lists samples with a novel genotype not observed in the control group\n" ++ "Usage: bcftools +contrast [Plugin Options]\n" ++ "Plugin options:\n" ++- " -0, --bg-samples list of background samples\n" ++- " -1, --novel-samples list of samples where novel allele or genotype are expected\n" ++- " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" ++- " -i, --include EXPR include sites and samples for which the expression is true\n" ++- " -o, --output FILE output file name [bcftools_stdout]\n" ++- " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" ++- " -r, --regions REG restrict to comma-separated list of regions\n" ++- " -R, --regions-file FILE restrict to regions listed in a file\n" ++- " -t, --targets REG similar to -r but streams rather than index-jumps\n" ++- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ " -a, --annots list of annotations to output [PASSOC,FASSOC,NOVELAL]\n" +++ " -0, --control-samples file or comma-separated list of control (background) samples\n" +++ " -1, --case-samples file or comma-separated list of samples where novel allele or 
genotype is expected\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -f, --max-allele-freq NUM calculate enrichment of rare alleles. Floating point numbers between 0 and 1 are\n" +++ " interpreted as ALT allele frequencies, integers as ALT allele counts\n" +++ " --force-samples continue even if some samples listed in the -0,-1 files are missing from the VCF\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -o, --output FILE output file name [bcftools_stdout]\n" +++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" ++ "\n" ++ "Example:\n" ++ " # Test if any of the samples a,b is different from the samples c,d,e\n" ++ " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n" +++ "\n" +++ " # Same as above, but read samples from a file. In case of a name collision, the sample name\n" +++ " # has precedence: the existence of a file with a list of samples is not checked unless no such\n" +++ " # sample exists in the VCF. Use a full path (e.g. 
\"./string\" instead of \"string\") to avoid\n" +++ " # name clashes\n" +++ " bcftools +contrast -0 samples0.txt -1 samples1.txt file.bcf\n" +++ "\n" +++ " # The same as above but checks for enrichment of rare alleles, AF<0.001 in this example, in a region\n" +++ " bcftools +contrast -r 20:1000-2000 -f 0.001 -0 samples0.txt -1 samples1.txt file.bcf\n" ++ "\n"; ++ } ++ +++static int cmp_int(const void *a, const void *b) +++{ +++ if ( *((int*)a) < *((int*)b) ) return -1; +++ if ( *((int*)a) > *((int*)b) ) return -1; +++ return 0; +++} +++static void read_sample_list_or_file(bcf_hdr_t *hdr, const char *str, int **smpl, int *nsmpl, int force_samples) +++{ +++ char **str_list = NULL; +++ int i,j, *list, nlist = 0, is_file, nskipped = 0; +++ +++ for (is_file=0; is_file<=1; is_file++) +++ { +++ if ( str_list ) +++ { +++ for (i=0; i= 0 ) continue; +++ if ( is_file ) +++ { +++ if ( !force_samples ) error("The sample \"%s\" is not present in the VCF. Use --force-samples to proceed anyway.\n", str_list[i]); +++ j--; +++ nskipped++; +++ continue; +++ } +++ break; +++ } +++ if ( i==nlist ) break; +++ } +++ for (i=0; i1?"s":"",nskipped,str,nskipped>1?"are":"is"); +++ free(str_list); +++ qsort(list,nlist,sizeof(*list),cmp_int); +++ *smpl = list; +++ *nsmpl = nlist; +++} +++ ++ static void init_data(args_t *args) ++ { +++ int ntmp, i; +++ char **tmp = hts_readlist(args->annots_str, 0, &ntmp); +++ for (i=0; iannots |= PRINT_PASSOC; +++ else if ( !strcasecmp("FASSOC",tmp[i]) ) args->annots |= PRINT_FASSOC; +++ else if ( !strcasecmp("NASSOC",tmp[i]) ) args->annots |= PRINT_NASSOC; +++ else if ( !strcasecmp("NOVELAL",tmp[i]) ) args->annots |= PRINT_NOVELAL; +++ else if ( !strcasecmp("NOVELGT",tmp[i]) ) args->annots |= PRINT_NOVELGT; +++ else error("The annotation is not recognised: %s\n", tmp[i]); +++ free(tmp[i]); +++ } +++ free(tmp); +++ ++ args->sr = bcf_sr_init(); ++ if ( args->regions ) ++ { ++@@ -107,47 +198,51 @@ ++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: 
%s\n", bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ args->hdr_out = bcf_hdr_dup(args->hdr); ++- bcf_hdr_append(args->hdr_out, "##INFO="); ++- bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_PASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_FASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NASSOC ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NOVELAL ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); +++ if ( args->annots & PRINT_NOVELGT ) +++ bcf_hdr_append(args->hdr_out, "##INFO="); ++ ++ if ( args->filter_str ) ++ args->filter = filter_init(args->hdr, args->filter_str); ++ ++- int i; ++- char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl); ++- args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl); ++- for (i=0; inbg_smpl; i++) ++- { ++- args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); ++- if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); ++- free(smpl[i]); ++- } ++- free(smpl); ++- ++- smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl); ++- args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl); ++- for (i=0; innovel_smpl; i++) ++- { ++- args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); ++- if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); ++- free(smpl[i]); ++- } ++- free(smpl); +++ read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples); +++ read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples); ++ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- 
bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ if ( args->max_AC_str ) +++ { +++ char *tmp; +++ args->max_AC = strtol(args->max_AC_str, &tmp, 10); +++ if ( tmp==args->max_AC_str || *tmp ) +++ { +++ double val = strtod(args->max_AC_str, &tmp); +++ if ( tmp==args->max_AC_str || *tmp ) error("Could not parse the argument: -f, --max-allele-freq %s\n", args->max_AC_str); +++ if ( val<0 || val>1 ) error("Expected integer or float from the range [0,1]: -f, --max-allele-freq %s\n", args->max_AC_str); +++ args->max_AC = val * bcf_hdr_nsamples(args->hdr); +++ if ( !args->max_AC ) args->max_AC = 1; +++ } +++ } ++ } ++ static void destroy_data(args_t *args) ++ { ++ bcf_hdr_destroy(args->hdr_out); ++- hts_close(args->out_fh); ++- free(args->novel_als_smpl.s); ++- free(args->novel_gts_smpl.s); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); +++ free(args->case_als_smpl.s); +++ free(args->case_gts_smpl.s); ++ free(args->gts); ++- free(args->bg_gts); ++- free(args->bg_smpl); ++- free(args->novel_smpl); +++ free(args->control_gts); +++ free(args->control_smpl); +++ free(args->case_smpl); ++ if ( args->filter ) filter_destroy(args->filter); ++ bcf_sr_destroy(args->sr); ++ free(args); ++@@ -193,13 +288,14 @@ ++ ngts /= rec->n_sample; ++ if ( ngts>2 ) error("todo: ploidy=%d\n", ngts); ++ ++- args->nbg_gts = 0; ++- uint32_t bg_als = 0; +++ args->ncontrol_gts = 0; +++ uint32_t control_als = 0; +++ int32_t nals[4] = {0,0,0,0}; // ctrl-ref, ctrl-alt, case-ref, case-alt ++ int i,j; ++- for (i=0; inbg_smpl; i++) +++ for (i=0; incontrol_smpl; i++) ++ { ++ uint32_t gt = 0; ++- int32_t *ptr = args->gts + args->bg_smpl[i]*ngts; +++ int32_t *ptr = args->gts + args->control_smpl[i]*ngts; ++ for (j=0; j32) at %s:%d, skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Too many alleles (>32) at %s:%"PRId64", skipping the site.\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ args->nskipped++; ++ return -1; ++ } ++- bg_als |= 1<bg_gts, &args->nbg_gts, &args->mbg_gts); +++ if ( args->annots & PRINT_NOVELGT ) +++ binary_insert(gt, &args->control_gts, &args->ncontrol_gts, &args->mcontrol_gts); ++ } ++- if ( !bg_als ) +++ if ( !control_als ) ++ { ++ // all are missing ++ args->nskipped++; ++ return -1; ++ } ++ ++- args->novel_als_smpl.l = 0; ++- args->novel_gts_smpl.l = 0; +++ args->case_als_smpl.l = 0; +++ args->case_gts_smpl.l = 0; ++ ++ int has_gt = 0; ++- for (i=0; innovel_smpl; i++) +++ for (i=0; incase_smpl; i++) ++ { ++- int novel_al = 0; +++ int case_al = 0; ++ uint32_t gt = 0; ++- int32_t *ptr = args->gts + args->novel_smpl[i]*ngts; +++ int32_t *ptr = args->gts + args->case_smpl[i]*ngts; ++ for (j=0; j32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Too many alleles (>32) at %s:%"PRId64", skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ args->nskipped++; ++ return -1; ++ } ++- if ( !(bg_als & (1<hdr->samples[ args->novel_smpl[i] ]; ++- if ( novel_al ) +++ char *smpl = args->hdr->samples[ args->case_smpl[i] ]; +++ if ( case_al ) ++ { ++- if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl); ++- kputs(smpl, &args->novel_als_smpl); +++ if ( args->annots & PRINT_NOVELAL ) +++ { +++ if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); +++ kputs(smpl, &args->case_als_smpl); +++ } ++ } ++- else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) ) +++ else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) ) ++ { ++- if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl); ++- kputs(smpl, &args->novel_gts_smpl); +++ if ( args->case_gts_smpl.l ) kputc(',', &args->case_gts_smpl); +++ kputs(smpl, &args->case_gts_smpl); ++ } ++ } ++ if ( !has_gt ) ++@@ -275,15 +379,54 @@ ++ args->nskipped++; ++ return -1; ++ } ++- if ( args->novel_als_smpl.l ) +++ +++ if ( args->max_AC ) ++ { ++- bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s); ++- args->nnovel_al++; +++ if ( nals[0]+nals[2] > nals[1]+nals[3] ) +++ { +++ if ( nals[1]+nals[3] <= args->max_AC ) +++ for (i=0; i<4; i++) args->nals[i] += nals[i]; +++ } +++ else +++ { +++ if ( nals[0]+nals[2] <= args->max_AC ) +++ { +++ args->nals[0] += nals[1]; +++ args->nals[1] += nals[0]; +++ args->nals[2] += nals[3]; +++ args->nals[3] += nals[2]; +++ } +++ } +++ } +++ +++ float vals[2]; +++ if ( args->annots & PRINT_PASSOC ) +++ { +++ double left, right, fisher; +++ kt_fisher_exact(nals[0],nals[1],nals[2],nals[3], &left,&right,&fisher); +++ vals[0] = fisher; +++ bcf_update_info_float(args->hdr_out, rec, "PASSOC", vals, 1); ++ } ++- if ( args->novel_gts_smpl.l ) +++ if ( args->annots & PRINT_FASSOC ) ++ { ++- bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s); ++- 
args->nnovel_gt++; +++ if ( nals[0]+nals[1] ) vals[0] = (float)nals[1]/(nals[0]+nals[1]); +++ else bcf_float_set_missing(vals[0]); +++ if ( nals[2]+nals[3] ) vals[1] = (float)nals[3]/(nals[2]+nals[3]); +++ else bcf_float_set_missing(vals[1]); +++ bcf_update_info_float(args->hdr_out, rec, "FASSOC", vals, 2); +++ } +++ if ( args->annots & PRINT_NASSOC ) +++ bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); +++ +++ if ( args->case_als_smpl.l ) +++ { +++ bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); +++ args->ncase_al++; +++ } +++ if ( args->case_gts_smpl.l ) +++ { +++ bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); +++ args->ncase_gt++; ++ } ++ args->ntested++; ++ return 0; ++@@ -294,10 +437,16 @@ ++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); ++ args->argc = argc; args->argv = argv; ++ args->output_fname = "-"; +++ args->annots_str = "PASSOC,FASSOC"; ++ static struct option loptions[] = ++ { ++- {"bg-samples",required_argument,0,'0'}, ++- {"novel-samples",required_argument,0,'1'}, +++ {"max-allele-freq",required_argument,0,'f'}, +++ {"annots",required_argument,0,'a'}, +++ {"force-samples",no_argument,0,1}, +++ {"bg-samples",required_argument,0,'0'}, // renamed to --control-samples, leaving it in for backward compatibility +++ {"control-samples",required_argument,0,'0'}, +++ {"novel-samples",required_argument,0,'1'}, // renamed to --case-samples, leaving it in for backward compatibility +++ {"case-samples",required_argument,0,'1'}, ++ {"include",required_argument,0,'i'}, ++ {"exclude",required_argument,0,'e'}, ++ {"output",required_argument,NULL,'o'}, ++@@ -309,12 +458,15 @@ ++ {NULL,0,NULL,0} ++ }; ++ int c; ++- while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++- case '0': args->bg_samples_str = optarg; break; ++- case '1': 
args->novel_samples_str = optarg; break; +++ case 1 : args->force_samples = 1; break; +++ case 'f': args->max_AC_str = optarg; break; +++ case 'a': args->annots_str = optarg; break; +++ case '0': args->control_samples_str = optarg; break; +++ case '1': args->case_samples_str = optarg; break; ++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 't': args->targets = optarg; break; ++@@ -356,10 +508,18 @@ ++ if ( !pass ) continue; ++ } ++ process_record(args, rec); ++- bcf_write(args->out_fh, args->hdr_out, rec); +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ ++- fprintf(bcftools_stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt); +++ fprintf(bcftools_stderr,"Total/processed/skipped/case_allele/case_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->ncase_al, args->ncase_gt); +++ if ( args->max_AC ) +++ { +++ double val1, val2, fisher; +++ kt_fisher_exact(args->nals[0],args->nals[1],args->nals[2],args->nals[3], &val1,&val2,&fisher); +++ val1 = args->nals[0]+args->nals[1] ? (float)args->nals[1]/(args->nals[0]+args->nals[1]) : 0; +++ val2 = args->nals[2]+args->nals[3] ? (float)args->nals[3]/(args->nals[2]+args->nals[3]) : 0; +++ fprintf(bcftools_stderr,"max_AC/PASSOC/FASSOC/NASSOC:\t%d\t%e\t%f,%f\t%d,%d,%d,%d\n",args->max_AC,fisher,val1,val2,args->nals[0],args->nals[1],args->nals[2],args->nals[3]); +++ } ++ destroy_data(args); ++ ++ return 0; ++--- python-pysam.orig/bcftools/plugins/counts.c +++++ python-pysam/bcftools/plugins/counts.c ++@@ -1,6 +1,6 @@ ++ /* plugins/counts.c -- counts SNPs, Indels, and total number of sites. ++ ++- Copyright (C) 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2013-2018 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -24,9 +24,10 @@ ++ ++ #include ++ #include +++#include ++ #include ++ ++-int nsamples, nsnps, nindels, nmnps, nothers, nsites; +++uint64_t nsamples, nsnps, nindels, nmnps, nothers, nsites; ++ ++ /* ++ This short description is used to generate the output of `bcftools plugin -l`. ++@@ -71,12 +72,12 @@ ++ */ ++ void destroy(void) ++ { ++- printf("Number of samples: %d\n", nsamples); ++- printf("Number of SNPs: %d\n", nsnps); ++- printf("Number of INDELs: %d\n", nindels); ++- printf("Number of MNPs: %d\n", nmnps); ++- printf("Number of others: %d\n", nothers); ++- printf("Number of sites: %d\n", nsites); +++ printf("Number of samples: %"PRIu64"\n", nsamples); +++ printf("Number of SNPs: %"PRIu64"\n", nsnps); +++ printf("Number of INDELs: %"PRIu64"\n", nindels); +++ printf("Number of MNPs: %"PRIu64"\n", nmnps); +++ printf("Number of others: %"PRIu64"\n", nothers); +++ printf("Number of sites: %"PRIu64"\n", nsites); ++ } ++ ++ ++--- python-pysam.orig/bcftools/plugins/counts.c.pysam.c +++++ python-pysam/bcftools/plugins/counts.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* plugins/counts.c -- counts SNPs, Indels, and total number of sites. ++ ++- Copyright (C) 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2013-2018 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -26,9 +26,10 @@ ++ ++ #include ++ #include +++#include ++ #include ++ ++-int nsamples, nsnps, nindels, nmnps, nothers, nsites; +++uint64_t nsamples, nsnps, nindels, nmnps, nothers, nsites; ++ ++ /* ++ This short description is used to generate the output of `bcftools plugin -l`. 
++@@ -73,12 +74,12 @@ ++ */ ++ void destroy(void) ++ { ++- fprintf(bcftools_stdout, "Number of samples: %d\n", nsamples); ++- fprintf(bcftools_stdout, "Number of SNPs: %d\n", nsnps); ++- fprintf(bcftools_stdout, "Number of INDELs: %d\n", nindels); ++- fprintf(bcftools_stdout, "Number of MNPs: %d\n", nmnps); ++- fprintf(bcftools_stdout, "Number of others: %d\n", nothers); ++- fprintf(bcftools_stdout, "Number of sites: %d\n", nsites); +++ fprintf(bcftools_stdout, "Number of samples: %"PRIu64"\n", nsamples); +++ fprintf(bcftools_stdout, "Number of SNPs: %"PRIu64"\n", nsnps); +++ fprintf(bcftools_stdout, "Number of INDELs: %"PRIu64"\n", nindels); +++ fprintf(bcftools_stdout, "Number of MNPs: %"PRIu64"\n", nmnps); +++ fprintf(bcftools_stdout, "Number of others: %"PRIu64"\n", nothers); +++ fprintf(bcftools_stdout, "Number of sites: %"PRIu64"\n", nsites); ++ } ++ ++ ++--- python-pysam.orig/bcftools/plugins/dosage.c +++++ python-pysam/bcftools/plugins/dosage.c ++@@ -1,6 +1,6 @@ ++ /* plugins/dosage.c -- prints genotype dosage. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014-2018 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -27,6 +27,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ ++ ++@@ -87,7 +88,7 @@ ++ for (j=0; jn_allele); \ ++ int k, l = 0; \ ++ for (j=0; jn_allele; j++) \ ++@@ -103,11 +105,12 @@ ++ { \ ++ dsg[j] += vals[l]; \ ++ dsg[k] += vals[l]; \ +++ l++; \ ++ } \ ++ } \ ++ } \ ++ for (j=1; jn_allele; j++) \ ++- printf("%c%.1f",j==1?'\t':',',dsg[j]); \ +++ printf("%c%f",j==1?'\t':',',dsg[j]); \ ++ ptr += nret; \ ++ } \ ++ } ++@@ -122,7 +125,7 @@ ++ ++ int calc_dosage_GL(bcf1_t *rec) ++ { ++- int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,pl_type); +++ int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,gl_type); ++ if ( nret<0 ) return -1; ++ ++ nret /= rec->n_sample; ++@@ -138,15 +141,15 @@ ++ for (j=0; jn_allele; j++) dsg[j] = -1; \ ++ else \ ++ { \ ++- for (; jn_allele); \ ++ int k, l = 0; \ ++ for (j=0; jn_allele; j++) \ ++@@ -155,15 +158,16 @@ ++ { \ ++ dsg[j] += vals[l]; \ ++ dsg[k] += vals[l]; \ +++ l++; \ ++ } \ ++ } \ ++ } \ ++ for (j=1; jn_allele; j++) \ ++- printf("%c%.1f",j==1?'\t':',',dsg[j]); \ +++ printf("%c%f",j==1?'\t':',',dsg[j]); \ ++ ptr += nret; \ ++ } \ ++ } ++- switch (pl_type) +++ switch (gl_type) ++ { ++ case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_missing,ptr[j]==bcf_int32_vector_end); break; ++ case BCF_HT_REAL: BRANCH(float,bcf_float_is_missing(ptr[j]),bcf_float_is_vector_end(ptr[j])); break; ++@@ -187,7 +191,7 @@ ++ { ++ if ( ptr[j]==bcf_int32_vector_end || bcf_gt_is_missing(ptr[j]) ) break; ++ int idx = bcf_gt_allele(ptr[j]); ++- if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1); +++ if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ dsg[idx] += 1; ++ } ++ if ( !j ) ++@@ -300,7 +304,7 @@ ++ { ++ int i,j, ret; ++ ++- printf("%s\t%d\t%s", 
bcf_seqname(in_hdr,rec),rec->pos+1,rec->d.allele[0]); +++ printf("%s\t%"PRId64"\t%s", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1,rec->d.allele[0]); ++ if ( rec->n_allele == 1 ) printf("\t."); ++ else for (i=1; in_allele; i++) printf("%c%s", i==1?'\t':',', rec->d.allele[i]); ++ if ( rec->n_allele==1 ) ++--- python-pysam.orig/bcftools/plugins/dosage.c.pysam.c +++++ python-pysam/bcftools/plugins/dosage.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* plugins/dosage.c -- prints genotype dosage. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014-2018 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -29,6 +29,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ ++ ++@@ -89,7 +90,7 @@ ++ for (j=0; jn_allele); \ ++ int k, l = 0; \ ++ for (j=0; jn_allele; j++) \ ++@@ -105,11 +107,12 @@ ++ { \ ++ dsg[j] += vals[l]; \ ++ dsg[k] += vals[l]; \ +++ l++; \ ++ } \ ++ } \ ++ } \ ++ for (j=1; jn_allele; j++) \ ++- fprintf(bcftools_stdout, "%c%.1f",j==1?'\t':',',dsg[j]); \ +++ fprintf(bcftools_stdout, "%c%f",j==1?'\t':',',dsg[j]); \ ++ ptr += nret; \ ++ } \ ++ } ++@@ -124,7 +127,7 @@ ++ ++ int calc_dosage_GL(bcf1_t *rec) ++ { ++- int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,pl_type); +++ int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,gl_type); ++ if ( nret<0 ) return -1; ++ ++ nret /= rec->n_sample; ++@@ -140,15 +143,15 @@ ++ for (j=0; jn_allele; j++) dsg[j] = -1; \ ++ else \ ++ { \ ++- for (; jn_allele); \ ++ int k, l = 0; \ ++ for (j=0; jn_allele; j++) \ ++@@ -157,15 +160,16 @@ ++ { \ ++ dsg[j] += vals[l]; \ ++ dsg[k] += vals[l]; \ +++ l++; \ ++ } \ ++ } \ ++ } \ ++ for (j=1; jn_allele; j++) \ ++- fprintf(bcftools_stdout, "%c%.1f",j==1?'\t':',',dsg[j]); \ +++ fprintf(bcftools_stdout, "%c%f",j==1?'\t':',',dsg[j]); \ ++ ptr += nret; \ ++ } \ ++ } ++- switch (pl_type) +++ switch (gl_type) ++ { ++ case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_missing,ptr[j]==bcf_int32_vector_end); 
break; ++ case BCF_HT_REAL: BRANCH(float,bcf_float_is_missing(ptr[j]),bcf_float_is_vector_end(ptr[j])); break; ++@@ -189,7 +193,7 @@ ++ { ++ if ( ptr[j]==bcf_int32_vector_end || bcf_gt_is_missing(ptr[j]) ) break; ++ int idx = bcf_gt_allele(ptr[j]); ++- if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1); +++ if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ dsg[idx] += 1; ++ } ++ if ( !j ) ++@@ -302,7 +306,7 @@ ++ { ++ int i,j, ret; ++ ++- fprintf(bcftools_stdout, "%s\t%d\t%s", bcf_seqname(in_hdr,rec),rec->pos+1,rec->d.allele[0]); +++ fprintf(bcftools_stdout, "%s\t%"PRId64"\t%s", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1,rec->d.allele[0]); ++ if ( rec->n_allele == 1 ) fprintf(bcftools_stdout, "\t."); ++ else for (i=1; in_allele; i++) fprintf(bcftools_stdout, "%c%s", i==1?'\t':',', rec->d.allele[i]); ++ if ( rec->n_allele==1 ) ++--- python-pysam.orig/bcftools/plugins/fill-AN-AC.c +++++ python-pysam/bcftools/plugins/fill-AN-AC.c ++@@ -33,7 +33,7 @@ ++ ++ const char *about(void) ++ { ++- return "Fill INFO fields AN and AC.\n"; +++ return "Fill INFO fields AN and AC. This plugin is DEPRECATED, use fill-tags instead.\n"; ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++--- python-pysam.orig/bcftools/plugins/fill-AN-AC.c.pysam.c +++++ python-pysam/bcftools/plugins/fill-AN-AC.c.pysam.c ++@@ -35,7 +35,7 @@ ++ ++ const char *about(void) ++ { ++- return "Fill INFO fields AN and AC.\n"; +++ return "Fill INFO fields AN and AC. 
This plugin is DEPRECATED, use fill-tags instead.\n"; ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++--- python-pysam.orig/bcftools/plugins/fill-from-fasta.c +++++ python-pysam/bcftools/plugins/fill-from-fasta.c ++@@ -26,6 +26,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -54,6 +55,7 @@ ++ " -h, --header-lines optional file containing header lines to append\n" ++ " -i, --include annotate only records passing filter expression\n" ++ " -e, --exclude annotate only records failing filter expression\n" +++" -N, --replace-non-ACGTN replace non-ACGTN characters with N\n" ++ ++ "\n" ++ "Examples:\n" ++@@ -74,6 +76,7 @@ ++ faidx_t *faidx; ++ int anno = 0; ++ char *column = NULL; +++int replace_nonACGTN = 0; ++ ++ #define ANNO_REF 1 ++ #define ANNO_STRING 2 ++@@ -92,6 +95,7 @@ ++ char *ref_fname = NULL, *header_fname = NULL; ++ static struct option loptions[] = ++ { +++ {"replace-non-ACGTN",no_argument,NULL,'N'}, ++ {"exclude",required_argument,NULL,'e'}, ++ {"include",required_argument,NULL,'i'}, ++ {"column",required_argument,NULL,'c'}, ++@@ -99,12 +103,13 @@ ++ {"header-lines",required_argument,NULL,'h'}, ++ {NULL,0,NULL,0} ++ }; ++- while ((c = getopt_long(argc, argv, "c:f:?h:i:e:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "c:f:?h:i:e:N",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 'e': filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; ++ case 'i': filter_str = optarg; filter_logic |= FLT_INCLUDE; break; +++ case 'N': replace_nonACGTN = 1; break; ++ case 'c': column = optarg; break; ++ case 'f': ref_fname = optarg; break; ++ case 'h': header_fname = optarg; break; ++@@ -132,7 +137,8 @@ ++ } ++ hts_close(file); ++ free(str.s); ++- bcf_hdr_sync(out_hdr); +++ if (bcf_hdr_sync(out_hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ if (!strcasecmp("REF", column)) anno = ANNO_REF; ++ else { ++@@ -181,9 +187,12 @@ ++ // could be sped up here 
by fetching the whole chromosome? could assume ++ // sorted, but revert to this when non-sorted records found? ++ char *fa = faidx_fetch_seq(faidx, bcf_seqname(in_hdr,rec), rec->pos, rec->pos+ref_len-1, &fa_len); ++- if ( !fa ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(in_hdr,rec->rid), rec->pos+1); +++ if ( !fa ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(in_hdr,rec->rid),(int64_t) rec->pos+1); ++ for (i=0; i96 ) fa[i] -= 32; +++ if ( replace_nonACGTN && fa[i]!='A' && fa[i]!='C' && fa[i]!='G' && fa[i]!='T' && fa[i]!='N' ) fa[i] = 'N'; +++ } ++ ++ assert(ref_len == fa_len); ++ if (anno==ANNO_REF) ++--- python-pysam.orig/bcftools/plugins/fill-from-fasta.c.pysam.c +++++ python-pysam/bcftools/plugins/fill-from-fasta.c.pysam.c ++@@ -28,6 +28,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -56,6 +57,7 @@ ++ " -h, --header-lines optional file containing header lines to append\n" ++ " -i, --include annotate only records passing filter expression\n" ++ " -e, --exclude annotate only records failing filter expression\n" +++" -N, --replace-non-ACGTN replace non-ACGTN characters with N\n" ++ ++ "\n" ++ "Examples:\n" ++@@ -76,6 +78,7 @@ ++ faidx_t *faidx; ++ int anno = 0; ++ char *column = NULL; +++int replace_nonACGTN = 0; ++ ++ #define ANNO_REF 1 ++ #define ANNO_STRING 2 ++@@ -94,6 +97,7 @@ ++ char *ref_fname = NULL, *header_fname = NULL; ++ static struct option loptions[] = ++ { +++ {"replace-non-ACGTN",no_argument,NULL,'N'}, ++ {"exclude",required_argument,NULL,'e'}, ++ {"include",required_argument,NULL,'i'}, ++ {"column",required_argument,NULL,'c'}, ++@@ -101,12 +105,13 @@ ++ {"header-lines",required_argument,NULL,'h'}, ++ {NULL,0,NULL,0} ++ }; ++- while ((c = getopt_long(argc, argv, "c:f:?h:i:e:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "c:f:?h:i:e:N",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 'e': filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; 
++ case 'i': filter_str = optarg; filter_logic |= FLT_INCLUDE; break; +++ case 'N': replace_nonACGTN = 1; break; ++ case 'c': column = optarg; break; ++ case 'f': ref_fname = optarg; break; ++ case 'h': header_fname = optarg; break; ++@@ -134,7 +139,8 @@ ++ } ++ hts_close(file); ++ free(str.s); ++- bcf_hdr_sync(out_hdr); +++ if (bcf_hdr_sync(out_hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ if (!strcasecmp("REF", column)) anno = ANNO_REF; ++ else { ++@@ -183,9 +189,12 @@ ++ // could be sped up here by fetching the whole chromosome? could assume ++ // sorted, but revert to this when non-sorted records found? ++ char *fa = faidx_fetch_seq(faidx, bcf_seqname(in_hdr,rec), rec->pos, rec->pos+ref_len-1, &fa_len); ++- if ( !fa ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(in_hdr,rec->rid), rec->pos+1); +++ if ( !fa ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(in_hdr,rec->rid),(int64_t) rec->pos+1); ++ for (i=0; i96 ) fa[i] -= 32; +++ if ( replace_nonACGTN && fa[i]!='A' && fa[i]!='C' && fa[i]!='G' && fa[i]!='T' && fa[i]!='N' ) fa[i] = 'N'; +++ } ++ ++ assert(ref_len == fa_len); ++ if (anno==ANNO_REF) ++--- python-pysam.orig/bcftools/plugins/fill-tags.c +++++ python-pysam/bcftools/plugins/fill-tags.c ++@@ -1,6 +1,6 @@ ++ /* The MIT License ++ ++- Copyright (c) 2015 Genome Research Ltd. +++ Copyright (c) 2015-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -29,10 +29,12 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ ++ #define SET_AN (1<<0) ++@@ -45,6 +47,17 @@ ++ #define SET_MAF (1<<7) ++ #define SET_HWE (1<<8) ++ #define SET_ExcHet (1<<9) +++#define SET_FUNC (1<<10) +++ +++typedef struct _args_t args_t; +++typedef struct _ftf_t ftf_t; +++typedef int (*fill_tag_f)(args_t *, bcf1_t *, ftf_t *); +++struct _ftf_t +++{ +++ char *src_tag, *dst_tag; +++ fill_tag_f func; +++ int *pop_vals; // for now assuming only 1 integer value per annotation +++}; ++ ++ typedef struct ++ { ++@@ -62,7 +75,7 @@ ++ } ++ pop_t; ++ ++-typedef struct +++struct _args_t ++ { ++ bcf_hdr_t *in_hdr, *out_hdr; ++ int npop, tags, drop_missing, gt_id; ++@@ -72,21 +85,24 @@ ++ double *hwe_probs; ++ int mhwe_probs; ++ kstring_t str; ++-} ++-args_t; +++ kbitset_t *bset; +++ ftf_t *ftf; +++ int nftf; +++}; ++ ++ static args_t *args; ++ ++ const char *about(void) ++ { ++- return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n"; +++ return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS and more.\n"; ++ } ++ ++ const char *usage(void) ++ { ++ return ++ "\n" ++- "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n" +++ "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" +++ " or custom INFO/TAG=func(FMT/TAG), use -l for detailed description\n" ++ "Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n" ++ "Options:\n" ++ " run \"bcftools plugin\" for a list of common options\n" ++@@ -94,14 +110,24 @@ ++ "Plugin options:\n" ++ " -d, --drop-missing do not count half-missing genotypes \"./1\" as hemizygous\n" ++ " -l, --list-tags list available tags with description\n" ++- " -t, --tags LIST list of output tags. 
By default, all tags are filled.\n" +++ " -t, --tags LIST list of output tags, \"all\" for all tags\n" ++ " -S, --samples-file FILE list of samples (first column) and comma-separated list of populations (second column)\n" ++ "\n" ++ "Example:\n" ++- " bcftools +fill-tags in.bcf -Ob -o out.bcf\n" +++ " # Print a detailed list of available tags\n" +++ " bcftools +fill-tags -- -l\n" +++ "\n" +++ " # Fill INFO/AN and INFO/AC\n" ++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t AN,AC\n" ++- " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -d\n" +++ "\n" +++ " # Fill all available tags\n" +++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t all\n" +++ "\n" +++ " # Calculate HWE for sample groups (possibly multiple) read from a file\n" ++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -S sample-group.txt -t HWE\n" +++ "\n" +++ " # Calculate total read depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" +++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP=sum(DP)'\n" ++ "\n"; ++ } ++ ++@@ -180,7 +206,7 @@ ++ khash_str2int_destroy_free(smpli); ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); ++ } ++ ++ void init_pops(args_t *args) ++@@ -211,13 +237,118 @@ ++ } ++ } ++ +++void ftf_destroy(args_t *args) +++{ +++ int i; +++ for (i=0; inftf; i++) +++ { +++ ftf_t *ftf = &args->ftf[i]; +++ free(ftf->src_tag); +++ free(ftf->dst_tag); +++ free(ftf->pop_vals); +++ } +++ free(args->ftf); +++} +++int ftf_sum(args_t *args, bcf1_t *rec, ftf_t *ftf) +++{ +++ int nsmpl = bcf_hdr_nsamples(args->in_hdr); +++ int nval = bcf_get_format_int32(args->in_hdr, rec, ftf->src_tag, &args->iarr, &args->miarr); +++ if ( nval<=0 ) return 0; +++ nval /= nsmpl; +++ +++ int i; +++ for (i=0; inpop; i++) +++ ftf->pop_vals[i] = -1; +++ +++ for (i=0; iiarr[i*nval]==bcf_int32_missing || args->iarr[i*nval]==bcf_int32_vector_end ) continue; +++ +++ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; +++ while ( *pop ) +++ { +++ int ipop = (int)(*pop - args->pop); +++ if ( ftf->pop_vals[ipop]<0 ) ftf->pop_vals[ipop] = 0; +++ ftf->pop_vals[ipop] += args->iarr[i*nval]; +++ pop++; +++ } +++ } +++ +++ for (i=0; inpop; i++) +++ { +++ if ( ftf->pop_vals[i]<0 ) continue; +++ args->str.l = 0; +++ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); +++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->pop_vals+i,1)!=0 ) +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); +++ } +++ +++ return 0; +++} +++ +++void hdr_append(args_t *args, char *fmt) +++{ +++ int i; +++ for (i=0; inpop; i++) +++ bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? 
" in " : "",args->pop[i].name); +++} +++ +++int parse_func(args_t *args, char *tag, char *expr) +++{ +++ args->nftf++; +++ args->ftf = (ftf_t *)realloc(args->ftf,sizeof(*args->ftf)*args->nftf); +++ ftf_t *ftf = &args->ftf[ args->nftf - 1 ]; +++ +++ ftf->pop_vals = (int*)calloc(args->npop,sizeof(*ftf->pop_vals)); +++ ftf->dst_tag = (char*)calloc(expr-tag,1); +++ memcpy(ftf->dst_tag, tag, expr-tag-1); +++ +++ if ( !strncasecmp(expr,"sum(",4) ) { ftf->func = ftf_sum; expr += 4; } +++ else error("Error: the expression not recognised: %s\n",tag); +++ +++ char *tmp = expr; +++ while ( *tmp && *tmp!=')' ) tmp++; +++ if ( !*tmp ) error("Error: could not parse: %s\n",tag); +++ +++ ftf->src_tag = (char*)calloc(tmp-expr+2,1); +++ memcpy(ftf->src_tag, expr, tmp-expr); +++ +++ int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,ftf->src_tag); +++ if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) error("Error: the field FORMAT/%s is not present\n",ftf->src_tag); +++ +++ int i = 0; +++ for (i=0; inpop; i++) +++ { +++ args->str.l = 0; +++ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); +++ id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); +++ if ( bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) +++ { +++ if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=BCF_VL_FIXED ) +++ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); +++ if ( bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=1 ) +++ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); +++ if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=BCF_HT_INT ) +++ error("Error: the field INFO/%s already exists with a definition different from Type=Integer\n",args->str.s); +++ } +++ else +++ bcf_hdr_printf(args->out_hdr, "##INFO=",args->str.s,tag,*args->pop[i].name ? 
" in " : "",args->pop[i].name); +++ } +++ return SET_FUNC; +++} ++ int parse_tags(args_t *args, const char *str) ++ { ++- int i, flag = 0, n_tags; ++- char **tags = hts_readlist(str, 0, &n_tags); +++ if ( !args->in_hdr ) error("%s", usage()); +++ +++ int i,j, flag = 0, n_tags; +++ char **tags = hts_readlist(str, 0, &n_tags), *ptr; ++ for(i=0; inpop; i++) ++- bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); ++-} ++- ++ void list_tags(void) ++ { ++ error( ++@@ -256,8 +381,10 @@ ++ "INFO/AC_Hemi Number:A Type:Integer .. Allele counts in hemizygous genotypes\n" ++ "INFO/AF Number:A Type:Float .. Allele frequency\n" ++ "INFO/MAF Number:A Type:Float .. Minor Allele frequency\n" ++- "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306)\n" ++- "INFO/ExcHet Number:A Type:Float .. Probability of excess heterozygosity\n" +++ "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306); 1=good, 0=bad\n" +++ "INFO/ExcHet Number:A Type:Float .. Test excess heterozygosity; 1=good, 0=bad\n" +++ "TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined\n" +++ " expressions such as \"DP=sum(DP)\". 
This is currently very basic, to be extended.\n" ++ ); ++ } ++ ++@@ -266,7 +393,7 @@ ++ args = (args_t*) calloc(1,sizeof(args_t)); ++ args->in_hdr = in; ++ args->out_hdr = out; ++- char *samples_fname = NULL; +++ char *samples_fname = NULL, *tags_str = "all"; ++ static struct option loptions[] = ++ { ++ {"list-tags",0,0,'l'}, ++@@ -282,7 +409,7 @@ ++ { ++ case 'l': list_tags(); break; ++ case 'd': args->drop_missing = 1; break; ++- case 't': args->tags |= parse_tags(args,optarg); break; +++ case 't': tags_str = optarg; break; ++ case 'S': samples_fname = optarg; break; ++ case 'h': ++ case '?': ++@@ -295,12 +422,11 @@ ++ args->gt_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT"); ++ if ( args->gt_id<0 ) error("Error: GT field is not present\n"); ++ ++- if ( !args->tags ) ++- for (c=0; c<=9; c++) args->tags |= 1<tags |= parse_tags(args,tags_str); +++ ++ if ( args->tags & SET_AN ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_AC ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_NS ) hdr_append(args, "##INFO="); ++@@ -309,8 +435,8 @@ ++ if ( args->tags & SET_AC_Hemi ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_AF ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); ++- if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); ++- if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); +++ if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); +++ if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); ++ ++ return 0; ++ } ++@@ -340,7 +466,7 @@ ++ double *probs = args->hwe_probs; ++ ++ // start at midpoint ++- int mid = nrare * (nref + nalt - nrare) / (nref + nalt); +++ int mid = (double)nrare * (nref + nalt - nrare) / (nref + nalt); ++ ++ // check to ensure that midpoint and rare alleles have same parity ++ if ( (nrare & 1) ^ (mid & 1) ) mid++; ++@@ -389,19 +515,17 @@ ++ *p_hwe = prob; ++ } ++ ++-static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, int als) +++static 
inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, kbitset_t *bset) ++ { ++- int ial; ++- for (ial=0; als; ial++) +++ kbitset_iter_t itr; +++ int i; +++ kbs_start(&itr); +++ while ((i = kbs_next(bset, &itr)) >= 0) ++ { ++- if ( als&1 ) ++- { ++- if ( is_half ) pop->counts[ial].nac++; ++- else if ( !is_hom ) pop->counts[ial].nhet++; ++- else if ( !is_hemi ) pop->counts[ial].nhom += 2; ++- else pop->counts[ial].nhemi++; ++- } ++- als >>= 1; +++ if ( is_half ) pop->counts[i].nac++; +++ else if ( !is_hom ) pop->counts[i].nhet++; +++ else if ( !is_hemi ) pop->counts[i].nhom += 2; +++ else pop->counts[i].nhemi++; ++ } ++ pop->ns++; ++ } ++@@ -413,9 +537,13 @@ ++ ++ bcf1_t *process(bcf1_t *rec) ++ { +++ bcf_unpack(rec, BCF_UN_FMT); +++ ++ int i,j, nsmpl = bcf_hdr_nsamples(args->in_hdr);; ++ ++- bcf_unpack(rec, BCF_UN_FMT); +++ for (i=0; inftf; i++) +++ args->ftf[i].func(args, rec, &args->ftf[i]); +++ ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } ++@@ -429,14 +557,15 @@ ++ for (i=0; inpop; i++) ++ clean_counts(&args->pop[i], rec->n_allele); ++ ++- assert( rec->n_allele < 8*sizeof(int) ); +++ if ( kbs_resize(&args->bset, rec->n_allele) < 0 ) error("kbs_resize: failed to store %d bits\n", rec->n_allele); ++ ++ #define BRANCH_INT(type_t,vector_end) \ ++ { \ ++ for (i=0; ip + i*fmt_gt->size); \ ++- int ial, als = 0, nals = 0, is_half, is_hom, is_hemi; \ +++ int ial, nbits = 0, nals = 0, is_half, is_hom, is_hemi; \ +++ kbs_clear(args->bset); \ ++ for (ial=0; ialn; ial++) \ ++ { \ ++ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ ++@@ -445,11 +574,12 @@ ++ nals++; \ ++ \ ++ if ( idx >= rec->n_allele ) \ ++- error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args->in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),rec->pos+1); \ ++- als |= (1<in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); \ +++ if ( !kbs_exists(args->bset, idx) ) nbits++; 
\ +++ kbs_insert(args->bset, idx); \ ++ } \ ++ if ( nals==0 ) continue; /* missing genotype */ \ ++- is_hom = als && !(als & (als-1)); /* only one bit is set */ \ +++ is_hom = nbits==1 ? 1 : 0; /* only one bit is set for homs */ \ ++ if ( nals!=ial ) \ ++ { \ ++ if ( args->drop_missing ) is_hemi = 0, is_half = 1; \ ++@@ -458,14 +588,14 @@ ++ else if ( nals==1 ) is_hemi = 1, is_half = 0; \ ++ else is_hemi = 0, is_half = 0; \ ++ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; \ ++- while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,als); pop++; }\ +++ while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,args->bset); pop++; } \ ++ } \ ++ } ++ switch (fmt_gt->type) { ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),rec->pos+1); break; +++ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); break; ++ } ++ #undef BRANCH_INT ++ ++@@ -476,7 +606,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "NS%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&args->pop[i].ns,1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AN ) ++@@ -491,7 +621,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AN%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&an,1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at 
%s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & (SET_AF | SET_MAF) ) ++@@ -507,25 +637,29 @@ ++ args->farr[j-1] += pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; ++ an = pop->counts[0].nhet + pop->counts[0].nhom + pop->counts[0].nhemi + pop->counts[0].nac; ++ for (j=1; jn_allele; j++) an += args->farr[j-1]; ++- if ( !an ) continue; ++- for (j=1; jn_allele; j++) args->farr[j-1] /= an; +++ if ( an ) +++ for (j=1; jn_allele; j++) args->farr[j-1] /= an; +++ else +++ for (j=1; jn_allele; j++) bcf_float_set_missing(args->farr[j-1]); ++ } ++ if ( args->tags & SET_AF ) ++ { ++ args->str.l = 0; ++ ksprintf(&args->str, "AF%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ if ( args->tags & SET_MAF ) ++ { ++- if ( !an ) continue; ++- for (j=1; jn_allele; j++) ++- if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites +++ if ( an ) +++ { +++ for (j=1; jn_allele; j++) +++ if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites +++ } ++ args->str.l = 0; ++ ksprintf(&args->str, "MAF%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ } ++@@ -543,7 +677,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC%s", 
args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Het ) ++@@ -560,7 +694,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Het%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Hom ) ++@@ -577,7 +711,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Hom%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Hemi && rec->n_allele > 1 ) ++@@ -594,7 +728,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Hemi%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & (SET_HWE|SET_ExcHet) ) ++@@ -625,14 +759,14 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "HWE%s", args->pop[i].suffix); ++ if ( 
bcf_update_info_float(args->out_hdr,rec,args->str.s,fhwe,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ if ( args->tags & SET_ExcHet ) ++ { ++ args->str.l = 0; ++ ksprintf(&args->str, "ExcHet%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fexc_het,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ } ++@@ -650,12 +784,14 @@ ++ free(args->pop[i].smpl); ++ free(args->pop[i].counts); ++ } +++ kbs_destroy(args->bset); ++ free(args->str.s); ++ free(args->pop); ++ free(args->smpl2pop); ++ free(args->iarr); ++ free(args->farr); ++ free(args->hwe_probs); +++ ftf_destroy(args); ++ free(args); ++ } ++ ++--- python-pysam.orig/bcftools/plugins/fill-tags.c.pysam.c +++++ python-pysam/bcftools/plugins/fill-tags.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* The MIT License ++ ++- Copyright (c) 2015 Genome Research Ltd. +++ Copyright (c) 2015-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -31,10 +31,12 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ ++ #define SET_AN (1<<0) ++@@ -47,6 +49,17 @@ ++ #define SET_MAF (1<<7) ++ #define SET_HWE (1<<8) ++ #define SET_ExcHet (1<<9) +++#define SET_FUNC (1<<10) +++ +++typedef struct _args_t args_t; +++typedef struct _ftf_t ftf_t; +++typedef int (*fill_tag_f)(args_t *, bcf1_t *, ftf_t *); +++struct _ftf_t +++{ +++ char *src_tag, *dst_tag; +++ fill_tag_f func; +++ int *pop_vals; // for now assuming only 1 integer value per annotation +++}; ++ ++ typedef struct ++ { ++@@ -64,7 +77,7 @@ ++ } ++ pop_t; ++ ++-typedef struct +++struct _args_t ++ { ++ bcf_hdr_t *in_hdr, *out_hdr; ++ int npop, tags, drop_missing, gt_id; ++@@ -74,21 +87,24 @@ ++ double *hwe_probs; ++ int mhwe_probs; ++ kstring_t str; ++-} ++-args_t; +++ kbitset_t *bset; +++ ftf_t *ftf; +++ int nftf; +++}; ++ ++ static args_t *args; ++ ++ const char *about(void) ++ { ++- return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n"; +++ return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS and more.\n"; ++ } ++ ++ const char *usage(void) ++ { ++ return ++ "\n" ++- "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n" +++ "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" +++ " or custom INFO/TAG=func(FMT/TAG), use -l for detailed description\n" ++ "Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n" ++ "Options:\n" ++ " run \"bcftools plugin\" for a list of common options\n" ++@@ -96,14 +112,24 @@ ++ "Plugin options:\n" ++ " -d, --drop-missing do not count half-missing genotypes \"./1\" as hemizygous\n" ++ " -l, --list-tags list available tags with description\n" ++- " -t, --tags LIST list of output tags. 
By default, all tags are filled.\n" +++ " -t, --tags LIST list of output tags, \"all\" for all tags\n" ++ " -S, --samples-file FILE list of samples (first column) and comma-separated list of populations (second column)\n" ++ "\n" ++ "Example:\n" ++- " bcftools +fill-tags in.bcf -Ob -o out.bcf\n" +++ " # Print a detailed list of available tags\n" +++ " bcftools +fill-tags -- -l\n" +++ "\n" +++ " # Fill INFO/AN and INFO/AC\n" ++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t AN,AC\n" ++- " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -d\n" +++ "\n" +++ " # Fill all available tags\n" +++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t all\n" +++ "\n" +++ " # Calculate HWE for sample groups (possibly multiple) read from a file\n" ++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -S sample-group.txt -t HWE\n" +++ "\n" +++ " # Calculate total read depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" +++ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP=sum(DP)'\n" ++ "\n"; ++ } ++ ++@@ -182,7 +208,7 @@ ++ khash_str2int_destroy_free(smpli); ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); ++ } ++ ++ void init_pops(args_t *args) ++@@ -213,13 +239,118 @@ ++ } ++ } ++ +++void ftf_destroy(args_t *args) +++{ +++ int i; +++ for (i=0; inftf; i++) +++ { +++ ftf_t *ftf = &args->ftf[i]; +++ free(ftf->src_tag); +++ free(ftf->dst_tag); +++ free(ftf->pop_vals); +++ } +++ free(args->ftf); +++} +++int ftf_sum(args_t *args, bcf1_t *rec, ftf_t *ftf) +++{ +++ int nsmpl = bcf_hdr_nsamples(args->in_hdr); +++ int nval = bcf_get_format_int32(args->in_hdr, rec, ftf->src_tag, &args->iarr, &args->miarr); +++ if ( nval<=0 ) return 0; +++ nval /= nsmpl; +++ +++ int i; +++ for (i=0; inpop; i++) +++ ftf->pop_vals[i] = -1; +++ +++ for (i=0; iiarr[i*nval]==bcf_int32_missing || args->iarr[i*nval]==bcf_int32_vector_end ) continue; +++ +++ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; +++ while ( *pop ) +++ { +++ int ipop = (int)(*pop - args->pop); +++ if ( ftf->pop_vals[ipop]<0 ) ftf->pop_vals[ipop] = 0; +++ ftf->pop_vals[ipop] += args->iarr[i*nval]; +++ pop++; +++ } +++ } +++ +++ for (i=0; inpop; i++) +++ { +++ if ( ftf->pop_vals[i]<0 ) continue; +++ args->str.l = 0; +++ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); +++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->pop_vals+i,1)!=0 ) +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); +++ } +++ +++ return 0; +++} +++ +++void hdr_append(args_t *args, char *fmt) +++{ +++ int i; +++ for (i=0; inpop; i++) +++ bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? 
" in " : "",args->pop[i].name); +++} +++ +++int parse_func(args_t *args, char *tag, char *expr) +++{ +++ args->nftf++; +++ args->ftf = (ftf_t *)realloc(args->ftf,sizeof(*args->ftf)*args->nftf); +++ ftf_t *ftf = &args->ftf[ args->nftf - 1 ]; +++ +++ ftf->pop_vals = (int*)calloc(args->npop,sizeof(*ftf->pop_vals)); +++ ftf->dst_tag = (char*)calloc(expr-tag,1); +++ memcpy(ftf->dst_tag, tag, expr-tag-1); +++ +++ if ( !strncasecmp(expr,"sum(",4) ) { ftf->func = ftf_sum; expr += 4; } +++ else error("Error: the expression not recognised: %s\n",tag); +++ +++ char *tmp = expr; +++ while ( *tmp && *tmp!=')' ) tmp++; +++ if ( !*tmp ) error("Error: could not parse: %s\n",tag); +++ +++ ftf->src_tag = (char*)calloc(tmp-expr+2,1); +++ memcpy(ftf->src_tag, expr, tmp-expr); +++ +++ int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,ftf->src_tag); +++ if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) error("Error: the field FORMAT/%s is not present\n",ftf->src_tag); +++ +++ int i = 0; +++ for (i=0; inpop; i++) +++ { +++ args->str.l = 0; +++ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); +++ id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); +++ if ( bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) +++ { +++ if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=BCF_VL_FIXED ) +++ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); +++ if ( bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=1 ) +++ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); +++ if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=BCF_HT_INT ) +++ error("Error: the field INFO/%s already exists with a definition different from Type=Integer\n",args->str.s); +++ } +++ else +++ bcf_hdr_printf(args->out_hdr, "##INFO=",args->str.s,tag,*args->pop[i].name ? 
" in " : "",args->pop[i].name); +++ } +++ return SET_FUNC; +++} ++ int parse_tags(args_t *args, const char *str) ++ { ++- int i, flag = 0, n_tags; ++- char **tags = hts_readlist(str, 0, &n_tags); +++ if ( !args->in_hdr ) error("%s", usage()); +++ +++ int i,j, flag = 0, n_tags; +++ char **tags = hts_readlist(str, 0, &n_tags), *ptr; ++ for(i=0; inpop; i++) ++- bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); ++-} ++- ++ void list_tags(void) ++ { ++ error( ++@@ -258,8 +383,10 @@ ++ "INFO/AC_Hemi Number:A Type:Integer .. Allele counts in hemizygous genotypes\n" ++ "INFO/AF Number:A Type:Float .. Allele frequency\n" ++ "INFO/MAF Number:A Type:Float .. Minor Allele frequency\n" ++- "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306)\n" ++- "INFO/ExcHet Number:A Type:Float .. Probability of excess heterozygosity\n" +++ "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306); 1=good, 0=bad\n" +++ "INFO/ExcHet Number:A Type:Float .. Test excess heterozygosity; 1=good, 0=bad\n" +++ "TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined\n" +++ " expressions such as \"DP=sum(DP)\". 
This is currently very basic, to be extended.\n" ++ ); ++ } ++ ++@@ -268,7 +395,7 @@ ++ args = (args_t*) calloc(1,sizeof(args_t)); ++ args->in_hdr = in; ++ args->out_hdr = out; ++- char *samples_fname = NULL; +++ char *samples_fname = NULL, *tags_str = "all"; ++ static struct option loptions[] = ++ { ++ {"list-tags",0,0,'l'}, ++@@ -284,7 +411,7 @@ ++ { ++ case 'l': list_tags(); break; ++ case 'd': args->drop_missing = 1; break; ++- case 't': args->tags |= parse_tags(args,optarg); break; +++ case 't': tags_str = optarg; break; ++ case 'S': samples_fname = optarg; break; ++ case 'h': ++ case '?': ++@@ -297,12 +424,11 @@ ++ args->gt_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT"); ++ if ( args->gt_id<0 ) error("Error: GT field is not present\n"); ++ ++- if ( !args->tags ) ++- for (c=0; c<=9; c++) args->tags |= 1<tags |= parse_tags(args,tags_str); +++ ++ if ( args->tags & SET_AN ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_AC ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_NS ) hdr_append(args, "##INFO="); ++@@ -311,8 +437,8 @@ ++ if ( args->tags & SET_AC_Hemi ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_AF ) hdr_append(args, "##INFO="); ++ if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); ++- if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); ++- if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); +++ if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); +++ if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); ++ ++ return 0; ++ } ++@@ -342,7 +468,7 @@ ++ double *probs = args->hwe_probs; ++ ++ // start at midpoint ++- int mid = nrare * (nref + nalt - nrare) / (nref + nalt); +++ int mid = (double)nrare * (nref + nalt - nrare) / (nref + nalt); ++ ++ // check to ensure that midpoint and rare alleles have same parity ++ if ( (nrare & 1) ^ (mid & 1) ) mid++; ++@@ -391,19 +517,17 @@ ++ *p_hwe = prob; ++ } ++ ++-static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, int als) +++static 
inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, kbitset_t *bset) ++ { ++- int ial; ++- for (ial=0; als; ial++) +++ kbitset_iter_t itr; +++ int i; +++ kbs_start(&itr); +++ while ((i = kbs_next(bset, &itr)) >= 0) ++ { ++- if ( als&1 ) ++- { ++- if ( is_half ) pop->counts[ial].nac++; ++- else if ( !is_hom ) pop->counts[ial].nhet++; ++- else if ( !is_hemi ) pop->counts[ial].nhom += 2; ++- else pop->counts[ial].nhemi++; ++- } ++- als >>= 1; +++ if ( is_half ) pop->counts[i].nac++; +++ else if ( !is_hom ) pop->counts[i].nhet++; +++ else if ( !is_hemi ) pop->counts[i].nhom += 2; +++ else pop->counts[i].nhemi++; ++ } ++ pop->ns++; ++ } ++@@ -415,9 +539,13 @@ ++ ++ bcf1_t *process(bcf1_t *rec) ++ { +++ bcf_unpack(rec, BCF_UN_FMT); +++ ++ int i,j, nsmpl = bcf_hdr_nsamples(args->in_hdr);; ++ ++- bcf_unpack(rec, BCF_UN_FMT); +++ for (i=0; inftf; i++) +++ args->ftf[i].func(args, rec, &args->ftf[i]); +++ ++ bcf_fmt_t *fmt_gt = NULL; ++ for (i=0; in_fmt; i++) ++ if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } ++@@ -431,14 +559,15 @@ ++ for (i=0; inpop; i++) ++ clean_counts(&args->pop[i], rec->n_allele); ++ ++- assert( rec->n_allele < 8*sizeof(int) ); +++ if ( kbs_resize(&args->bset, rec->n_allele) < 0 ) error("kbs_resize: failed to store %d bits\n", rec->n_allele); ++ ++ #define BRANCH_INT(type_t,vector_end) \ ++ { \ ++ for (i=0; ip + i*fmt_gt->size); \ ++- int ial, als = 0, nals = 0, is_half, is_hom, is_hemi; \ +++ int ial, nbits = 0, nals = 0, is_half, is_hom, is_hemi; \ +++ kbs_clear(args->bset); \ ++ for (ial=0; ialn; ial++) \ ++ { \ ++ if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ ++@@ -447,11 +576,12 @@ ++ nals++; \ ++ \ ++ if ( idx >= rec->n_allele ) \ ++- error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args->in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),rec->pos+1); \ ++- als |= (1<in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); \ +++ if ( !kbs_exists(args->bset, idx) ) nbits++; 
\ +++ kbs_insert(args->bset, idx); \ ++ } \ ++ if ( nals==0 ) continue; /* missing genotype */ \ ++- is_hom = als && !(als & (als-1)); /* only one bit is set */ \ +++ is_hom = nbits==1 ? 1 : 0; /* only one bit is set for homs */ \ ++ if ( nals!=ial ) \ ++ { \ ++ if ( args->drop_missing ) is_hemi = 0, is_half = 1; \ ++@@ -460,14 +590,14 @@ ++ else if ( nals==1 ) is_hemi = 1, is_half = 0; \ ++ else is_hemi = 0, is_half = 0; \ ++ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; \ ++- while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,als); pop++; }\ +++ while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,args->bset); pop++; } \ ++ } \ ++ } ++ switch (fmt_gt->type) { ++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; ++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; ++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; ++- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),rec->pos+1); break; +++ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); break; ++ } ++ #undef BRANCH_INT ++ ++@@ -478,7 +608,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "NS%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&args->pop[i].ns,1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AN ) ++@@ -493,7 +623,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AN%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&an,1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at 
%s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & (SET_AF | SET_MAF) ) ++@@ -509,25 +639,29 @@ ++ args->farr[j-1] += pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; ++ an = pop->counts[0].nhet + pop->counts[0].nhom + pop->counts[0].nhemi + pop->counts[0].nac; ++ for (j=1; jn_allele; j++) an += args->farr[j-1]; ++- if ( !an ) continue; ++- for (j=1; jn_allele; j++) args->farr[j-1] /= an; +++ if ( an ) +++ for (j=1; jn_allele; j++) args->farr[j-1] /= an; +++ else +++ for (j=1; jn_allele; j++) bcf_float_set_missing(args->farr[j-1]); ++ } ++ if ( args->tags & SET_AF ) ++ { ++ args->str.l = 0; ++ ksprintf(&args->str, "AF%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ if ( args->tags & SET_MAF ) ++ { ++- if ( !an ) continue; ++- for (j=1; jn_allele; j++) ++- if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites +++ if ( an ) +++ { +++ for (j=1; jn_allele; j++) +++ if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites +++ } ++ args->str.l = 0; ++ ksprintf(&args->str, "MAF%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ } ++@@ -545,7 +679,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC%s", 
args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Het ) ++@@ -562,7 +696,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Het%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Hom ) ++@@ -579,7 +713,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Hom%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & SET_AC_Hemi && rec->n_allele > 1 ) ++@@ -596,7 +730,7 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "AC_Hemi%s", args->pop[i].suffix); ++ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ if ( args->tags & (SET_HWE|SET_ExcHet) ) ++@@ -627,14 +761,14 @@ ++ args->str.l = 0; ++ ksprintf(&args->str, "HWE%s", args->pop[i].suffix); ++ if ( 
bcf_update_info_float(args->out_hdr,rec,args->str.s,fhwe,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ if ( args->tags & SET_ExcHet ) ++ { ++ args->str.l = 0; ++ ksprintf(&args->str, "ExcHet%s", args->pop[i].suffix); ++ if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fexc_het,rec->n_allele-1)!=0 ) ++- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ } ++ } ++@@ -652,12 +786,14 @@ ++ free(args->pop[i].smpl); ++ free(args->pop[i].counts); ++ } +++ kbs_destroy(args->bset); ++ free(args->str.s); ++ free(args->pop); ++ free(args->smpl2pop); ++ free(args->iarr); ++ free(args->farr); ++ free(args->hwe_probs); +++ ftf_destroy(args); ++ free(args); ++ } ++ ++--- python-pysam.orig/bcftools/plugins/fixploidy.c +++++ python-pysam/bcftools/plugins/fixploidy.c ++@@ -190,7 +190,7 @@ ++ return rec; // GT field not present ++ ++ if ( ngts % n_sample ) ++- error("Error at %s:%d: wrong number of GT fields\n",bcf_seqname(in_hdr,rec),rec->pos+1); +++ error("Error at %s:%"PRId64": wrong number of GT fields\n",bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ ++ if ( force_ploidy==-1 ) ++ ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy); ++@@ -215,7 +215,7 @@ ++ while ( jpos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( ngts!=1 || max_ploidy!=1 ) ++ { ++@@ -232,7 +232,7 @@ ++ while ( jpos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ return rec; ++ } ++--- 
python-pysam.orig/bcftools/plugins/fixploidy.c.pysam.c +++++ python-pysam/bcftools/plugins/fixploidy.c.pysam.c ++@@ -192,7 +192,7 @@ ++ return rec; // GT field not present ++ ++ if ( ngts % n_sample ) ++- error("Error at %s:%d: wrong number of GT fields\n",bcf_seqname(in_hdr,rec),rec->pos+1); +++ error("Error at %s:%"PRId64": wrong number of GT fields\n",bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ ++ if ( force_ploidy==-1 ) ++ ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy); ++@@ -217,7 +217,7 @@ ++ while ( jpos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( ngts!=1 || max_ploidy!=1 ) ++ { ++@@ -234,7 +234,7 @@ ++ while ( jpos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); ++ } ++ return rec; ++ } ++--- python-pysam.orig/bcftools/plugins/fixref.c +++++ python-pysam/bcftools/plugins/fixref.c ++@@ -76,6 +76,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -90,6 +91,7 @@ ++ #define MODE_TOP2FWD 2 ++ #define MODE_FLIP2FWD 3 ++ #define MODE_USE_ID 4 +++#define MODE_REF_ALT 5 ++ ++ typedef struct ++ { ++@@ -128,16 +130,20 @@ ++ "\n" ++ "About: This tool helps to determine and fix strand orientation.\n" ++ " Currently the following modes are recognised:\n" ++- " flip .. flips non-ambiguous SNPs and ignores the rest\n" ++- " id .. swap REF/ALT and GTs using the ID column to determine the REF allele\n" ++- " stats .. collect and print stats\n" ++- " top .. converts from Illumina TOP strand to fwd\n" +++ " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" +++ " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" +++ " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" +++ " stats .. collect and print stats\n" +++ " top .. 
convert from Illumina TOP strand to fwd\n" ++ "\n" ++ " WARNING: Do not use the program blindly, make an effort to\n" ++ " understand what strand convention your data uses! Make sure\n" ++ " the reason for mismatching REF alleles is not a different\n" ++ " reference build!!\n" ++ "\n" +++ " Please check this page before messing up your VCF even more\n" +++ " http://samtools.github.io/bcftools/howtos/plugin.fixref.html\n" +++ "\n" ++ "Usage: bcftools +fixref [General Options] -- [Plugin Options]\n" ++ "Options:\n" ++ " run \"bcftools plugin\" for a list of common options\n" ++@@ -148,7 +154,7 @@ ++ " -i, --use-id Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n" ++ " Download the dbSNP file from\n" ++ " https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n" ++- " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n" +++ " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"ref-alt\", \"top\") [stats]\n" ++ "\n" ++ "Examples:\n" ++ " # run stats\n" ++@@ -189,6 +195,7 @@ ++ if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; ++ else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; ++ else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID; +++ else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; ++ else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; ++ else error("The source strand convention not recognised: %s\n", optarg); ++ break; ++@@ -217,6 +224,8 @@ ++ if ( !swap ) return rec; // only fix the alleles, leaving GTs unchanged ++ ++ int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->ngts); +++ if ( ngts<=0 ) return rec; // no samples, no genotypes +++ ++ int i, j, nsmpl = bcf_hdr_nsamples(args->hdr); ++ ngts /= nsmpl; ++ for (i=0; iskip_rid = rec->rid; ++ return -2; ++ } ++- error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("faidx_fetch_seq failed at %s:%"PRId64"\n", 
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ int ir = nt2int(*ref); ++ free(ref); ++@@ -288,6 +297,7 @@ ++ args->i2m = kh_init(i2m); ++ bcf_srs_t *sr = bcf_sr_init(); ++ if ( bcf_sr_set_regions(sr, chr, 0) != 0 ) goto done; +++ if ( !args->dbsnp_fname ) error("No ID file specified, use -i/--use-id\n"); ++ if ( !bcf_sr_add_reader(sr,args->dbsnp_fname) ) error("Failed to open %s: %s\n", args->dbsnp_fname,bcf_sr_strerror(sr->errnum)); ++ while ( bcf_sr_next_line(sr) ) ++ { ++@@ -330,7 +340,7 @@ ++ ++ ref = kh_val(args->i2m, k).ref; ++ if ( ref!=ir ) ++- error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir)); +++ error("Reference base mismatch at %s:%"PRId64" .. %c vs %c\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,int2nt(ref),int2nt(ir)); ++ ++ if ( ia==ref ) return rec; ++ if ( ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),1); } ++@@ -408,14 +418,22 @@ ++ if ( !args.unsorted && args.pos > rec->pos ) ++ { ++ fprintf(stderr, ++- "Warning: corrected position(s) results in unsorted VCF, for example %s:%d comes after %s:%d\n" +++ "Warning: corrected position(s) results in unsorted VCF, for example %s:%"PRId64" comes after %s:%d\n" ++ " The standard unix `sort` or `vcf-sort` from vcftools can be used to fix the order.\n", ++- bcf_seqname(args.hdr,rec),rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); +++ bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); ++ args.unsorted = 1; ++ } ++ args.pos = rec->pos; ++ return ret; ++ } +++ else if ( args.mode==MODE_REF_ALT ) // only change the REF/ALT column, leave the genotypes as is +++ { +++ if ( ir==ia ) return ret; +++ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),0); } +++ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } +++ if ( ir==revint(ib) ) { args.nflip_swap++; return 
set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),0); } +++ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); +++ } ++ else if ( args.mode==MODE_FLIP2FWD ) ++ { ++ int pair = 1 << ia | 1 << ib; ++@@ -428,7 +446,7 @@ ++ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),1); } ++ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } ++ if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); } ++- error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( args.mode==MODE_TOP2FWD ) ++ { ++@@ -457,8 +475,8 @@ ++ { ++ int len, win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win; ++ char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); ++- if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%d (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),rec->pos+1, end,beg); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); +++ if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%"PRId64" (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1, end,beg); ++ ++ int i, mid = rec->pos - beg, strand = 0; ++ for (i=1; i<=win; i++) ++--- python-pysam.orig/bcftools/plugins/fixref.c.pysam.c +++++ python-pysam/bcftools/plugins/fixref.c.pysam.c ++@@ -78,6 +78,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -92,6 +93,7 @@ ++ #define MODE_TOP2FWD 2 ++ #define MODE_FLIP2FWD 3 ++ #define MODE_USE_ID 4 +++#define MODE_REF_ALT 5 ++ ++ 
typedef struct ++ { ++@@ -130,16 +132,20 @@ ++ "\n" ++ "About: This tool helps to determine and fix strand orientation.\n" ++ " Currently the following modes are recognised:\n" ++- " flip .. flips non-ambiguous SNPs and ignores the rest\n" ++- " id .. swap REF/ALT and GTs using the ID column to determine the REF allele\n" ++- " stats .. collect and print stats\n" ++- " top .. converts from Illumina TOP strand to fwd\n" +++ " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" +++ " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" +++ " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" +++ " stats .. collect and print stats\n" +++ " top .. convert from Illumina TOP strand to fwd\n" ++ "\n" ++ " WARNING: Do not use the program blindly, make an effort to\n" ++ " understand what strand convention your data uses! Make sure\n" ++ " the reason for mismatching REF alleles is not a different\n" ++ " reference build!!\n" ++ "\n" +++ " Please check this page before messing up your VCF even more\n" +++ " http://samtools.github.io/bcftools/howtos/plugin.fixref.html\n" +++ "\n" ++ "Usage: bcftools +fixref [General Options] -- [Plugin Options]\n" ++ "Options:\n" ++ " run \"bcftools plugin\" for a list of common options\n" ++@@ -150,7 +156,7 @@ ++ " -i, --use-id Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n" ++ " Download the dbSNP file from\n" ++ " https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n" ++- " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n" +++ " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"ref-alt\", \"top\") [stats]\n" ++ "\n" ++ "Examples:\n" ++ " # run stats\n" ++@@ -191,6 +197,7 @@ ++ if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; ++ else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; ++ else if ( !strcasecmp(optarg,"id") ) 
args.mode = MODE_USE_ID; +++ else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; ++ else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; ++ else error("The source strand convention not recognised: %s\n", optarg); ++ break; ++@@ -219,6 +226,8 @@ ++ if ( !swap ) return rec; // only fix the alleles, leaving GTs unchanged ++ ++ int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->ngts); +++ if ( ngts<=0 ) return rec; // no samples, no genotypes +++ ++ int i, j, nsmpl = bcf_hdr_nsamples(args->hdr); ++ ngts /= nsmpl; ++ for (i=0; iskip_rid = rec->rid; ++ return -2; ++ } ++- error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); +++ error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); ++ } ++ int ir = nt2int(*ref); ++ free(ref); ++@@ -290,6 +299,7 @@ ++ args->i2m = kh_init(i2m); ++ bcf_srs_t *sr = bcf_sr_init(); ++ if ( bcf_sr_set_regions(sr, chr, 0) != 0 ) goto done; +++ if ( !args->dbsnp_fname ) error("No ID file specified, use -i/--use-id\n"); ++ if ( !bcf_sr_add_reader(sr,args->dbsnp_fname) ) error("Failed to open %s: %s\n", args->dbsnp_fname,bcf_sr_strerror(sr->errnum)); ++ while ( bcf_sr_next_line(sr) ) ++ { ++@@ -332,7 +342,7 @@ ++ ++ ref = kh_val(args->i2m, k).ref; ++ if ( ref!=ir ) ++- error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir)); +++ error("Reference base mismatch at %s:%"PRId64" .. 
%c vs %c\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,int2nt(ref),int2nt(ir)); ++ ++ if ( ia==ref ) return rec; ++ if ( ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),1); } ++@@ -410,14 +420,22 @@ ++ if ( !args.unsorted && args.pos > rec->pos ) ++ { ++ fprintf(bcftools_stderr, ++- "Warning: corrected position(s) results in unsorted VCF, for example %s:%d comes after %s:%d\n" +++ "Warning: corrected position(s) results in unsorted VCF, for example %s:%"PRId64" comes after %s:%d\n" ++ " The standard unix `sort` or `vcf-sort` from vcftools can be used to fix the order.\n", ++- bcf_seqname(args.hdr,rec),rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); +++ bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); ++ args.unsorted = 1; ++ } ++ args.pos = rec->pos; ++ return ret; ++ } +++ else if ( args.mode==MODE_REF_ALT ) // only change the REF/ALT column, leave the genotypes as is +++ { +++ if ( ir==ia ) return ret; +++ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),0); } +++ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } +++ if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),0); } +++ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); +++ } ++ else if ( args.mode==MODE_FLIP2FWD ) ++ { ++ int pair = 1 << ia | 1 << ib; ++@@ -430,7 +448,7 @@ ++ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),1); } ++ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } ++ if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); } ++- error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ error("FIXME: this should not happen %s:%"PRId64"\n", 
bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ } ++ else if ( args.mode==MODE_TOP2FWD ) ++ { ++@@ -459,8 +477,8 @@ ++ { ++ int len, win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win; ++ char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); ++- if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%d (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),rec->pos+1, end,beg); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); +++ if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%"PRId64" (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1, end,beg); ++ ++ int i, mid = rec->pos - beg, strand = 0; ++ for (i=1; i<=win; i++) ++--- python-pysam.orig/bcftools/plugins/guess-ploidy.c +++++ python-pysam/bcftools/plugins/guess-ploidy.c ++@@ -387,7 +387,7 @@ ++ counts->pdip += log(pdip); ++ counts->ncount++; ++ if ( args->verbose>1 ) ++- printf("DBG\t%s\t%d\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), +++ printf("DBG\t%s\t%"PRId64"\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), ++ freq[1],tmp[0],tmp[1],tmp[2],phap,pdip); ++ } ++ } ++@@ -444,7 +444,7 @@ ++ else if ( !strcasecmp(optarg,"hg38") ) region = "chrX:2781480-155701381"; ++ else error("The argument not recognised, expected --genome b37, b38, hg19 or hg38: %s\n", optarg); ++ break; ++- case 'R': region_is_file = 1; +++ case 'R': region_is_file = 1; // fall-through ++ case 'r': region = optarg; break; ++ case 'v': args->verbose++; break; ++ case 't': ++--- python-pysam.orig/bcftools/plugins/guess-ploidy.c.pysam.c +++++ python-pysam/bcftools/plugins/guess-ploidy.c.pysam.c ++@@ -389,7 +389,7 @@ ++ 
counts->pdip += log(pdip); ++ counts->ncount++; ++ if ( args->verbose>1 ) ++- fprintf(bcftools_stdout, "DBG\t%s\t%d\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), +++ fprintf(bcftools_stdout, "DBG\t%s\t%"PRId64"\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), ++ freq[1],tmp[0],tmp[1],tmp[2],phap,pdip); ++ } ++ } ++@@ -446,7 +446,7 @@ ++ else if ( !strcasecmp(optarg,"hg38") ) region = "chrX:2781480-155701381"; ++ else error("The argument not recognised, expected --genome b37, b38, hg19 or hg38: %s\n", optarg); ++ break; ++- case 'R': region_is_file = 1; +++ case 'R': region_is_file = 1; // fall-through ++ case 'r': region = optarg; break; ++ case 'v': args->verbose++; break; ++ case 't': ++--- /dev/null +++++ python-pysam/bcftools/plugins/gvcfz.c ++@@ -0,0 +1,378 @@ +++/* +++ Copyright (C) 2017 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++*/ +++/* +++ Compress gVCF file by resizing gVCF blocks according to specified criteria. +++*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define GQ_KEY_NONE NULL +++#define GQ_KEY_GQ "GQ" +++#define GQ_KEY_RGQ "RGQ" +++ +++typedef struct +++{ +++ int32_t end, min_dp, gq, pl[3], grp; +++ char *gq_key; +++ bcf1_t *rec; +++} +++block_t; +++typedef struct +++{ +++ char *expr; // expression +++ int flt_id; // filter id, -1 for PASS +++ filter_t *flt; // filter +++} +++grp_t; +++typedef struct +++{ +++ filter_t *filter; +++ char *filter_str; +++ int filter_logic; +++ block_t gvcf; +++ htsFile *fh_out; +++ int ngrp; +++ grp_t *grp; +++ char *group_by; +++ int argc, region_is_file, target_is_file, output_type, trim_alts; +++ int32_t *tmpi, mtmpi, mean_min_dp_reported; +++ char **argv, *region, *target, *fname, *output_fname, *keep_tags; +++ bcf_hdr_t *hdr_in, *hdr_out; +++ bcf_srs_t *sr; +++} +++args_t; +++ +++const char *about(void) +++{ +++ return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" +++ "\n" +++ "Usage: bcftools +gvcfz [Options]\n" +++ "Plugin options:\n" +++ " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" +++ " -e, --exclude exclude sites for which the expression is true\n" +++ " -i, --include include sites for which the 
expression is true\n" +++ " -g, --group-by EXPR group gVCF blocks according to the expression\n" +++ " -o, --output FILE write gVCF output to the FILE\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ "Examples:\n" +++ " # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n" +++ " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" +++ "\n" +++ " # Compress all non-reference sites into a single block, remove unused alternate alleles\n" +++ " bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n" +++ "\n"; +++} +++ +++static void init_groups(args_t *args) +++{ +++ args->hdr_out = bcf_hdr_dup(args->hdr_in); +++ bcf_hdr_printf(args->hdr_out, "##INFO="); +++ +++ // avoid nested double quotes in FILTER description +++ char *hdr_str = strdup(args->group_by); +++ char *tmp = hdr_str; +++ while (*tmp) +++ { +++ if ( *tmp=='"' ) *tmp = '\''; +++ tmp++; +++ } +++ +++ char *rmme_str = strdup(args->group_by), *beg = rmme_str; +++ while ( *beg ) +++ { +++ while ( *beg && isspace(*beg) ) beg++; +++ if ( !beg ) break; +++ char *end = beg; +++ while ( *end && *end!=':' ) end++; +++ if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by); +++ *end = 0; +++ char *flt = beg; +++ beg = ++end; +++ while ( *end && *end!=';' ) end++; +++ char tmp = *end; *end = 0; +++ if ( strcmp(flt,"PASS") ) +++ { +++ bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); +++ } +++ args->ngrp++; +++ args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp); +++ grp_t *grp = args->grp + args->ngrp - 1; +++ grp->expr = strdup(beg); +++ grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); +++ if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt); 
+++ if ( !strcmp(flt,"PASS") ) grp->flt_id = -1; +++ +++ // remove trailing spaces +++ beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; } +++ beg = grp->expr; while ( *beg && isspace(*beg) ) beg++; +++ +++ grp->flt = strcmp("-",beg) ? filter_init(args->hdr_in, grp->expr) : NULL; +++ +++ if ( !tmp ) break; +++ beg = end + 1; +++ } +++ free(rmme_str); +++ free(hdr_str); +++} +++ +++static void destroy_data(args_t *args) +++{ +++ int i; +++ for (i=0; ingrp; i++) +++ { +++ if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt); +++ free(args->grp[i].expr); +++ } +++ free(args->grp); +++ +++ if ( args->filter ) filter_destroy(args->filter); +++ if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); +++ +++ bcf_sr_destroy(args->sr); +++ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); +++ if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec); +++ free(args->tmpi); +++ free(args); +++} +++ +++static void flush_block(args_t *args, bcf1_t *rec) +++{ +++ block_t *gvcf = &args->gvcf; +++ if ( gvcf->grp < 0 ) return; +++ if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based +++ +++ if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 ) +++ error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 ) +++ error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ if ( gvcf->gq_key ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 ) +++ error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ } +++ if ( gvcf->pl[0] >=0 ) +++ { +++ if ( 
bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) +++ error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ } +++ if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) +++ bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); +++ +++ if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); +++ +++ gvcf->grp = -1; +++} +++static void process_gvcf(args_t *args) +++{ +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ +++ if ( args->filter ) +++ { +++ int pass = filter_test(args->filter, rec, NULL); +++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; +++ if ( !pass ) return; +++ } +++ +++ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) +++ { +++ if ( args->trim_alts ) +++ { +++ bcf_unpack(rec, BCF_UN_ALL); +++ if ( bcf_trim_alleles(args->hdr_in, rec)<0 ) +++ error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t) rec->pos+1); +++ +++ // trim the ref allele if necessary +++ if ( rec->d.allele[0][1] ) +++ { +++ rec->d.allele[0][1] = 0; +++ bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1); +++ } +++ +++ } +++ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) +++ { +++ // not a gvcf block +++ flush_block(args, rec); +++ if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n"); +++ return; +++ } +++ } +++ +++ int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi); +++ int32_t end = ret==1 ? 
args->tmpi[0] : rec->pos + 1; +++ +++ char *gq_key = GQ_KEY_GQ; +++ ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); +++ if ( ret!=1 ) +++ { +++ gq_key = GQ_KEY_RGQ; +++ if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); +++ if ( ret!=1 ) gq_key = GQ_KEY_NONE; +++ } +++ int32_t gq = ret==1 ? args->tmpi[0] : 0; +++ +++ int32_t min_dp = 0; +++ if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 ) +++ min_dp = args->tmpi[0]; +++ else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 ) +++ min_dp = args->tmpi[0]; +++ else +++ error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); +++ +++ int32_t pl[3] = {-1,-1,-1}; +++ ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi); +++ if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); +++ else if ( ret==3 ) +++ { +++ pl[0] = args->tmpi[0]; +++ pl[1] = args->tmpi[1]; +++ pl[2] = args->tmpi[2]; +++ } +++ +++ int i; +++ for (i=0; ingrp; i++) +++ if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break; +++ +++ if ( args->gvcf.grp != i ) flush_block(args, rec); // new block +++ if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL); // new chromosome +++ +++ if ( args->gvcf.grp >= 0 ) // extend an existing block +++ { +++ if ( args->gvcf.end < end ) args->gvcf.end = end; +++ if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq; +++ if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp; +++ if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0]; +++ if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1]; +++ if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2]; +++ return; +++ } +++ +++ // start a new block +++ args->gvcf.rec = 
bcf_copy(args->gvcf.rec, rec); +++ args->gvcf.grp = i; +++ args->gvcf.min_dp = min_dp; +++ args->gvcf.end = end; +++ args->gvcf.pl[0] = pl[0]; +++ args->gvcf.pl[1] = pl[1]; +++ args->gvcf.pl[2] = pl[2]; +++ args->gvcf.gq_key = gq_key; +++ if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq; +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_type = FT_VCF; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"trim-alt-alleles",required_argument,0,'a'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"group-by",required_argument,NULL,'g'}, +++ {"stats",required_argument,NULL,'s'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'a': args->trim_alts = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 'g': args->group_by = optarg; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->group_by 
) error("Missing the -g option\n"); +++ +++ args->gvcf.rec = bcf_init(); +++ args->gvcf.grp = -1; // the block is inactive +++ args->sr = bcf_sr_init(); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr_in = bcf_sr_get_header(args->sr,0); +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr_in, args->filter_str); +++ init_groups(args); +++ args->fh_out = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); +++ while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); +++ flush_block(args, NULL); +++ +++ destroy_data(args); +++ return 0; +++} +++ +++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/gvcfz.c.pysam.c ++@@ -0,0 +1,380 @@ +++#include "bcftools.pysam.h" +++ +++/* +++ Copyright (C) 2017 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++*/ +++/* +++ Compress gVCF file by resizing gVCF blocks according to specified criteria. +++*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define GQ_KEY_NONE NULL +++#define GQ_KEY_GQ "GQ" +++#define GQ_KEY_RGQ "RGQ" +++ +++typedef struct +++{ +++ int32_t end, min_dp, gq, pl[3], grp; +++ char *gq_key; +++ bcf1_t *rec; +++} +++block_t; +++typedef struct +++{ +++ char *expr; // expression +++ int flt_id; // filter id, -1 for PASS +++ filter_t *flt; // filter +++} +++grp_t; +++typedef struct +++{ +++ filter_t *filter; +++ char *filter_str; +++ int filter_logic; +++ block_t gvcf; +++ htsFile *fh_out; +++ int ngrp; +++ grp_t *grp; +++ char *group_by; +++ int argc, region_is_file, target_is_file, output_type, trim_alts; +++ int32_t *tmpi, mtmpi, mean_min_dp_reported; +++ char **argv, *region, *target, *fname, *output_fname, *keep_tags; +++ bcf_hdr_t *hdr_in, *hdr_out; +++ bcf_srs_t *sr; +++} +++args_t; +++ +++const char *about(void) +++{ +++ return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" +++ "\n" +++ "Usage: bcftools +gvcfz [Options]\n" +++ "Plugin options:\n" +++ " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" +++ " -e, --exclude exclude sites for which the expression is true\n" +++ " -i, --include include sites for which the 
expression is true\n" +++ " -g, --group-by EXPR group gVCF blocks according to the expression\n" +++ " -o, --output FILE write gVCF output to the FILE\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ "Examples:\n" +++ " # Compress blocks by GQ and DP. Multiple blocks separated by a semicolon can be defined\n" +++ " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" +++ "\n" +++ " # Compress all non-reference sites into a single block, remove unused alternate alleles\n" +++ " bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n" +++ "\n"; +++} +++ +++static void init_groups(args_t *args) +++{ +++ args->hdr_out = bcf_hdr_dup(args->hdr_in); +++ bcf_hdr_printf(args->hdr_out, "##INFO="); +++ +++ // avoid nested double quotes in FILTER description +++ char *hdr_str = strdup(args->group_by); +++ char *tmp = hdr_str; +++ while (*tmp) +++ { +++ if ( *tmp=='"' ) *tmp = '\''; +++ tmp++; +++ } +++ +++ char *rmme_str = strdup(args->group_by), *beg = rmme_str; +++ while ( *beg ) +++ { +++ while ( *beg && isspace(*beg) ) beg++; +++ if ( !beg ) break; +++ char *end = beg; +++ while ( *end && *end!=':' ) end++; +++ if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by); +++ *end = 0; +++ char *flt = beg; +++ beg = ++end; +++ while ( *end && *end!=';' ) end++; +++ char tmp = *end; *end = 0; +++ if ( strcmp(flt,"PASS") ) +++ { +++ bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); +++ } +++ args->ngrp++; +++ args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp); +++ grp_t *grp = args->grp + args->ngrp - 1; +++ grp->expr = strdup(beg); +++ grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); +++ if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt); 
+++ if ( !strcmp(flt,"PASS") ) grp->flt_id = -1; +++ +++ // remove trailing spaces +++ beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; } +++ beg = grp->expr; while ( *beg && isspace(*beg) ) beg++; +++ +++ grp->flt = strcmp("-",beg) ? filter_init(args->hdr_in, grp->expr) : NULL; +++ +++ if ( !tmp ) break; +++ beg = end + 1; +++ } +++ free(rmme_str); +++ free(hdr_str); +++} +++ +++static void destroy_data(args_t *args) +++{ +++ int i; +++ for (i=0; ingrp; i++) +++ { +++ if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt); +++ free(args->grp[i].expr); +++ } +++ free(args->grp); +++ +++ if ( args->filter ) filter_destroy(args->filter); +++ if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); +++ +++ bcf_sr_destroy(args->sr); +++ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); +++ if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec); +++ free(args->tmpi); +++ free(args); +++} +++ +++static void flush_block(args_t *args, bcf1_t *rec) +++{ +++ block_t *gvcf = &args->gvcf; +++ if ( gvcf->grp < 0 ) return; +++ if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based +++ +++ if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 ) +++ error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 ) +++ error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ if ( gvcf->gq_key ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 ) +++ error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ } +++ if ( gvcf->pl[0] >=0 ) +++ { +++ if ( 
bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) +++ error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); +++ } +++ if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) +++ bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); +++ +++ if ( bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); +++ +++ gvcf->grp = -1; +++} +++static void process_gvcf(args_t *args) +++{ +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ +++ if ( args->filter ) +++ { +++ int pass = filter_test(args->filter, rec, NULL); +++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; +++ if ( !pass ) return; +++ } +++ +++ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) +++ { +++ if ( args->trim_alts ) +++ { +++ bcf_unpack(rec, BCF_UN_ALL); +++ if ( bcf_trim_alleles(args->hdr_in, rec)<0 ) +++ error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t) rec->pos+1); +++ +++ // trim the ref allele if necessary +++ if ( rec->d.allele[0][1] ) +++ { +++ rec->d.allele[0][1] = 0; +++ bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1); +++ } +++ +++ } +++ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) +++ { +++ // not a gvcf block +++ flush_block(args, rec); +++ if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n"); +++ return; +++ } +++ } +++ +++ int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi); +++ int32_t end = ret==1 ? 
args->tmpi[0] : rec->pos + 1; +++ +++ char *gq_key = GQ_KEY_GQ; +++ ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); +++ if ( ret!=1 ) +++ { +++ gq_key = GQ_KEY_RGQ; +++ if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); +++ if ( ret!=1 ) gq_key = GQ_KEY_NONE; +++ } +++ int32_t gq = ret==1 ? args->tmpi[0] : 0; +++ +++ int32_t min_dp = 0; +++ if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 ) +++ min_dp = args->tmpi[0]; +++ else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 ) +++ min_dp = args->tmpi[0]; +++ else +++ error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); +++ +++ int32_t pl[3] = {-1,-1,-1}; +++ ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi); +++ if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); +++ else if ( ret==3 ) +++ { +++ pl[0] = args->tmpi[0]; +++ pl[1] = args->tmpi[1]; +++ pl[2] = args->tmpi[2]; +++ } +++ +++ int i; +++ for (i=0; ingrp; i++) +++ if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break; +++ +++ if ( args->gvcf.grp != i ) flush_block(args, rec); // new block +++ if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL); // new chromosome +++ +++ if ( args->gvcf.grp >= 0 ) // extend an existing block +++ { +++ if ( args->gvcf.end < end ) args->gvcf.end = end; +++ if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq; +++ if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp; +++ if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0]; +++ if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1]; +++ if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2]; +++ return; +++ } +++ +++ // start a new block +++ args->gvcf.rec = 
bcf_copy(args->gvcf.rec, rec); +++ args->gvcf.grp = i; +++ args->gvcf.min_dp = min_dp; +++ args->gvcf.end = end; +++ args->gvcf.pl[0] = pl[0]; +++ args->gvcf.pl[1] = pl[1]; +++ args->gvcf.pl[2] = pl[2]; +++ args->gvcf.gq_key = gq_key; +++ if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq; +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_type = FT_VCF; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"trim-alt-alleles",required_argument,0,'a'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"group-by",required_argument,NULL,'g'}, +++ {"stats",required_argument,NULL,'s'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'a': args->trim_alts = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 'g': args->group_by = optarg; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->group_by 
) error("Missing the -g option\n"); +++ +++ args->gvcf.rec = bcf_init(); +++ args->gvcf.grp = -1; // the block is inactive +++ args->sr = bcf_sr_init(); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr_in = bcf_sr_get_header(args->sr,0); +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr_in, args->filter_str); +++ init_groups(args); +++ args->fh_out = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); +++ while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); +++ flush_block(args, NULL); +++ +++ destroy_data(args); +++ return 0; +++} +++ +++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/indel-stats.c ++@@ -0,0 +1,753 @@ +++/* The MIT License +++ +++ Copyright (c) 2018 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++static int NVAF = 20; +++static int MAX_LEN = 20; +++ +++static inline int len2bin(int len) +++{ +++ if ( len < -MAX_LEN ) return 0; +++ if ( len > MAX_LEN ) return 2*MAX_LEN; +++ return MAX_LEN + len; +++} +++HTS_UNUSED static inline int bin2len(int bin) +++{ +++ return bin - MAX_LEN; +++} +++static inline int vaf2bin(float vaf) +++{ +++ return vaf*(NVAF-1); +++} +++HTS_UNUSED static inline float bin2vaf(int bin) +++{ +++ return (float)bin/(NVAF-1); +++} +++ +++typedef struct +++{ +++ uint32_t +++ *nvaf, // number of indels genotypes with low VAF (<=0.2) and high VAF (>0.2); use vaf2bin and bin2vaf +++ *nlen, // length distribution (-MAX_LEN,MAX_LEN); use len2bin and bin2len; site-wise unless samples are present +++ npass_gt, // number of indel genotypes passing the filter +++ npass, // number of sites passing the filter +++ nsites, // number of sites total +++ nins, ndel, // number of insertions and deletions, site-wise, not genotype-wise +++ nframeshift, ninframe, // site-wise +++ *nfrac; // number of het indels contributing to dfrac +++ double +++ *dfrac; // minor allele fraction at HET indel genotypes, determined from FORMAT/AD +++} +++stats_t; +++ +++typedef struct +++{ +++ stats_t stats; +++ filter_t *filter; +++ char *expr; +++} +++flt_stats_t; +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ 
int idx[3]; // VCF sample index for father, mother and child +++ int pass; // do all three pass the filters? +++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, regions_is_file, targets_is_file; +++ int nflt_str; +++ char *filter_str, **flt_str; +++ char **argv, *output_fname, *fname, *regions, *targets, *csq_tag, *ped_fname; +++ trio_t *trio; +++ int ntrio, mtrio; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr; +++ flt_stats_t *filters; +++ int nfilters, nsmpl; +++ char *csq_str; +++ int32_t *gt_arr, *ad_arr, *ac; +++ int mgt_arr, mad_arr, mac, mcsq_str; +++ int ngt, ngt1, nad, nad1; +++ int allow_alt2ref_DNMs; // is "0/0 0/1 1/1" (child,father,mother) a valid DNM? +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Calculate indel stats scanning over a range of thresholds simultaneously.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Calculates indel stats. Use curly brackets to scan a range of values simultaneously\n" +++ "Usage: bcftools +indel-stats [Plugin Options]\n" +++ "Plugin options:\n" +++ " --alt2ref-DNM consider GT errors such as 0/1 + 1/1 -> 0/0 a valid DNM\n" +++ " -c, --csq-tag STR VEP or BCSQ tag to determine inframe and frameshift variants [CSQ]\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " --max-len INT maximum indel length to consider [20]\n" +++ " --nvaf INT number of variant allele frequency bins [20]\n" +++ " -o, --output FILE output file name [stdout]\n" +++ " -p, --ped FILE limit the stats to de novo indels\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n" +++ 
"Example:\n" +++ " bcftools +indel-stats -i 'GQ>{10,20,30,40,50}' file.bcf\n" +++ "\n"; +++} +++ +++static void parse_filters(args_t *args) +++{ +++ if ( !args->filter_str ) return; +++ int mflt = 1; +++ args->nflt_str = 1; +++ args->flt_str = (char**) malloc(sizeof(char*)); +++ args->flt_str[0] = strdup(args->filter_str); +++ while (1) +++ { +++ int i, expanded = 0; +++ for (i=args->nflt_str-1; i>=0; i--) +++ { +++ char *exp_beg = strchr(args->flt_str[i], '{'); +++ if ( !exp_beg ) continue; +++ char *exp_end = strchr(exp_beg+1, '}'); +++ if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str); +++ char *beg = exp_beg+1, *mid = beg; +++ while ( midflt_str[i], exp_beg - args->flt_str[i], &tmp); +++ kputsn(beg, mid - beg, &tmp); +++ kputs(exp_end+1, &tmp); +++ args->nflt_str++; +++ hts_expand(char*, args->nflt_str, mflt, args->flt_str); +++ args->flt_str[args->nflt_str-1] = tmp.s; +++ beg = ++mid; +++ } +++ expanded = 1; +++ free(args->flt_str[i]); +++ memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str)); +++ args->nflt_str--; +++ args->flt_str[args->nflt_str] = NULL; +++ } +++ if ( !expanded ) break; +++ } +++ +++ fprintf(stderr,"Collecting data for %d filtering expressions\n", args->nflt_str); +++} +++ +++static int cmp_trios(const void *_a, const void *_b) +++{ +++ trio_t *a = (trio_t *) _a; +++ trio_t *b = (trio_t *) _b; +++ int i; +++ int amin = a->idx[0]; +++ for (i=1; i<3; i++) +++ if ( amin > a->idx[i] ) amin = a->idx[i]; +++ int bmin = b->idx[0]; +++ for (i=1; i<3; i++) +++ if ( bmin > b->idx[i] ) bmin = b->idx[i]; +++ if ( amin < bmin ) return -1; +++ if ( amin > bmin ) return 1; +++ return 0; +++} +++static void parse_ped(args_t *args, char *fname) +++{ +++ htsFile *fp = hts_open(fname, "r"); +++ if ( !fp ) error("Could not read: %s\n", fname); +++ +++ kstring_t str = {0,0,0}; +++ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); +++ +++ int moff = 0, *off = 
NULL; +++ do +++ { +++ // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment +++ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 +++ int ncols = ksplit_core(str.s,0,&moff,&off); +++ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); +++ +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); +++ if ( father<0 ) continue; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); +++ if ( mother<0 ) continue; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); +++ if ( child<0 ) continue; +++ +++ args->ntrio++; +++ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); +++ trio_t *trio = &args->trio[args->ntrio-1]; +++ trio->idx[iFATHER] = father; +++ trio->idx[iMOTHER] = mother; +++ trio->idx[iCHILD] = child; +++ } +++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); +++ +++ fprintf(stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); +++ if ( !args->ntrio ) error("No complete trio identified\n"); +++ +++ // sort the sample by index so that they are accessed more or less sequentially +++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); +++ +++ free(str.s); +++ free(off); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ if ( args->ped_fname ) +++ parse_ped(args, args->ped_fname); +++ +++ parse_filters(args); +++ +++ int i; +++ if ( !args->nflt_str ) +++ { +++ args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t)); +++ args->nfilters = 1; +++ args->filters[0].expr = strdup("all"); +++ args->filters[0].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); +++ args->filters[0].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[0].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[0].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); +++ } +++ else +++ { +++ args->nfilters = args->nflt_str; +++ args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t)); +++ for (i=0; infilters; i++) +++ { +++ args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]); +++ args->filters[i].expr = strdup(args->flt_str[i]); +++ args->filters[i].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); +++ args->filters[i].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[i].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[i].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); +++ +++ // replace tab's with spaces so that the output stays parsable +++ char *tmp = args->filters[i].expr; +++ while ( *tmp ) 
+++ { +++ if ( *tmp=='\t' ) *tmp = ' '; +++ tmp++; +++ } +++ } +++ } +++ args->nsmpl = bcf_hdr_nsamples(args->hdr); +++} +++static void destroy_data(args_t *args) +++{ +++ int i; +++ for (i=0; infilters; i++) +++ { +++ if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter); +++ free(args->filters[i].stats.nvaf); +++ free(args->filters[i].stats.nlen); +++ free(args->filters[i].stats.nfrac); +++ free(args->filters[i].stats.dfrac); +++ free(args->filters[i].expr); +++ } +++ free(args->filters); +++ for (i=0; inflt_str; i++) free(args->flt_str[i]); +++ free(args->flt_str); +++ bcf_sr_destroy(args->sr); +++ free(args->ac); +++ free(args->trio); +++ free(args->csq_str); +++ free(args->gt_arr); +++ free(args->ad_arr); +++ free(args); +++} +++static void report_stats(args_t *args) +++{ +++ int i = 0,j; +++ FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? stdout : fopen(args->output_fname,"w"); +++ if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); +++ fprintf(fh,"# CMD line shows the command line used to generate this output\n"); +++ fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); +++ fprintf(fh,"# SN* summary number for every threshold:\n"); +++ fprintf(fh,"# %d) SN*, filter id\n", ++i); +++ fprintf(fh,"# %d) number of samples (or trios with -p)\n", ++i); +++ fprintf(fh,"# %d) number of indel sites total\n", ++i); +++ fprintf(fh,"# %d) number of indel sites that pass the filter (and, with -p, have a de novo indel)\n", ++i); +++ fprintf(fh,"# %d) number of indel genotypes that pass the filter (and, with -p, are de novo)\n", ++i); +++ fprintf(fh,"# %d) number of insertions (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of deletions (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of frameshifts (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of inframe indels (site-wise, not genotype-wise)\n", ++i); +++ 
fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DVAF* lines report indel variant allele frequency (VAF) distribution for every threshold,\n"); +++ fprintf(fh,"# k-th bin corresponds to the frequency k/(nVAF-1):\n"); +++ fprintf(fh,"# %d) DVAF*, filter id\n", ++i); +++ fprintf(fh,"# %d) nVAF, number of bins which split the [0,1] VAF interval.\n", ++i); +++ fprintf(fh,"# %d-%d) counts of indel genotypes in the VAF bin. For non-reference hets, the VAF of the less supported allele is recorded\n", i+1, i+NVAF); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DLEN* lines report indel length distribution for every threshold. When genotype fields are available,\n"); +++ fprintf(fh,"# the counts correspond to the number of genotypes, otherwise the number of sites are given.\n"); +++ fprintf(fh,"# The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions.\n"); +++ fprintf(fh,"# The firt/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); +++ fprintf(fh,"# %d) DLEN*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DFRAC* lines report the mean minor allele fraction at HET indel genotypes as a function of indel size.\n"); +++ fprintf(fh,"# The format is the same as for DLEN:\n"); +++ fprintf(fh,"# %d) DFRAC*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) mean fraction at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# NFRAC* lines report the number of indels informing the DFRAC distribution.\n"); +++ fprintf(fh,"# %d) NFRAC*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) counts at 
indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ fprintf(fh, "CMD\t%s", args->argv[0]); +++ for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); +++ fprintf(fh, "\n"); +++ for (i=0; infilters; i++) +++ { +++ flt_stats_t *flt = &args->filters[i]; +++ fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); +++ } +++ for (i=0; infilters; i++) +++ { +++ stats_t *stats = &args->filters[i].stats; +++ +++ fprintf(fh,"SN%d", i); +++ fprintf(fh,"\t%u", args->ntrio ? args->ntrio : args->nsmpl); +++ fprintf(fh,"\t%u", stats->nsites); +++ fprintf(fh,"\t%u", stats->npass); +++ fprintf(fh,"\t%u", stats->npass_gt); +++ fprintf(fh,"\t%u", stats->nins); +++ fprintf(fh,"\t%u", stats->ndel); +++ fprintf(fh,"\t%u", stats->nframeshift); +++ fprintf(fh,"\t%u", stats->ninframe); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DVAF%d", i); +++ fprintf(fh,"\t%d", NVAF); +++ for (j=0; jnvaf[j]); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DLEN%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnlen[j]); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DFRAC%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnfrac[j] ) fprintf(fh,"\t%.2f",stats->dfrac[j]/stats->nfrac[j]); +++ else fprintf(fh,"\t."); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"NFRAC%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnfrac[j]); +++ fprintf(fh,"\n"); +++ } +++ if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"stdout" : args->output_fname); +++} +++ +++static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) +++{ +++ int32_t *ptr = arr + ngt1 * idx; +++ if ( bcf_gt_is_missing(ptr[0]) ) return -1; +++ als[0] = bcf_gt_allele(ptr[0]); +++ +++ if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { ptr[1] = ptr[0]; return -2; } +++ +++ if ( bcf_gt_is_missing(ptr[1]) ) return -1; +++ als[1] = bcf_gt_allele(ptr[1]); +++ +++ return 0; +++} +++ +++static inline void update_indel_stats(args_t *args, bcf1_t *rec, stats_t *stats, int ismpl, int *als) +++{ +++ int j; +++ if ( als[0] >= args->nad1 || als[1] >= args->nad1 ) error("Incorrect GT allele at %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,als[0],als[1]); +++ int32_t *ad_ptr = args->ad_arr + ismpl*args->nad1; +++ +++ // find the allele with most support +++ uint32_t ntot = 0; +++ for (j=0; jnad1; j++) +++ { +++ if ( ad_ptr[j]==bcf_int32_missing ) continue; +++ if ( ad_ptr[j]==bcf_int32_vector_end ) break; +++ ntot += ad_ptr[j]; +++ } +++ if ( !ntot ) return; +++ +++ // Find the alternate allele fraction, total and relative. Set al0 to be the more frequent indel allele. +++ // The genotypes have been already sanitized in parse_genotype(). +++ int al0 = als[0], al1 = als[1]; +++ if ( !(bcf_get_variant_type(rec,al0) & VCF_INDEL) ) +++ { +++ if ( !(bcf_get_variant_type(rec,al1) & VCF_INDEL) ) error("FIXME: this should not happen .. %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,al0,al1); +++ al0 = als[1]; al1 = als[0]; +++ } +++ else if ( (bcf_get_variant_type(rec,al1) & VCF_INDEL) && al0!=al1 ) +++ { +++ // Select the more frequent indel allele. 
+++ if ( ad_ptr[al0] < ad_ptr[al1] ) al0 = als[1], al1 = als[0]; +++ +++ // Record length of both indel alleles +++ int bin = len2bin(rec->d.var[al1].n); +++ if ( bin >= 0 ) stats->nlen[bin]++; +++ } +++ +++ float vaf = (float)ad_ptr[al0] / ntot; +++ int bin = vaf2bin(vaf); +++ stats->nvaf[bin]++; +++ +++ // al0 is now the major indel allele +++ int len_bin = len2bin(rec->d.var[al0].n); +++ if ( len_bin < 0 ) return; +++ stats->nlen[len_bin]++; +++ +++ if ( al0!=al1 ) +++ { +++ ntot = ad_ptr[al0] + ad_ptr[al1]; +++ if ( ntot ) +++ { +++ stats->nfrac[len_bin]++; +++ stats->dfrac[len_bin]+= (double)ad_ptr[al0] / ntot; +++ } +++ } +++} +++ +++static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt) +++{ +++ int i,j; +++ uint8_t *smpl_pass = NULL; +++ +++ stats_t *stats = &flt->stats; +++ stats->nsites++; +++ +++ // Find out which samples/trios pass and if the site passes +++ if ( flt->filter ) +++ { +++ int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass); +++ if ( args->ntrio ) +++ { +++ if ( args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; intrio; i++) +++ { +++ int pass_trio = 1; +++ for (j=0; j<3; j++) +++ { +++ int idx = args->trio[i].idx[j]; +++ if ( smpl_pass[idx] ) { pass_trio = 0; break; } +++ } +++ args->trio[i].pass = pass_trio; +++ if ( pass_trio ) pass_site = 1; +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; intrio; i++) args->trio[i].pass = 1; +++ } +++ else if ( !pass_site ) return; +++ else if ( smpl_pass ) +++ { +++ pass_site = 0; +++ for (i=0; intrio; i++) +++ { +++ int pass_trio = 1; +++ for (j=0; j<3; j++) +++ { +++ int idx = args->trio[i].idx[j]; +++ if ( !smpl_pass[idx] ) { pass_trio = 0; break; } +++ } +++ args->trio[i].pass = pass_trio; +++ if ( pass_trio ) pass_site = 1; +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; intrio; i++) args->trio[i].pass = 1; +++ } +++ else +++ { +++ if ( 
args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; insmpl; i++) +++ { +++ if ( smpl_pass[i] ) smpl_pass[i] = 0; +++ else { smpl_pass[i] = 1; pass_site = 1; } +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; insmpl; i++) smpl_pass[i] = 1; +++ } +++ else if ( !pass_site ) return; +++ } +++ } +++ +++ args->ngt = 0; +++ if ( args->nsmpl ) +++ { +++ // Get the genotypes +++ args->ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); +++ args->ngt1 = args->ngt / rec->n_sample; +++ +++ if ( args->ngt>0 ) +++ { +++ // Get the AD counts +++ args->nad = bcf_get_format_int32(args->hdr, rec, "AD", &args->ad_arr, &args->mad_arr); +++ args->nad1 = args->nad / rec->n_sample; +++ if ( args->nad>0 && args->nad1 != rec->n_allele ) error("Incorrect number of FORMAT/AD values at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ } +++ } +++ +++ // Is there a star allele? Don't count overlapping deletions twice +++ int star_allele = -1; +++ for (i=1; in_allele; i++) +++ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } +++ +++ +++ if ( args->ngt>0 && args->ntrio ) +++ { +++ int is_dnm = 0; +++ for (i=0; intrio; i++) +++ { +++ if ( flt->filter && !args->trio[i].pass ) continue; +++ +++ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. +++ // the order is: child, father, mother +++ int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue; +++ +++ // Is it a DNM? 
+++ if ( !args->allow_alt2ref_DNMs && als_child[0]==0 && als_child[1]==0 ) continue; +++ if ( (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ) continue; +++ if ( (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ) continue; +++ if ( als_child[0]==star_allele || als_child[1]==star_allele ) continue; // don't count the same event multiple times +++ if ( als_father[0]==star_allele || als_father[1]==star_allele ) continue; +++ if ( als_mother[0]==star_allele || als_mother[1]==star_allele ) continue; +++ +++ int child_is_indel = (bcf_get_variant_type(rec,als_child[0]) & VCF_INDEL) || (bcf_get_variant_type(rec,als_child[1]) & VCF_INDEL) ? 1 : 0; +++ +++ if ( !args->allow_alt2ref_DNMs ) +++ { +++ if ( !child_is_indel ) continue; +++ } +++ else +++ { +++ if ( !child_is_indel && +++ !(bcf_get_variant_type(rec,als_father[0]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_father[1]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_mother[0]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_mother[1]) & VCF_INDEL) ) continue; // not an indel, in any sample +++ } +++ +++ if ( child_is_indel ) +++ update_indel_stats(args, rec, stats, args->trio[i].idx[iCHILD], als_child); +++ +++ //printf("MERR\t%s\t%d\t%s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[iCHILD]]); +++ +++ stats->npass_gt++; +++ +++ is_dnm = 1; +++ } +++ if ( !is_dnm ) return; +++ } +++ else if ( args->ngt>0 ) +++ { +++ for (i=0; insmpl; i++) +++ { +++ if ( smpl_pass && !smpl_pass[i] ) continue; +++ +++ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. 
+++ int als[2] = {0,0}; +++ int ret = parse_genotype(args->gt_arr, args->ngt1, i, als); +++ if ( ret==-1 ) continue; // missing genotype +++ if ( !(bcf_get_variant_type(rec,als[0]) & VCF_INDEL) && !(bcf_get_variant_type(rec,als[1]) & VCF_INDEL) ) continue; // not an indel +++ +++ update_indel_stats(args, rec, stats, i, als); +++ +++ stats->npass_gt++; +++ } +++ } +++ +++ if ( bcf_get_info_string(args->hdr,rec,args->csq_tag,&args->csq_str,&args->mcsq_str) > 0 ) +++ { +++ if ( strstr(args->csq_str,"inframe") ) stats->ninframe++; +++ if ( strstr(args->csq_str,"frameshift") ) stats->nframeshift++; +++ } +++ +++ for (i=1; in_allele; i++) +++ { +++ if ( !(bcf_get_variant_type(rec,i) & VCF_INDEL) ) continue; +++ if ( rec->d.var[i].n < 0 ) stats->ndel++; +++ else if ( rec->d.var[i].n > 0 ) stats->nins++; +++ if ( args->ngt <= 0 ) +++ { +++ int bin = len2bin(rec->d.var[i].n); +++ if ( bin >= 0 ) stats->nlen[bin]++; +++ } +++ } +++ stats->npass++; +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ args->csq_tag = "CSQ"; +++ static struct option loptions[] = +++ { +++ {"max-len",required_argument,0,1}, +++ {"nvaf",required_argument,0,2}, +++ {"alt2ref-DNM",no_argument,0,3}, +++ {"ped",required_argument,0,'p'}, +++ {"csq-tag",required_argument,0,'c'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ char *tmp; +++ int c, i; +++ while ((c = getopt_long(argc, argv, "o:s:i:e:r:R:t:T:c:p:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 1 : +++ MAX_LEN = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); +++ if ( MAX_LEN<=0 ) error("Expected value bigger than 0 --max-len\n"); +++ break; +++ case 2 
: +++ NVAF = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); +++ if ( NVAF<0 || NVAF>1 ) error("Expected value from the interval [0,1] with --nvaf\n"); +++ break; +++ case 3 : args->allow_alt2ref_DNMs = 1; break; +++ case 'p': args->ped_fname = optarg; break; +++ case 'c': args->csq_tag = optarg; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s",usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s",usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ { +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ if ( !(bcf_get_variant_types(rec) & VCF_INDEL) ) continue; +++ for (i=0; infilters; i++) +++ process_record(args, rec, &args->filters[i]); +++ } +++ +++ report_stats(args); +++ destroy_data(args); +++ +++ return 0; +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/indel-stats.c.pysam.c ++@@ -0,0 +1,755 @@ +++#include "bcftools.pysam.h" +++ +++/* The MIT License +++ +++ Copyright (c) 2018 Genome Research Ltd. 
+++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? 
+++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++static int NVAF = 20; +++static int MAX_LEN = 20; +++ +++static inline int len2bin(int len) +++{ +++ if ( len < -MAX_LEN ) return 0; +++ if ( len > MAX_LEN ) return 2*MAX_LEN; +++ return MAX_LEN + len; +++} +++HTS_UNUSED static inline int bin2len(int bin) +++{ +++ return bin - MAX_LEN; +++} +++static inline int vaf2bin(float vaf) +++{ +++ return vaf*(NVAF-1); +++} +++HTS_UNUSED static inline float bin2vaf(int bin) +++{ +++ return (float)bin/(NVAF-1); +++} +++ +++typedef struct +++{ +++ uint32_t +++ *nvaf, // number of indels genotypes with low VAF (<=0.2) and high VAF (>0.2); use vaf2bin and bin2vaf +++ *nlen, // length distribution (-MAX_LEN,MAX_LEN); use len2bin and bin2len; site-wise unless samples are present +++ npass_gt, // number of indel genotypes passing the filter +++ npass, // number of sites passing the filter +++ nsites, // number of sites total +++ nins, ndel, // number of insertions and deletions, site-wise, not genotype-wise +++ nframeshift, ninframe, // site-wise +++ *nfrac; // number of het indels contributing to dfrac +++ double +++ *dfrac; // minor allele fraction at HET indel genotypes, determined from FORMAT/AD +++} +++stats_t; +++ +++typedef struct +++{ +++ stats_t stats; +++ filter_t *filter; +++ char *expr; +++} +++flt_stats_t; +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ int idx[3]; // VCF sample index for father, mother and child +++ int pass; // do all three pass the filters? 
+++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, regions_is_file, targets_is_file; +++ int nflt_str; +++ char *filter_str, **flt_str; +++ char **argv, *output_fname, *fname, *regions, *targets, *csq_tag, *ped_fname; +++ trio_t *trio; +++ int ntrio, mtrio; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr; +++ flt_stats_t *filters; +++ int nfilters, nsmpl; +++ char *csq_str; +++ int32_t *gt_arr, *ad_arr, *ac; +++ int mgt_arr, mad_arr, mac, mcsq_str; +++ int ngt, ngt1, nad, nad1; +++ int allow_alt2ref_DNMs; // is "0/0 0/1 1/1" (child,father,mother) a valid DNM? +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Calculate indel stats scanning over a range of thresholds simultaneously.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Calculates indel stats. Use curly brackets to scan a range of values simultaneously\n" +++ "Usage: bcftools +indel-stats [Plugin Options]\n" +++ "Plugin options:\n" +++ " --alt2ref-DNM consider GT errors such as 0/1 + 1/1 -> 0/0 a valid DNM\n" +++ " -c, --csq-tag STR VEP or BCSQ tag to determine inframe and frameshift variants [CSQ]\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " --max-len INT maximum indel length to consider [20]\n" +++ " --nvaf INT number of variant allele frequency bins [20]\n" +++ " -o, --output FILE output file name [bcftools_stdout]\n" +++ " -p, --ped FILE limit the stats to de novo indels\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n" +++ "Example:\n" +++ " bcftools +indel-stats -i 'GQ>{10,20,30,40,50}' file.bcf\n" +++ "\n"; +++} +++ 
+++static void parse_filters(args_t *args) +++{ +++ if ( !args->filter_str ) return; +++ int mflt = 1; +++ args->nflt_str = 1; +++ args->flt_str = (char**) malloc(sizeof(char*)); +++ args->flt_str[0] = strdup(args->filter_str); +++ while (1) +++ { +++ int i, expanded = 0; +++ for (i=args->nflt_str-1; i>=0; i--) +++ { +++ char *exp_beg = strchr(args->flt_str[i], '{'); +++ if ( !exp_beg ) continue; +++ char *exp_end = strchr(exp_beg+1, '}'); +++ if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str); +++ char *beg = exp_beg+1, *mid = beg; +++ while ( midflt_str[i], exp_beg - args->flt_str[i], &tmp); +++ kputsn(beg, mid - beg, &tmp); +++ kputs(exp_end+1, &tmp); +++ args->nflt_str++; +++ hts_expand(char*, args->nflt_str, mflt, args->flt_str); +++ args->flt_str[args->nflt_str-1] = tmp.s; +++ beg = ++mid; +++ } +++ expanded = 1; +++ free(args->flt_str[i]); +++ memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str)); +++ args->nflt_str--; +++ args->flt_str[args->nflt_str] = NULL; +++ } +++ if ( !expanded ) break; +++ } +++ +++ fprintf(bcftools_stderr,"Collecting data for %d filtering expressions\n", args->nflt_str); +++} +++ +++static int cmp_trios(const void *_a, const void *_b) +++{ +++ trio_t *a = (trio_t *) _a; +++ trio_t *b = (trio_t *) _b; +++ int i; +++ int amin = a->idx[0]; +++ for (i=1; i<3; i++) +++ if ( amin > a->idx[i] ) amin = a->idx[i]; +++ int bmin = b->idx[0]; +++ for (i=1; i<3; i++) +++ if ( bmin > b->idx[i] ) bmin = b->idx[i]; +++ if ( amin < bmin ) return -1; +++ if ( amin > bmin ) return 1; +++ return 0; +++} +++static void parse_ped(args_t *args, char *fname) +++{ +++ htsFile *fp = hts_open(fname, "r"); +++ if ( !fp ) error("Could not read: %s\n", fname); +++ +++ kstring_t str = {0,0,0}; +++ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); +++ +++ int moff = 0, *off = NULL; +++ do +++ { +++ // familyID sampleID paternalID maternalID sex phenotype population 
relationship siblings secondOrder thirdOrder children comment +++ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 +++ int ncols = ksplit_core(str.s,0,&moff,&off); +++ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); +++ +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); +++ if ( father<0 ) continue; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); +++ if ( mother<0 ) continue; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); +++ if ( child<0 ) continue; +++ +++ args->ntrio++; +++ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); +++ trio_t *trio = &args->trio[args->ntrio-1]; +++ trio->idx[iFATHER] = father; +++ trio->idx[iMOTHER] = mother; +++ trio->idx[iCHILD] = child; +++ } +++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); +++ +++ fprintf(bcftools_stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); +++ if ( !args->ntrio ) error("No complete trio identified\n"); +++ +++ // sort the sample by index so that they are accessed more or less sequentially +++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); +++ +++ free(str.s); +++ free(off); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ if ( args->ped_fname ) +++ parse_ped(args, args->ped_fname); +++ +++ parse_filters(args); +++ +++ int i; +++ if ( !args->nflt_str ) +++ { +++ args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t)); +++ args->nfilters = 1; +++ args->filters[0].expr = strdup("all"); +++ args->filters[0].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); +++ args->filters[0].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[0].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[0].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); +++ } +++ else +++ { +++ args->nfilters = args->nflt_str; +++ args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t)); +++ for (i=0; infilters; i++) +++ { +++ args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]); +++ args->filters[i].expr = strdup(args->flt_str[i]); +++ args->filters[i].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); +++ args->filters[i].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[i].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); +++ args->filters[i].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); +++ +++ // replace tab's with spaces so that the output stays parsable +++ char *tmp = args->filters[i].expr; +++ while ( *tmp ) 
+++ { +++ if ( *tmp=='\t' ) *tmp = ' '; +++ tmp++; +++ } +++ } +++ } +++ args->nsmpl = bcf_hdr_nsamples(args->hdr); +++} +++static void destroy_data(args_t *args) +++{ +++ int i; +++ for (i=0; infilters; i++) +++ { +++ if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter); +++ free(args->filters[i].stats.nvaf); +++ free(args->filters[i].stats.nlen); +++ free(args->filters[i].stats.nfrac); +++ free(args->filters[i].stats.dfrac); +++ free(args->filters[i].expr); +++ } +++ free(args->filters); +++ for (i=0; inflt_str; i++) free(args->flt_str[i]); +++ free(args->flt_str); +++ bcf_sr_destroy(args->sr); +++ free(args->ac); +++ free(args->trio); +++ free(args->csq_str); +++ free(args->gt_arr); +++ free(args->ad_arr); +++ free(args); +++} +++static void report_stats(args_t *args) +++{ +++ int i = 0,j; +++ FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? bcftools_stdout : fopen(args->output_fname,"w"); +++ if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); +++ fprintf(fh,"# CMD line shows the command line used to generate this output\n"); +++ fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); +++ fprintf(fh,"# SN* summary number for every threshold:\n"); +++ fprintf(fh,"# %d) SN*, filter id\n", ++i); +++ fprintf(fh,"# %d) number of samples (or trios with -p)\n", ++i); +++ fprintf(fh,"# %d) number of indel sites total\n", ++i); +++ fprintf(fh,"# %d) number of indel sites that pass the filter (and, with -p, have a de novo indel)\n", ++i); +++ fprintf(fh,"# %d) number of indel genotypes that pass the filter (and, with -p, are de novo)\n", ++i); +++ fprintf(fh,"# %d) number of insertions (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of deletions (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of frameshifts (site-wise, not genotype-wise)\n", ++i); +++ fprintf(fh,"# %d) number of inframe indels (site-wise, not genotype-wise)\n", ++i); +++ 
fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DVAF* lines report indel variant allele frequency (VAF) distribution for every threshold,\n"); +++ fprintf(fh,"# k-th bin corresponds to the frequency k/(nVAF-1):\n"); +++ fprintf(fh,"# %d) DVAF*, filter id\n", ++i); +++ fprintf(fh,"# %d) nVAF, number of bins which split the [0,1] VAF interval.\n", ++i); +++ fprintf(fh,"# %d-%d) counts of indel genotypes in the VAF bin. For non-reference hets, the VAF of the less supported allele is recorded\n", i+1, i+NVAF); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DLEN* lines report indel length distribution for every threshold. When genotype fields are available,\n"); +++ fprintf(fh,"# the counts correspond to the number of genotypes, otherwise the number of sites are given.\n"); +++ fprintf(fh,"# The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions.\n"); +++ fprintf(fh,"# The firt/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); +++ fprintf(fh,"# %d) DLEN*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# DFRAC* lines report the mean minor allele fraction at HET indel genotypes as a function of indel size.\n"); +++ fprintf(fh,"# The format is the same as for DLEN:\n"); +++ fprintf(fh,"# %d) DFRAC*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) mean fraction at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ i = 0; +++ fprintf(fh,"# NFRAC* lines report the number of indels informing the DFRAC distribution.\n"); +++ fprintf(fh,"# %d) NFRAC*, filter id\n", ++i); +++ fprintf(fh,"# %d) maximum indel length\n", ++i); +++ fprintf(fh,"# %d-%d) counts at 
indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); +++ fprintf(fh,"#\n"); +++ fprintf(fh, "CMD\t%s", args->argv[0]); +++ for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); +++ fprintf(fh, "\n"); +++ for (i=0; infilters; i++) +++ { +++ flt_stats_t *flt = &args->filters[i]; +++ fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); +++ } +++ for (i=0; infilters; i++) +++ { +++ stats_t *stats = &args->filters[i].stats; +++ +++ fprintf(fh,"SN%d", i); +++ fprintf(fh,"\t%u", args->ntrio ? args->ntrio : args->nsmpl); +++ fprintf(fh,"\t%u", stats->nsites); +++ fprintf(fh,"\t%u", stats->npass); +++ fprintf(fh,"\t%u", stats->npass_gt); +++ fprintf(fh,"\t%u", stats->nins); +++ fprintf(fh,"\t%u", stats->ndel); +++ fprintf(fh,"\t%u", stats->nframeshift); +++ fprintf(fh,"\t%u", stats->ninframe); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DVAF%d", i); +++ fprintf(fh,"\t%d", NVAF); +++ for (j=0; jnvaf[j]); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DLEN%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnlen[j]); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"DFRAC%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnfrac[j] ) fprintf(fh,"\t%.2f",stats->dfrac[j]/stats->nfrac[j]); +++ else fprintf(fh,"\t."); +++ fprintf(fh,"\n"); +++ +++ fprintf(fh,"NFRAC%d", i); +++ fprintf(fh,"\t%d", MAX_LEN); +++ for (j=0; jnfrac[j]); +++ fprintf(fh,"\n"); +++ } +++ if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"bcftools_stdout" : args->output_fname); +++} +++ +++static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) +++{ +++ int32_t *ptr = arr + ngt1 * idx; +++ if ( bcf_gt_is_missing(ptr[0]) ) return -1; +++ als[0] = bcf_gt_allele(ptr[0]); +++ +++ if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { ptr[1] = ptr[0]; return -2; } +++ +++ if ( bcf_gt_is_missing(ptr[1]) ) return -1; +++ als[1] = bcf_gt_allele(ptr[1]); +++ +++ return 0; +++} +++ +++static inline void update_indel_stats(args_t *args, bcf1_t *rec, stats_t *stats, int ismpl, int *als) +++{ +++ int j; +++ if ( als[0] >= args->nad1 || als[1] >= args->nad1 ) error("Incorrect GT allele at %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,als[0],als[1]); +++ int32_t *ad_ptr = args->ad_arr + ismpl*args->nad1; +++ +++ // find the allele with most support +++ uint32_t ntot = 0; +++ for (j=0; jnad1; j++) +++ { +++ if ( ad_ptr[j]==bcf_int32_missing ) continue; +++ if ( ad_ptr[j]==bcf_int32_vector_end ) break; +++ ntot += ad_ptr[j]; +++ } +++ if ( !ntot ) return; +++ +++ // Find the alternate allele fraction, total and relative. Set al0 to be the more frequent indel allele. +++ // The genotypes have been already sanitized in parse_genotype(). +++ int al0 = als[0], al1 = als[1]; +++ if ( !(bcf_get_variant_type(rec,al0) & VCF_INDEL) ) +++ { +++ if ( !(bcf_get_variant_type(rec,al1) & VCF_INDEL) ) error("FIXME: this should not happen .. %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,al0,al1); +++ al0 = als[1]; al1 = als[0]; +++ } +++ else if ( (bcf_get_variant_type(rec,al1) & VCF_INDEL) && al0!=al1 ) +++ { +++ // Select the more frequent indel allele. 
+++ if ( ad_ptr[al0] < ad_ptr[al1] ) al0 = als[1], al1 = als[0]; +++ +++ // Record length of both indel alleles +++ int bin = len2bin(rec->d.var[al1].n); +++ if ( bin >= 0 ) stats->nlen[bin]++; +++ } +++ +++ float vaf = (float)ad_ptr[al0] / ntot; +++ int bin = vaf2bin(vaf); +++ stats->nvaf[bin]++; +++ +++ // al0 is now the major indel allele +++ int len_bin = len2bin(rec->d.var[al0].n); +++ if ( len_bin < 0 ) return; +++ stats->nlen[len_bin]++; +++ +++ if ( al0!=al1 ) +++ { +++ ntot = ad_ptr[al0] + ad_ptr[al1]; +++ if ( ntot ) +++ { +++ stats->nfrac[len_bin]++; +++ stats->dfrac[len_bin]+= (double)ad_ptr[al0] / ntot; +++ } +++ } +++} +++ +++static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt) +++{ +++ int i,j; +++ uint8_t *smpl_pass = NULL; +++ +++ stats_t *stats = &flt->stats; +++ stats->nsites++; +++ +++ // Find out which samples/trios pass and if the site passes +++ if ( flt->filter ) +++ { +++ int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass); +++ if ( args->ntrio ) +++ { +++ if ( args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; intrio; i++) +++ { +++ int pass_trio = 1; +++ for (j=0; j<3; j++) +++ { +++ int idx = args->trio[i].idx[j]; +++ if ( smpl_pass[idx] ) { pass_trio = 0; break; } +++ } +++ args->trio[i].pass = pass_trio; +++ if ( pass_trio ) pass_site = 1; +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; intrio; i++) args->trio[i].pass = 1; +++ } +++ else if ( !pass_site ) return; +++ else if ( smpl_pass ) +++ { +++ pass_site = 0; +++ for (i=0; intrio; i++) +++ { +++ int pass_trio = 1; +++ for (j=0; j<3; j++) +++ { +++ int idx = args->trio[i].idx[j]; +++ if ( !smpl_pass[idx] ) { pass_trio = 0; break; } +++ } +++ args->trio[i].pass = pass_trio; +++ if ( pass_trio ) pass_site = 1; +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; intrio; i++) args->trio[i].pass = 1; +++ } +++ else +++ { +++ if ( 
args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; insmpl; i++) +++ { +++ if ( smpl_pass[i] ) smpl_pass[i] = 0; +++ else { smpl_pass[i] = 1; pass_site = 1; } +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; insmpl; i++) smpl_pass[i] = 1; +++ } +++ else if ( !pass_site ) return; +++ } +++ } +++ +++ args->ngt = 0; +++ if ( args->nsmpl ) +++ { +++ // Get the genotypes +++ args->ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); +++ args->ngt1 = args->ngt / rec->n_sample; +++ +++ if ( args->ngt>0 ) +++ { +++ // Get the AD counts +++ args->nad = bcf_get_format_int32(args->hdr, rec, "AD", &args->ad_arr, &args->mad_arr); +++ args->nad1 = args->nad / rec->n_sample; +++ if ( args->nad>0 && args->nad1 != rec->n_allele ) error("Incorrect number of FORMAT/AD values at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ } +++ } +++ +++ // Is there a star allele? Don't count overlapping deletions twice +++ int star_allele = -1; +++ for (i=1; in_allele; i++) +++ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } +++ +++ +++ if ( args->ngt>0 && args->ntrio ) +++ { +++ int is_dnm = 0; +++ for (i=0; intrio; i++) +++ { +++ if ( flt->filter && !args->trio[i].pass ) continue; +++ +++ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. +++ // the order is: child, father, mother +++ int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue; +++ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue; +++ +++ // Is it a DNM? 
+++ if ( !args->allow_alt2ref_DNMs && als_child[0]==0 && als_child[1]==0 ) continue; +++ if ( (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ) continue; +++ if ( (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ) continue; +++ if ( als_child[0]==star_allele || als_child[1]==star_allele ) continue; // don't count the same event multiple times +++ if ( als_father[0]==star_allele || als_father[1]==star_allele ) continue; +++ if ( als_mother[0]==star_allele || als_mother[1]==star_allele ) continue; +++ +++ int child_is_indel = (bcf_get_variant_type(rec,als_child[0]) & VCF_INDEL) || (bcf_get_variant_type(rec,als_child[1]) & VCF_INDEL) ? 1 : 0; +++ +++ if ( !args->allow_alt2ref_DNMs ) +++ { +++ if ( !child_is_indel ) continue; +++ } +++ else +++ { +++ if ( !child_is_indel && +++ !(bcf_get_variant_type(rec,als_father[0]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_father[1]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_mother[0]) & VCF_INDEL) && +++ !(bcf_get_variant_type(rec,als_mother[1]) & VCF_INDEL) ) continue; // not an indel, in any sample +++ } +++ +++ if ( child_is_indel ) +++ update_indel_stats(args, rec, stats, args->trio[i].idx[iCHILD], als_child); +++ +++ //printf("MERR\t%s\t%d\t%s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[iCHILD]]); +++ +++ stats->npass_gt++; +++ +++ is_dnm = 1; +++ } +++ if ( !is_dnm ) return; +++ } +++ else if ( args->ngt>0 ) +++ { +++ for (i=0; insmpl; i++) +++ { +++ if ( smpl_pass && !smpl_pass[i] ) continue; +++ +++ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. 
+++ int als[2] = {0,0}; +++ int ret = parse_genotype(args->gt_arr, args->ngt1, i, als); +++ if ( ret==-1 ) continue; // missing genotype +++ if ( !(bcf_get_variant_type(rec,als[0]) & VCF_INDEL) && !(bcf_get_variant_type(rec,als[1]) & VCF_INDEL) ) continue; // not an indel +++ +++ update_indel_stats(args, rec, stats, i, als); +++ +++ stats->npass_gt++; +++ } +++ } +++ +++ if ( bcf_get_info_string(args->hdr,rec,args->csq_tag,&args->csq_str,&args->mcsq_str) > 0 ) +++ { +++ if ( strstr(args->csq_str,"inframe") ) stats->ninframe++; +++ if ( strstr(args->csq_str,"frameshift") ) stats->nframeshift++; +++ } +++ +++ for (i=1; in_allele; i++) +++ { +++ if ( !(bcf_get_variant_type(rec,i) & VCF_INDEL) ) continue; +++ if ( rec->d.var[i].n < 0 ) stats->ndel++; +++ else if ( rec->d.var[i].n > 0 ) stats->nins++; +++ if ( args->ngt <= 0 ) +++ { +++ int bin = len2bin(rec->d.var[i].n); +++ if ( bin >= 0 ) stats->nlen[bin]++; +++ } +++ } +++ stats->npass++; +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ args->csq_tag = "CSQ"; +++ static struct option loptions[] = +++ { +++ {"max-len",required_argument,0,1}, +++ {"nvaf",required_argument,0,2}, +++ {"alt2ref-DNM",no_argument,0,3}, +++ {"ped",required_argument,0,'p'}, +++ {"csq-tag",required_argument,0,'c'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ char *tmp; +++ int c, i; +++ while ((c = getopt_long(argc, argv, "o:s:i:e:r:R:t:T:c:p:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 1 : +++ MAX_LEN = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); +++ if ( MAX_LEN<=0 ) error("Expected value bigger than 0 --max-len\n"); +++ break; +++ case 2 
: +++ NVAF = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); +++ if ( NVAF<0 || NVAF>1 ) error("Expected value from the interval [0,1] with --nvaf\n"); +++ break; +++ case 3 : args->allow_alt2ref_DNMs = 1; break; +++ case 'p': args->ped_fname = optarg; break; +++ case 'c': args->csq_tag = optarg; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s",usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s",usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ { +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ if ( !(bcf_get_variant_types(rec) & VCF_INDEL) ) continue; +++ for (i=0; infilters; i++) +++ process_record(args, rec, &args->filters[i]); +++ } +++ +++ report_stats(args); +++ destroy_data(args); +++ +++ return 0; +++} ++--- python-pysam.orig/bcftools/plugins/isecGT.c +++++ python-pysam/bcftools/plugins/isecGT.c ++@@ -131,14 +131,14 @@ ++ smpl_ilist_t *smpl = smpl_ilist_map(args->hdr_a, args->hdr_b, SMPL_STRICT); ++ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- bcf_hdr_write(args->out_fh, args->hdr_a); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write 
to %s\n", __func__,args->output_fname); ++ ++ while ( bcf_sr_next_line(args->sr) ) ++ { ++ if ( !bcf_sr_has_line(args->sr,0) ) continue; ++ if ( !bcf_sr_has_line(args->sr,1) ) ++ { ++- bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0)); +++ if ( bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0))!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ continue; ++ } ++ ++@@ -163,7 +163,7 @@ ++ } ++ } ++ if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); ++- bcf_write(args->out_fh, args->hdr_a, line_a); +++ if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ ++ if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); ++--- python-pysam.orig/bcftools/plugins/isecGT.c.pysam.c +++++ python-pysam/bcftools/plugins/isecGT.c.pysam.c ++@@ -133,14 +133,14 @@ ++ smpl_ilist_t *smpl = smpl_ilist_map(args->hdr_a, args->hdr_b, SMPL_STRICT); ++ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- bcf_hdr_write(args->out_fh, args->hdr_a); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++ while ( bcf_sr_next_line(args->sr) ) ++ { ++ if ( !bcf_sr_has_line(args->sr,0) ) continue; ++ if ( !bcf_sr_has_line(args->sr,1) ) ++ { ++- bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0)); +++ if ( bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0))!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ continue; ++ } ++ ++@@ -165,7 +165,7 @@ ++ } ++ } ++ if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); ++- bcf_write(args->out_fh, args->hdr_a, line_a); +++ if ( bcf_write(args->out_fh, args->hdr_a, 
line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ ++ if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); ++--- python-pysam.orig/bcftools/plugins/mendelian.c +++++ python-pysam/bcftools/plugins/mendelian.c ++@@ -1,6 +1,6 @@ ++ /* The MIT License ++ ++- Copyright (c) 2015 Genome Research Ltd. +++ Copyright (c) 2015-2018 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -27,16 +27,18 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include // for isatty ++-#include "bcftools.h" ++-#include "regidx.h" +++#include "../bcftools.h" +++#include "../regidx.h" ++ ++ #define MODE_COUNT 1 ++ #define MODE_LIST_GOOD 2 ++@@ -148,7 +150,7 @@ ++ " -r, --rules [?] predefined rules, 'list' to print available settings, append '?' for details\n" ++ " -R, --rules-file inheritance rules, see example below\n" ++ " -t, --trio names of mother, father and the child\n" ++- " -T, --trio-file list of trios, one per line\n" +++ " -T, --trio-file list of trios, one per line (mother,father,child)\n" ++ "\n" ++ "Example:\n" ++ " # Default inheritance patterns, override with -r\n" ++@@ -363,13 +365,22 @@ ++ if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); ++ if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; ++ +++ FILE *log_fh = stderr; +++ if ( args.mode==MODE_COUNT ) +++ { +++ log_fh = strcmp("-",args.output_fname) ? 
fopen(args.output_fname,"w") : stdout; +++ if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); +++ } +++ ++ args.sr = bcf_sr_init(); ++- if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); +++ if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); ++ args.hdr = bcf_sr_get_header(args.sr, 0); ++- args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); ++- if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); ++- bcf_hdr_write(args.out_fh, args.hdr); ++- +++ if ( args.mode!=MODE_COUNT ) +++ { +++ args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); +++ if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); +++ } ++ ++ int i, n = 0; ++ char **list; ++@@ -420,29 +431,30 @@ ++ if ( line ) ++ { ++ if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); ++- bcf_write1(args.out_fh, args.hdr, line); +++ if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); ++ } ++ } +++ if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); ++ ++- ++- fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); +++ fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); ++ for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ++ ); ++ } +++ if ( log_fh!=stderr && log_fh!=stdout && fclose(log_fh) ) error("Error: close 
failed for %s\n", args.output_fname); +++ ++ free(args.gt_arr); ++ free(args.trios); ++ regitr_destroy(args.itr); ++ regitr_destroy(args.itr_ori); ++ regidx_destroy(args.rules); ++ bcf_sr_destroy(args.sr); ++- if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); ++ return 0; ++ } ++ ++@@ -450,7 +462,7 @@ ++ { ++ static int warned = 0; ++ if ( warned ) return; ++- fprintf(stderr,"Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ fprintf(stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ ++@@ -555,7 +567,7 @@ ++ } ++ ++ if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) ++- error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ ++ if ( args.mode&MODE_DELETE ) return rec; ++ if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec; ++--- python-pysam.orig/bcftools/plugins/mendelian.c.pysam.c +++++ python-pysam/bcftools/plugins/mendelian.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* The MIT License ++ ++- Copyright (c) 2015 Genome Research Ltd. +++ Copyright (c) 2015-2018 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -29,16 +29,18 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include // for isatty ++-#include "bcftools.h" ++-#include "regidx.h" +++#include "../bcftools.h" +++#include "../regidx.h" ++ ++ #define MODE_COUNT 1 ++ #define MODE_LIST_GOOD 2 ++@@ -150,7 +152,7 @@ ++ " -r, --rules [?] predefined rules, 'list' to print available settings, append '?' 
for details\n" ++ " -R, --rules-file inheritance rules, see example below\n" ++ " -t, --trio names of mother, father and the child\n" ++- " -T, --trio-file list of trios, one per line\n" +++ " -T, --trio-file list of trios, one per line (mother,father,child)\n" ++ "\n" ++ "Example:\n" ++ " # Default inheritance patterns, override with -r\n" ++@@ -365,13 +367,22 @@ ++ if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); ++ if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; ++ +++ FILE *log_fh = bcftools_stderr; +++ if ( args.mode==MODE_COUNT ) +++ { +++ log_fh = strcmp("-",args.output_fname) ? fopen(args.output_fname,"w") : bcftools_stdout; +++ if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); +++ } +++ ++ args.sr = bcf_sr_init(); ++- if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); +++ if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); ++ args.hdr = bcf_sr_get_header(args.sr, 0); ++- args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); ++- if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); ++- bcf_hdr_write(args.out_fh, args.hdr); ++- +++ if ( args.mode!=MODE_COUNT ) +++ { +++ args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); +++ if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); +++ } ++ ++ int i, n = 0; ++ char **list; ++@@ -422,29 +433,30 @@ ++ if ( line ) ++ { ++ if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); ++- bcf_write1(args.out_fh, args.hdr, line); +++ if ( args.out_fh && 
bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); ++ } ++ } +++ if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); ++ ++- ++- fprintf(bcftools_stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); +++ fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); ++ for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), ++ bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ++ ); ++ } +++ if ( log_fh!=bcftools_stderr && log_fh!=bcftools_stdout && fclose(log_fh) ) error("Error: close failed for %s\n", args.output_fname); +++ ++ free(args.gt_arr); ++ free(args.trios); ++ regitr_destroy(args.itr); ++ regitr_destroy(args.itr_ori); ++ regidx_destroy(args.rules); ++ bcf_sr_destroy(args.sr); ++- if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); ++ return 0; ++ } ++ ++@@ -452,7 +464,7 @@ ++ { ++ static int warned = 0; ++ if ( warned ) return; ++- fprintf(bcftools_stderr,"Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ fprintf(bcftools_stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ warned = 1; ++ } ++ ++@@ -557,7 +569,7 @@ ++ } ++ ++ if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) ++- error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); +++ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); ++ ++ if ( args.mode&MODE_DELETE ) return rec; ++ if ( args.mode&MODE_LIST_GOOD ) return has_bad ? 
NULL : rec; ++--- python-pysam.orig/bcftools/plugins/missing2ref.c +++++ python-pysam/bcftools/plugins/missing2ref.c ++@@ -109,7 +109,7 @@ ++ } ++ } ++ else{ ++- fprintf(stderr,"Warning: Could not calculate allele count at position %d\n", rec->pos); +++ fprintf(stderr,"Warning: Could not calculate allele count at position %"PRId64"\n", (int64_t) rec->pos); ++ exit(1); ++ } ++ ++--- python-pysam.orig/bcftools/plugins/missing2ref.c.pysam.c +++++ python-pysam/bcftools/plugins/missing2ref.c.pysam.c ++@@ -111,7 +111,7 @@ ++ } ++ } ++ else{ ++- fprintf(bcftools_stderr,"Warning: Could not calculate allele count at position %d\n", rec->pos); +++ fprintf(bcftools_stderr,"Warning: Could not calculate allele count at position %"PRId64"\n", (int64_t) rec->pos); ++ exit(1); ++ } ++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/parental-origin.c ++@@ -0,0 +1,410 @@ +++/* The MIT License +++ +++ Copyright (c) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++// Logic of the filters: include or exclude sites which match the filters? +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define CNV_DEL 0 +++#define CNV_DUP 1 +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ int idx[3]; // VCF sample index for child, father, mother +++ int pass; // do all three pass the filters? +++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, cnv_type, debug, greedy; +++ filter_t *filter; +++ char *filter_str; +++ char **argv, *pfm, *fname, *region; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr; +++ trio_t trio; +++ int32_t *pl, *ad, *gt; // input FMT/PL, AD, and GT values +++ int mpl, mad, mgt; +++ double ppat,pmat; // method 1: probability of paternal/maternal origin +++ int ntest; // number of informative sites +++ int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison +++ double min_pbinom; // minimum binomial probability of paternal hets +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Determine parental origin of a CNV region in a trio.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Determine parental origin of a CNV region\n" +++ "Usage: bcftools +parental-origin [Plugin Options]\n" +++ "Plugin options:\n" +++ " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n" +++ " -d, --debug list 
informative sites\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -g, --greedy use also ambigous sites, e.g. het+hom parents for deletions\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -p, --pfm P,F,M sample names of proband, father, and mother\n" +++ " -r, --region REGION chr:beg-end\n" +++ " -t, --type the CNV type\n" +++ "\n" +++ "Example:\n" +++ " bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n" +++ "\n"; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->region ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region); +++ } +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ int id; +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/AD is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/GT is not present in %s\n", args->fname); +++ +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr, args->filter_str); +++ +++ int i, n = 0; +++ char **list; +++ list = hts_readlist(args->pfm, 0, &n); +++ if ( n!=3 ) error("Expected three sample names with -t\n"); +++ args->trio.idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); +++ args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); +++ args->trio.idx[iMOTHER] = 
bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); +++ for (i=0; itrio.idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); +++ free(list[i]); +++ } +++ free(list); +++} +++static void destroy_data(args_t *args) +++{ +++ if ( args->filter ) filter_destroy(args->filter); +++ free(args->pl); +++ free(args->ad); +++ free(args->gt); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static inline double calc_binom_two_sided(int na, int nb, double aprob) +++{ +++ double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); +++ if ( prob > 1 ) prob = 1; +++ return prob; +++} +++static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) +++{ +++ return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return; +++ +++ int i,j; +++ if ( args->filter ) +++ { +++ uint8_t *smpl_pass = NULL; +++ int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass); +++ if ( args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; i<3; i++) +++ { +++ if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0; +++ else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; } +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1; +++ } +++ else if ( !pass_site ) return; +++ +++ if ( smpl_pass ) +++ { +++ for (i=0; i<3; i++) +++ if ( !smpl_pass[args->trio.idx[i]] ) return; +++ } +++ } +++ +++ int nsmpl = bcf_hdr_nsamples(args->hdr); +++ int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); +++ if ( nret<=0 ) +++ { +++ printf("The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ return; +++ } +++ int nad1 = nret/nsmpl; +++ +++ nret = 
bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); +++ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ int npl1 = nret/nsmpl; +++ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) +++ { +++ printf("todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); +++ return; +++ } +++ +++ nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt); +++ if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ int ngt1 = nret/nsmpl; +++ if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ +++ // number of ref and alt alleles in the proband +++ int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4; +++ int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2; +++ double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6; +++ for (i=0; i<3; i++) // trio +++ { +++ int isum = 0; +++ int32_t *src = args->pl + npl1*args->trio.idx[i]; +++ double *gl_dst = gl + 3*i; +++ double sum = 0; +++ for (j=0; j<3; j++) // iterate over PL +++ { +++ if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return; +++ gl_dst[j] = pow(10,-0.1*src[j]); +++ sum += gl_dst[j]; +++ isum += src[j]; +++ } +++ if ( isum==0 ) return; +++ for (j=0; j<3; j++) gl_dst[j] /= sum; +++ +++ int32_t *gt = args->gt + ngt1*args->trio.idx[i]; +++ dsg[i] = 0; +++ for (j=0; jad + nad1*args->trio.idx[i]; +++ ad[2*i] = src[0]; +++ ad[2*i+1] = src[1]; +++ } +++ +++ #define is_RR(x) (x[0]==0) +++ #define is_RA(x) (x[1]==0) +++ #define is_AA(x) (x[2]==0) +++ if ( args->cnv_type==CNV_DEL ) +++ { +++ if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom +++ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents +++ if ( !args->greedy ) +++ { +++ if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents 
have the proband's allele +++ if ( *dsgM==1 && *dsgP==*dsgF ) return; +++ } +++ double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + +++ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); +++ double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + +++ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); +++ +++ // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele; +++ // args->pmat/ppat is the probability of parental origin of the deleted allele +++ args->pmat += log(ppat); +++ args->ppat += log(pmat); +++ args->ntest++; +++ +++ if ( args->debug ) +++ { +++ // output: position, paternal probability, maternal probability, PLs of child, father, mother +++ printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); +++ for (i=0; i<3; i++) +++ { +++ for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]); +++ printf("\t"); +++ } +++ printf("\n"); +++ } +++ } +++ if ( args->cnv_type==CNV_DUP ) +++ { +++ if ( !adP[0] || !adP[1] ) return; // proband is homozygous or has no coverage +++ if ( adP[0] == adP[1] ) return; // proband's alleles are not informative, any or none could have been duplicated +++ if ( *dsgP!=1 ) return; // the proband's genotype is not a het +++ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents +++ +++ if ( args->min_pbinom!=0 ) +++ { +++ // exclude parental hets with skewed ALT allele proportion +++ if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return; +++ if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return; +++ } +++ +++ double prra = glP[1] * 
calc_binom_one_sided(adP[1],adP[0],1/3.,1); +++ double praa = glP[1] * calc_binom_one_sided(adP[1],adP[0],2/3.,0); +++ double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + +++ praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); +++ double pmat = prra*(glM[0]*glF[1] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + +++ praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); +++ args->pmat += log(pmat); +++ args->ppat += log(ppat); +++ args->ntest++; +++ +++ if ( args->debug ) +++ { +++ // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother +++ printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); +++ for (i=0; i<3; i++) +++ { +++ printf("%d %d\t",ad[2*i],ad[2*i+1]); +++ } +++ for (i=0; i<3; i++) +++ { +++ for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]); +++ printf("\t"); +++ } +++ printf("\n"); +++ } +++ } +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->min_pbinom = 1e-2; +++ static struct option loptions[] = +++ { +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"pfm",required_argument,NULL,'p'}, +++ {"region",required_argument,0,'r'}, +++ {"type",required_argument,0,'t'}, +++ {"debug",no_argument,0,'d'}, +++ {"greedy",no_argument,0,'g'}, +++ {"min-binom-prob",required_argument,0,'b'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ char *tmp; +++ while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': +++ if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP; +++ else if ( !strcasecmp("del",optarg) ) args->cnv_type = 
CNV_DEL; +++ break; +++ case 'r': args->region = optarg; break; +++ case 'p': args->pfm = optarg; break; +++ case 'd': args->debug = 1; break; +++ case 'g': args->greedy = 1; break; +++ case 'b': +++ args->min_pbinom = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: -b %s\n", optarg); +++ if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n"); +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->pfm ) error("Missing the -p option\n"); +++ +++ init_data(args); +++ if ( args->debug ) +++ { +++ if ( args->cnv_type==CNV_DEL ) printf("# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n"); +++ else printf("# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n"); +++ } +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ +++ double qual = 4.3429*fabs(args->ppat - args->pmat); +++ char *origin = "uncertain"; +++ if ( args->ppat > args->pmat ) origin = "paternal"; +++ else if ( args->ppat < args->pmat ) origin = "maternal"; +++ +++ int i; +++ printf("# bcftools +%s", args->argv[0]); +++ for (i=1; iargc; i++) printf(" %s",args->argv[i]); +++ printf("\n"); +++ printf("# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n"); +++ printf("%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? 
"dup" : "del", origin, qual, args->ntest); +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/parental-origin.c.pysam.c ++@@ -0,0 +1,412 @@ +++#include "bcftools.pysam.h" +++ +++/* The MIT License +++ +++ Copyright (c) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++// Logic of the filters: include or exclude sites which match the filters? 
+++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define CNV_DEL 0 +++#define CNV_DUP 1 +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ int idx[3]; // VCF sample index for child, father, mother +++ int pass; // do all three pass the filters? +++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, cnv_type, debug, greedy; +++ filter_t *filter; +++ char *filter_str; +++ char **argv, *pfm, *fname, *region; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr; +++ trio_t trio; +++ int32_t *pl, *ad, *gt; // input FMT/PL, AD, and GT values +++ int mpl, mad, mgt; +++ double ppat,pmat; // method 1: probability of paternal/maternal origin +++ int ntest; // number of informative sites +++ int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison +++ double min_pbinom; // minimum binomial probability of paternal hets +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Determine parental origin of a CNV region in a trio.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Determine parental origin of a CNV region\n" +++ "Usage: bcftools +parental-origin [Plugin Options]\n" +++ "Plugin options:\n" +++ " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n" +++ " -d, --debug list informative sites\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " -g, --greedy use also ambigous sites, e.g. 
het+hom parents for deletions\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -p, --pfm P,F,M sample names of proband, father, and mother\n" +++ " -r, --region REGION chr:beg-end\n" +++ " -t, --type the CNV type\n" +++ "\n" +++ "Example:\n" +++ " bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n" +++ "\n"; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->region ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region); +++ } +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ int id; +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/AD is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/GT is not present in %s\n", args->fname); +++ +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr, args->filter_str); +++ +++ int i, n = 0; +++ char **list; +++ list = hts_readlist(args->pfm, 0, &n); +++ if ( n!=3 ) error("Expected three sample names with -t\n"); +++ args->trio.idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); +++ args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); +++ args->trio.idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); +++ for (i=0; itrio.idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); +++ free(list[i]); +++ } +++ 
free(list); +++} +++static void destroy_data(args_t *args) +++{ +++ if ( args->filter ) filter_destroy(args->filter); +++ free(args->pl); +++ free(args->ad); +++ free(args->gt); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static inline double calc_binom_two_sided(int na, int nb, double aprob) +++{ +++ double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); +++ if ( prob > 1 ) prob = 1; +++ return prob; +++} +++static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) +++{ +++ return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return; +++ +++ int i,j; +++ if ( args->filter ) +++ { +++ uint8_t *smpl_pass = NULL; +++ int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass); +++ if ( args->filter_logic & FLT_EXCLUDE ) +++ { +++ if ( pass_site ) +++ { +++ if ( !smpl_pass ) return; +++ pass_site = 0; +++ for (i=0; i<3; i++) +++ { +++ if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0; +++ else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; } +++ } +++ if ( !pass_site ) return; +++ } +++ else +++ for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1; +++ } +++ else if ( !pass_site ) return; +++ +++ if ( smpl_pass ) +++ { +++ for (i=0; i<3; i++) +++ if ( !smpl_pass[args->trio.idx[i]] ) return; +++ } +++ } +++ +++ int nsmpl = bcf_hdr_nsamples(args->hdr); +++ int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); +++ if ( nret<=0 ) +++ { +++ fprintf(bcftools_stdout, "The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ return; +++ } +++ int nad1 = nret/nsmpl; +++ +++ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); +++ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) 
rec->pos+1); +++ int npl1 = nret/nsmpl; +++ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) +++ { +++ fprintf(bcftools_stdout, "todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); +++ return; +++ } +++ +++ nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt); +++ if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ int ngt1 = nret/nsmpl; +++ if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ +++ // number of ref and alt alleles in the proband +++ int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4; +++ int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2; +++ double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6; +++ for (i=0; i<3; i++) // trio +++ { +++ int isum = 0; +++ int32_t *src = args->pl + npl1*args->trio.idx[i]; +++ double *gl_dst = gl + 3*i; +++ double sum = 0; +++ for (j=0; j<3; j++) // iterate over PL +++ { +++ if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return; +++ gl_dst[j] = pow(10,-0.1*src[j]); +++ sum += gl_dst[j]; +++ isum += src[j]; +++ } +++ if ( isum==0 ) return; +++ for (j=0; j<3; j++) gl_dst[j] /= sum; +++ +++ int32_t *gt = args->gt + ngt1*args->trio.idx[i]; +++ dsg[i] = 0; +++ for (j=0; jad + nad1*args->trio.idx[i]; +++ ad[2*i] = src[0]; +++ ad[2*i+1] = src[1]; +++ } +++ +++ #define is_RR(x) (x[0]==0) +++ #define is_RA(x) (x[1]==0) +++ #define is_AA(x) (x[2]==0) +++ if ( args->cnv_type==CNV_DEL ) +++ { +++ if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom +++ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents +++ if ( !args->greedy ) +++ { +++ if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele +++ if ( *dsgM==1 && *dsgP==*dsgF ) return; +++ } +++ double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 
1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + +++ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); +++ double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + +++ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); +++ +++ // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele; +++ // args->pmat/ppat is the probability of parental origin of the deleted allele +++ args->pmat += log(ppat); +++ args->ppat += log(pmat); +++ args->ntest++; +++ +++ if ( args->debug ) +++ { +++ // output: position, paternal probability, maternal probability, PLs of child, father, mother +++ fprintf(bcftools_stdout, "DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); +++ for (i=0; i<3; i++) +++ { +++ for (j=0; j<3; j++) fprintf(bcftools_stdout, " %d",args->pl[npl1*args->trio.idx[i]+j]); +++ fprintf(bcftools_stdout, "\t"); +++ } +++ fprintf(bcftools_stdout, "\n"); +++ } +++ } +++ if ( args->cnv_type==CNV_DUP ) +++ { +++ if ( !adP[0] || !adP[1] ) return; // proband is homozygous or has no coverage +++ if ( adP[0] == adP[1] ) return; // proband's alleles are not informative, any or none could have been duplicated +++ if ( *dsgP!=1 ) return; // the proband's genotype is not a het +++ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents +++ +++ if ( args->min_pbinom!=0 ) +++ { +++ // exclude parental hets with skewed ALT allele proportion +++ if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return; +++ if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return; +++ } +++ +++ double prra = glP[1] * calc_binom_one_sided(adP[1],adP[0],1/3.,1); +++ double praa = glP[1] * 
calc_binom_one_sided(adP[1],adP[0],2/3.,0); +++ double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + +++ praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); +++ double pmat = prra*(glM[0]*glF[1] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + +++ praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); +++ args->pmat += log(pmat); +++ args->ppat += log(ppat); +++ args->ntest++; +++ +++ if ( args->debug ) +++ { +++ // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother +++ fprintf(bcftools_stdout, "DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); +++ for (i=0; i<3; i++) +++ { +++ fprintf(bcftools_stdout, "%d %d\t",ad[2*i],ad[2*i+1]); +++ } +++ for (i=0; i<3; i++) +++ { +++ for (j=0; j<3; j++) fprintf(bcftools_stdout, " %d",args->pl[npl1*args->trio.idx[i]+j]); +++ fprintf(bcftools_stdout, "\t"); +++ } +++ fprintf(bcftools_stdout, "\n"); +++ } +++ } +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->min_pbinom = 1e-2; +++ static struct option loptions[] = +++ { +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"pfm",required_argument,NULL,'p'}, +++ {"region",required_argument,0,'r'}, +++ {"type",required_argument,0,'t'}, +++ {"debug",no_argument,0,'d'}, +++ {"greedy",no_argument,0,'g'}, +++ {"min-binom-prob",required_argument,0,'b'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ char *tmp; +++ while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': +++ if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP; +++ else if ( !strcasecmp("del",optarg) ) 
args->cnv_type = CNV_DEL; +++ break; +++ case 'r': args->region = optarg; break; +++ case 'p': args->pfm = optarg; break; +++ case 'd': args->debug = 1; break; +++ case 'g': args->greedy = 1; break; +++ case 'b': +++ args->min_pbinom = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: -b %s\n", optarg); +++ if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n"); +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->pfm ) error("Missing the -p option\n"); +++ +++ init_data(args); +++ if ( args->debug ) +++ { +++ if ( args->cnv_type==CNV_DEL ) fprintf(bcftools_stdout, "# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n"); +++ else fprintf(bcftools_stdout, "# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n"); +++ } +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ +++ double qual = 4.3429*fabs(args->ppat - args->pmat); +++ char *origin = "uncertain"; +++ if ( args->ppat > args->pmat ) origin = "paternal"; +++ else if ( args->ppat < args->pmat ) origin = "maternal"; +++ +++ int i; +++ fprintf(bcftools_stdout, "# bcftools +%s", args->argv[0]); +++ for (i=1; iargc; i++) fprintf(bcftools_stdout, " %s",args->argv[i]); +++ fprintf(bcftools_stdout, "\n"); +++ fprintf(bcftools_stdout, "# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n"); +++ fprintf(bcftools_stdout, "%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? 
"dup" : "del", origin, qual, args->ntest); +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- python-pysam.orig/bcftools/plugins/prune.c +++++ python-pysam/bcftools/plugins/prune.c ++@@ -129,7 +129,7 @@ ++ bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_pos,args->max_ld); ++ bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_r2,args->max_ld); ++ } ++- bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->filter_r2 ) ++ args->filter_r2_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->filter_r2); ++ ++@@ -147,7 +147,7 @@ ++ { ++ if ( args->filter ) ++ filter_destroy(args->filter); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ vcfbuf_destroy(args->vcfbuf); ++ bcf_sr_destroy(args->sr); ++ free(args->info_pos); ++@@ -158,7 +158,7 @@ ++ { ++ bcf1_t *rec; ++ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) ++- bcf_write1(args->out_fh, args->hdr, rec); +++ if ( bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ static void process(args_t *args) ++ { ++@@ -251,9 +251,9 @@ ++ else if ( !strcasecmp("kb",tmp) ) args->ld_win *= -1000; ++ else error("Could not parse: --window %s\n", optarg); ++ break; ++- case 'T': args->target_is_file = 1; +++ case 'T': args->target_is_file = 1; // fall-through ++ case 't': args->target = optarg; break; ++- case 'R': args->region_is_file = 1; +++ case 'R': args->region_is_file = 1; // fall-through ++ case 'r': args->region = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'O': ++--- python-pysam.orig/bcftools/plugins/prune.c.pysam.c +++++ python-pysam/bcftools/plugins/prune.c.pysam.c ++@@ -131,7 +131,7 @@ ++ bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_pos,args->max_ld); ++ 
bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_r2,args->max_ld); ++ } ++- bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->filter_r2 ) ++ args->filter_r2_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->filter_r2); ++ ++@@ -149,7 +149,7 @@ ++ { ++ if ( args->filter ) ++ filter_destroy(args->filter); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ vcfbuf_destroy(args->vcfbuf); ++ bcf_sr_destroy(args->sr); ++ free(args->info_pos); ++@@ -160,7 +160,7 @@ ++ { ++ bcf1_t *rec; ++ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) ++- bcf_write1(args->out_fh, args->hdr, rec); +++ if ( bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ static void process(args_t *args) ++ { ++@@ -253,9 +253,9 @@ ++ else if ( !strcasecmp("kb",tmp) ) args->ld_win *= -1000; ++ else error("Could not parse: --window %s\n", optarg); ++ break; ++- case 'T': args->target_is_file = 1; +++ case 'T': args->target_is_file = 1; // fall-through ++ case 't': args->target = optarg; break; ++- case 'R': args->region_is_file = 1; +++ case 'R': args->region_is_file = 1; // fall-through ++ case 'r': args->region = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'O': ++--- /dev/null +++++ python-pysam/bcftools/plugins/remove-overlaps.c ++@@ -0,0 +1,219 @@ +++/* +++ Copyright (C) 2017-2019 Genome Research Ltd. 
+++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. 
+++*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "vcfbuf.h" +++#include "filter.h" +++ +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++typedef struct +++{ +++ filter_t *filter; +++ char *filter_str; +++ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) +++ vcfbuf_t *vcfbuf; +++ int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, print_overlaps, rmdup; +++ char **argv, *region, *target, *fname, *output_fname; +++ htsFile *out_fh; +++ bcf_hdr_t *hdr; +++ bcf_srs_t *sr; +++} +++args_t; +++ +++const char *about(void) +++{ +++ return "Remove overlapping variants\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Remove overlapping variants.\n" +++ "\n" +++ "Usage: bcftools +remove-overlaps [Options]\n" +++ "Plugin options:\n" +++ " -d, --rm-dup remove only duplicate sites and remove them completely\n" +++ " -p, --print-overlaps do the opposite and print only overlapping sites\n" +++ " -v, --verbose print a list of removed sites\n" +++ "Standard options:\n" +++ " -e, --exclude EXPR exclude sites for which the expression is true\n" +++ " -i, --include EXPR include only sites for which the expression is true\n" +++ " -o, --output FILE write output to the FILE [standard output]\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -r, --regions REGION restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REGION similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n"; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->region ) +++ { +++ args->sr->require_index = 1; 
+++ if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region); +++ } +++ if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ args->vcfbuf = vcfbuf_init(args->hdr, 0); +++ if ( args->rmdup ) +++ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_RMDUP,1) +++ else +++ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_OVERLAP_WIN,1) +++ +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr, args->filter_str); +++} +++static void destroy_data(args_t *args) +++{ +++ if ( args->filter ) +++ filter_destroy(args->filter); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); +++ vcfbuf_destroy(args->vcfbuf); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static void flush(args_t *args, int flush_all) +++{ +++ int nbuf = vcfbuf_nsites(args->vcfbuf); +++ bcf1_t *rec; +++ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) +++ { +++ if ( nbuf>2 || (nbuf>1 && flush_all) ) +++ { +++ args->nrm++; +++ if ( args->verbose ) printf("%s\t%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ if ( args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ continue; // skip overlapping variants +++ } +++ if ( !args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ } +++} +++static void process(args_t *args) +++{ +++ args->ntot++; +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ if ( args->filter ) +++ { +++ int ret = filter_test(args->filter, rec, NULL); +++ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return; } +++ else if ( ret ) return; +++ } +++ bcf_sr_t *sr = bcf_sr_get_reader(args->sr, 0); +++ sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); +++ flush(args,0); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_type = FT_VCF; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"rm-dup",no_argument,NULL,'d'}, +++ {"print-overlaps",no_argument,NULL,'p'}, +++ {"exclude",required_argument,NULL,'e'}, +++ {"include",required_argument,NULL,'i'}, +++ {"regions",required_argument,NULL,'r'}, +++ {"regions-file",required_argument,NULL,'R'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"verbose",no_argument,NULL,'v'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, 
"r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'd': args->rmdup = 1; break; +++ case 'p': args->print_overlaps = 1; break; +++ case 'v': args->verbose = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 'T': args->target_is_file = 1; // fall-through +++ case 't': args->target = optarg; break; +++ case 'R': args->region_is_file = 1; // fall-through +++ case 'r': args->region = optarg; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s",usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s",usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) process(args); +++ flush(args,1); +++ +++ fprintf(stderr,"Processed/Removed\t%d\t%d\n",args->ntot,args->nrm); +++ +++ destroy_data(args); +++ return 0; +++} +++ +++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/remove-overlaps.c.pysam.c ++@@ -0,0 +1,221 @@ +++#include "bcftools.pysam.h" +++ +++/* +++ Copyright (C) 2017-2019 Genome Research Ltd. 
+++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. 
+++*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "vcfbuf.h" +++#include "filter.h" +++ +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++typedef struct +++{ +++ filter_t *filter; +++ char *filter_str; +++ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) +++ vcfbuf_t *vcfbuf; +++ int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, print_overlaps, rmdup; +++ char **argv, *region, *target, *fname, *output_fname; +++ htsFile *out_fh; +++ bcf_hdr_t *hdr; +++ bcf_srs_t *sr; +++} +++args_t; +++ +++const char *about(void) +++{ +++ return "Remove overlapping variants\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Remove overlapping variants.\n" +++ "\n" +++ "Usage: bcftools +remove-overlaps [Options]\n" +++ "Plugin options:\n" +++ " -d, --rm-dup remove only duplicate sites and remove them completely\n" +++ " -p, --print-overlaps do the opposite and print only overlapping sites\n" +++ " -v, --verbose print a list of removed sites\n" +++ "Standard options:\n" +++ " -e, --exclude EXPR exclude sites for which the expression is true\n" +++ " -i, --include EXPR include only sites for which the expression is true\n" +++ " -o, --output FILE write output to the FILE [standard output]\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -r, --regions REGION restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REGION similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n"; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->region ) +++ { +++ args->sr->require_index = 1; 
+++ if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region); +++ } +++ if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ args->vcfbuf = vcfbuf_init(args->hdr, 0); +++ if ( args->rmdup ) +++ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_RMDUP,1) +++ else +++ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_OVERLAP_WIN,1) +++ +++ if ( args->filter_str ) +++ args->filter = filter_init(args->hdr, args->filter_str); +++} +++static void destroy_data(args_t *args) +++{ +++ if ( args->filter ) +++ filter_destroy(args->filter); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); +++ vcfbuf_destroy(args->vcfbuf); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static void flush(args_t *args, int flush_all) +++{ +++ int nbuf = vcfbuf_nsites(args->vcfbuf); +++ bcf1_t *rec; +++ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) +++ { +++ if ( nbuf>2 || (nbuf>1 && flush_all) ) +++ { +++ args->nrm++; +++ if ( args->verbose ) fprintf(bcftools_stdout, "%s\t%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ if ( args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ continue; // skip overlapping variants +++ } +++ if ( !args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ } +++} +++static void process(args_t *args) +++{ +++ args->ntot++; +++ bcf1_t *rec = bcf_sr_get_line(args->sr,0); +++ if ( args->filter ) +++ { +++ int ret = filter_test(args->filter, rec, NULL); +++ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return; } +++ else if ( ret ) return; +++ } +++ bcf_sr_t *sr = bcf_sr_get_reader(args->sr, 0); +++ sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); +++ flush(args,0); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_type = FT_VCF; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"rm-dup",no_argument,NULL,'d'}, +++ {"print-overlaps",no_argument,NULL,'p'}, +++ {"exclude",required_argument,NULL,'e'}, +++ {"include",required_argument,NULL,'i'}, +++ {"regions",required_argument,NULL,'r'}, +++ {"regions-file",required_argument,NULL,'R'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"verbose",no_argument,NULL,'v'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, 
"r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'd': args->rmdup = 1; break; +++ case 'p': args->print_overlaps = 1; break; +++ case 'v': args->verbose = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 'T': args->target_is_file = 1; // fall-through +++ case 't': args->target = optarg; break; +++ case 'R': args->region_is_file = 1; // fall-through +++ case 'r': args->region = optarg; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s",usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s",usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) process(args); +++ flush(args,1); +++ +++ fprintf(bcftools_stderr,"Processed/Removed\t%d\t%d\n",args->ntot,args->nrm); +++ +++ destroy_data(args); +++ return 0; +++} +++ +++ ++--- python-pysam.orig/bcftools/plugins/setGT.c +++++ python-pysam/bcftools/plugins/setGT.c ++@@ -320,7 +320,7 @@ ++ hts_expand(int,rec->n_allele,args->marr,args->arr); ++ int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT); ++ if ( ret<= 0 ) ++- error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Could 
not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ ++ for(i=0; i < rec->n_allele; ++i) ++ { ++@@ -353,8 +353,8 @@ ++ int ia = bcf_gt_allele(ptr[0]); ++ int ib = bcf_gt_allele(ptr[1]); ++ if ( ia>=nbinom || ib>=nbinom ) ++- error("The sample %s has incorrect number of %s fields at %s:%d\n", ++- args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n", +++ args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ ++ double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]); ++ if ( !args->binom_cmp(prob,args->binom_val) ) continue; ++@@ -391,7 +391,7 @@ ++ ++ for (i=0; in_sample; i++) ++ { ++- if ( !args->smpl_pass[i] ) continue; +++ if ( args->smpl_pass && !args->smpl_pass[i] ) continue; ++ if ( args->new_mask>_UNPHASED ) ++ changed += unphase_gt(args->gts + i*ngts, ngts); ++ else if ( args->new_mask==GT_PHASED ) ++--- python-pysam.orig/bcftools/plugins/setGT.c.pysam.c +++++ python-pysam/bcftools/plugins/setGT.c.pysam.c ++@@ -322,7 +322,7 @@ ++ hts_expand(int,rec->n_allele,args->marr,args->arr); ++ int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT); ++ if ( ret<= 0 ) ++- error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ ++ for(i=0; i < rec->n_allele; ++i) ++ { ++@@ -355,8 +355,8 @@ ++ int ia = bcf_gt_allele(ptr[0]); ++ int ib = bcf_gt_allele(ptr[1]); ++ if ( ia>=nbinom || ib>=nbinom ) ++- error("The sample %s has incorrect number of %s fields at %s:%d\n", ++- args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1); +++ error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n", +++ 
args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); ++ ++ double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]); ++ if ( !args->binom_cmp(prob,args->binom_val) ) continue; ++@@ -393,7 +393,7 @@ ++ ++ for (i=0; in_sample; i++) ++ { ++- if ( !args->smpl_pass[i] ) continue; +++ if ( args->smpl_pass && !args->smpl_pass[i] ) continue; ++ if ( args->new_mask>_UNPHASED ) ++ changed += unphase_gt(args->gts + i*ngts, ngts); ++ else if ( args->new_mask==GT_PHASED ) ++--- python-pysam.orig/bcftools/plugins/smpl-stats.c +++++ python-pysam/bcftools/plugins/smpl-stats.c ++@@ -28,6 +28,7 @@ ++ #include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++@@ -230,11 +231,11 @@ ++ fprintf(fh,"# %d) number of indels\n", ++i); ++ fprintf(fh,"# %d) number of singletons\n", ++i); ++ fprintf(fh,"# %d) number of missing genotypes (./., ., ./0, etc)\n", ++i); ++- fprintf(fh,"# %d) number of transitions (genotypes such as \"1/2\" are counted twice)\n", ++i); ++- fprintf(fh,"# %d) number of transversions (genotypes such as \"1/2\" are counted twice)\n", ++i); +++ fprintf(fh,"# %d) number of transitions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); +++ fprintf(fh,"# %d) number of transversions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); ++ fprintf(fh,"# %d) overall ts/tv\n", ++i); ++ i = 0; ++- fprintf(fh,"# SITE* lines report numbers for every threshold and site:\n"); +++ fprintf(fh,"# SITE* lines report numbers for every threshold:\n"); ++ fprintf(fh,"# %d) filter id\n", ++i); ++ fprintf(fh,"# %d) number of sites which pass the filter\n", ++i); ++ fprintf(fh,"# %d) number of SNVs\n", ++i); ++@@ -390,7 +391,7 @@ ++ { ++ if ( als[j]==0 || als[j]==star_allele ) continue; ++ if ( als[j] >= rec->n_allele ) ++- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[j]); +++ error("The GT index is out of 
range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[j]); ++ ++ if ( args->ac[als[j]]==1 ) { stats->nsingleton++; site_singleton = 1; } ++ ++--- python-pysam.orig/bcftools/plugins/smpl-stats.c.pysam.c +++++ python-pysam/bcftools/plugins/smpl-stats.c.pysam.c ++@@ -30,6 +30,7 @@ ++ #include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++@@ -232,11 +233,11 @@ ++ fprintf(fh,"# %d) number of indels\n", ++i); ++ fprintf(fh,"# %d) number of singletons\n", ++i); ++ fprintf(fh,"# %d) number of missing genotypes (./., ., ./0, etc)\n", ++i); ++- fprintf(fh,"# %d) number of transitions (genotypes such as \"1/2\" are counted twice)\n", ++i); ++- fprintf(fh,"# %d) number of transversions (genotypes such as \"1/2\" are counted twice)\n", ++i); +++ fprintf(fh,"# %d) number of transitions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); +++ fprintf(fh,"# %d) number of transversions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); ++ fprintf(fh,"# %d) overall ts/tv\n", ++i); ++ i = 0; ++- fprintf(fh,"# SITE* lines report numbers for every threshold and site:\n"); +++ fprintf(fh,"# SITE* lines report numbers for every threshold:\n"); ++ fprintf(fh,"# %d) filter id\n", ++i); ++ fprintf(fh,"# %d) number of sites which pass the filter\n", ++i); ++ fprintf(fh,"# %d) number of SNVs\n", ++i); ++@@ -392,7 +393,7 @@ ++ { ++ if ( als[j]==0 || als[j]==star_allele ) continue; ++ if ( als[j] >= rec->n_allele ) ++- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[j]); +++ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[j]); ++ ++ if ( args->ac[als[j]]==1 ) { stats->nsingleton++; site_singleton = 1; } ++ ++--- /dev/null +++++ python-pysam/bcftools/plugins/split-vep.c ++@@ -0,0 +1,934 @@ +++/* The MIT License +++ +++ Copyright (c) 2019 Genome 
Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../bcftools.h" +++#include "../filter.h" +++#include "../convert.h" +++#include "../cols.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define SELECT_TR_ALL 0 +++#define SELECT_TR_WORST 1 +++#define SELECT_TR_PRIMARY 2 +++#define SELECT_CSQ_ANY -1 +++ +++typedef struct +++{ +++ char *field; // the name of the VEP field, e.g. Consequence,Gene,etc. 
+++ char *tag; // the name of the VCF tag: the annot_t.field with the -p prefix +++ int idx; // 0-based index within the VEP annotation string +++ int type; // annotation type, one of the BCF_HT_* types +++ kstring_t str; // annotation value, ready to pass to bcf_update_info_* +++} +++annot_t; +++ +++typedef struct +++{ +++ convert_t *convert; +++ filter_t *filter; +++ int argc, filter_logic, regions_is_file, targets_is_file, list_hdr; +++ kstring_t kstr; +++ char *filter_str, +++ *vep_tag; // the --annotation INFO tag to process +++ char **argv, *output_fname, *fname, *regions, *targets, *format_str; +++ int output_type; +++ htsFile *fh_vcf; +++ BGZF *fh_bgzf; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr, *hdr_out; +++ int nfield; // number of all available VEP fields +++ char **field; // list of all available VEP fields +++ int nannot; // number of requested fields +++ annot_t *annot; // requested fields +++ int nscale; // number of items in the severity scale +++ char **scale; // severity scale (list) +++ int ncsq_str; // the length of csq_str allocated by bcf_get_info_string() +++ char *csq_str; // the current bcf_get_info_string() result +++ int csq_idx, // the index of the Consequence field; for the --select CSQ option +++ primary_id; // the index of the CANONICAL field; for the --select TR option +++ char *severity, // the --severity scale option +++ *select, // the --select option +++ *column_str, // the --columns option +++ *annot_prefix; // the --annot-prefix option +++ void *field2idx, // VEP field name to index, used in initialization +++ *csq2severity; // consequence type to severity score +++ cols_t *cols_tr, // the current CSQ tag split into transcripts +++ *cols_csq; // the current CSQ transcript split into fields +++ int min_severity, max_severity; // ignore consequences outside this severity range +++ int drop_sites; // the -x, --drop-sites option +++ int select_tr; // one of SELECT_TR_* +++ uint8_t *smpl_pass; // for filtering at sample level, used with 
-f +++ int duplicate; // the -d, --duplicate option is set +++ char *all_fields_delim; // the -A, --all-fields option is set +++ float *farr; // helper arrays for bcf_update_* functions +++ int32_t *iarr; +++ int niarr,miarr, nfarr,mfarr; +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Query structured annotations such as the CSQ created by VEP.\n"; +++} +++ +++static const char *default_severity(void) +++{ +++ return +++ "# Default consequence substrings ordered in ascending order by severity.\n" +++ "# Consequences with the same severity can be put on the same line in arbitrary order.\n" +++ "intergenic\n" +++ "downstream upstream\n" +++ "intron\n" +++ "non_coding\n" +++ "regulatory\n" +++ "5_prime_utr 3_prime_utr\n" +++ "stop_retained start_retained synonymous\n" +++ "splice_region\n" +++ "coding_sequence\n" +++ "missense\n" +++ "inframe\n" +++ "exon_loss\n" +++ "disruptive\n" +++ "splice_acceptor splice_donor\n" +++ "start_lost stop_lost stop_gained frameshift\n"; +++} +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n" +++ " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n" +++ "Usage: bcftools +split-vep [Plugin Options]\n" +++ "Plugin options:\n" +++ " -a, --annotation STR INFO annotation to parse [CSQ]\n" +++ " -A, --all-fields DELIM Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n" +++ " filtering expression using the output field delimiter DELIM. This can be\n" +++ " \"tab\", \"space\" or an arbitrary string.\n" +++ " -c, --columns LIST[:type] Extract the fields listed either as indexes or names. 
The default type\n" +++ " of the new annotation is String but can be also Integer/Int or Float/Real.\n" +++ " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n" +++ " as comma-separated fields on a single line\n" +++ " -f, --format Formatting expression for non-VCF/BCF output, same as `bcftools query -f`\n" +++ " -l, --list Parse the VCF header and list the annotation fields\n" +++ " -p, --annot-prefix Prefix of INFO annotations to be created after splitting the CSQ string\n" +++ " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence. (See also the -x switch.)\n" +++ " TR, transcript: worst,primary(*),all [all]\n" +++ " CSQ, consequence: any,missense,missense+,etc [any]\n" +++ " (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n" +++ " -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n" +++ " the default scale\n" +++ " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n" +++ " This switch is intended for use with VCF/BCF output (i.e. 
-f not given).\n" +++ "Common options:\n" +++ " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" +++ " -i, --include EXPR Include sites and samples for which the expression is true\n" +++ " -o, --output FILE Output file name [stdout]\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF or text, v: uncompressed VCF or text [v]\n" +++ " -r, --regions REG Restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE Restrict to regions listed in a file\n" +++ " -t, --targets REG Similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" +++ "\n" +++ "Examples:\n" +++ " # List available fields of the INFO/CSQ annotation\n" +++ " bcftools +split-vep -l file.vcf.gz\n" +++ "\n" +++ " # List the default severity scale\n" +++ " bcftools +split-vep -S -\n" +++ "\n" +++ " # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n" +++ " # INFO annotations starting with the prefix \"vep\". 
For brevity, the columns can\n" +++ " # be given also as 0-based indexes\n" +++ " bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n" +++ " bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n" +++ "\n" +++ " # Same as above but use the text output of the \"bcftools query\" format\n" +++ " bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n" +++ "\n" +++ " # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n" +++ " bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n" +++ "\n" +++ " # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n" +++ " # numeric filtering can be used.\n" +++ " bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n" +++ "\n" +++ " # Similar to above, but add the annotation only if the consequence severity is missense\n" +++ " # or equivalent. In order to drop sites with different consequences completely, we add\n" +++ " # the -x switch. See the online documentation referenced above for more examples.\n" +++ " bcftools +split-vep -c gnomAD_AF:Float -s :missense file.vcf.gz\n" +++ " bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n" +++ "\n"; +++} +++ +++static void expand_csq_expression(args_t *args, kstring_t *str) +++{ +++ if ( !args->all_fields_delim ) return; +++ +++ str->l = 0; +++ kputc('%',str); +++ kputs(args->vep_tag,str); +++ char *ptr = strstr(args->format_str,str->s); +++ if ( !ptr ) return; +++ char *end = ptr + str->l, tmp = *end; +++ if ( isalnum(tmp) || tmp=='_' || tmp=='.' 
) return; +++ *end = 0; +++ +++ str->l = 0; +++ kputsn(args->format_str, ptr - args->format_str, str); +++ +++ int i; +++ for (i=0; infield; i++) +++ { +++ if ( i>0 ) kputs(args->all_fields_delim, str); +++ kputc('%', str); +++ kputs(args->field[i], str); +++ } +++ +++ *end = tmp; +++ kputs(end, str); +++ +++ free(args->format_str); +++ args->format_str = str->s; +++ str->l = str->m = 0; +++ str->s = NULL; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ args->hdr_out = bcf_hdr_dup(args->hdr); +++ +++ // Parse the header CSQ line, must contain Description with "Format: ..." 
declaration +++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL); +++ if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag); +++ int ret = bcf_hrec_find_key(hrec, "Description"); +++ if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag); +++ char *format = strstr(hrec->vals[ret], "Format: "); +++ if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]); +++ format += 8; +++ char *ep = format; +++ while ( *ep ) +++ { +++ char *bp = ep; +++ while ( *ep && *ep!='|' ) ep++; +++ char tmp = *ep; +++ *ep = 0; +++ args->nfield++; +++ args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field)); +++ args->field[args->nfield-1] = strdup(bp); +++ if ( !tmp ) break; +++ ep++; +++ } +++ if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); +++ int len = strlen(args->field[args->nfield-1]); +++ if ( args->field[args->nfield-1][len-1]=='"' ) args->field[args->nfield-1][len-1] = 0; // remove the trailing doublequote character +++ args->field2idx = khash_str2int_init(); +++ int i,j; +++ for (i=0; infield; i++) +++ { +++ if ( khash_str2int_has_key(args->field2idx, args->field[i]) ) +++ { +++ fprintf(stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]); +++ continue; +++ } +++ khash_str2int_set(args->field2idx, args->field[i], i); +++ } +++ +++ // Create a text output as with `bcftools query -f`. 
For this we need to determine the fields to be extracted +++ // from the formatting expression +++ kstring_t str = {0,0,0}; +++ if ( args->format_str && !args->column_str ) +++ { +++ // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present +++ if ( args->all_fields_delim ) expand_csq_expression(args, &str); +++ +++ for (i=0; infield; i++) +++ { +++ str.l = 0; +++ kputc('%',&str); +++ kputs(args->field[i],&str); +++ char end, *ptr = args->format_str; +++ while ( ptr ) +++ { +++ ptr = strstr(ptr,str.s); +++ if ( !ptr ) break; +++ end = ptr[str.l]; +++ if ( isalnum(end) || end=='_' || end=='.' ) +++ { +++ ptr++; +++ continue; +++ } +++ break; +++ } +++ if ( !ptr ) continue; +++ ptr[str.l] = 0; +++ int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); +++ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) +++ fprintf(stderr,"Note: ambigous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); +++ +++ int olen = args->column_str ? 
strlen(args->column_str) : 0; +++ int nlen = strlen(ptr) - 1; +++ args->column_str = (char*)realloc(args->column_str, olen + nlen + 2); +++ if ( olen ) +++ { +++ memcpy(args->column_str+olen,",",1); +++ olen++; +++ } +++ memcpy(args->column_str+olen,ptr+1,nlen); +++ args->column_str[olen+nlen] = 0; +++ +++ ptr[str.l] = end; +++ } +++ } +++ +++ // The "Consequence" column to look up severity, its name is hardwired for now +++ if ( khash_str2int_get(args->field2idx,"Consequence",&args->csq_idx)!=0 ) +++ error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); +++ +++ // Columns to extract: given as names, 0-based indexes or ranges of indexes +++ if ( args->column_str ) +++ { +++ int *column = NULL; +++ int *types = NULL; +++ ep = args->column_str; +++ while ( *ep ) +++ { +++ char *tp, *bp = ep; +++ while ( *ep && *ep!=',' ) ep++; +++ char tmp = *ep; +++ *ep = 0; +++ int type = BCF_HT_STR; +++ int idx_beg, idx_end; +++ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 ) +++ idx_end = idx_beg; +++ else if ( (tp=strrchr(bp,':')) ) +++ { +++ *tp = 0; +++ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 ) +++ { +++ *tp = ':'; +++ error("No such column: \"%s\"\n", bp); +++ } +++ idx_end = idx_beg; +++ *tp = ':'; +++ if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR; +++ else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL; +++ else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT; +++ else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG; +++ else error("The type \"%s\" (or column \"%s\"?) 
not recognised\n", tp+1,bp); +++ } +++ else +++ { +++ char *mp; +++ idx_beg = strtol(bp,&mp,10); +++ if ( !*mp ) idx_end = idx_beg; +++ else if ( *mp=='-' ) +++ idx_end = strtol(mp+1,&mp,10); +++ if ( *mp ) +++ { +++ if ( *mp==':' ) +++ { +++ idx_end = idx_beg; +++ if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR; +++ else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL; +++ else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT; +++ else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG; +++ else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp); +++ } +++ else +++ error("No such column: \"%s\"\n", bp); +++ } +++ } +++ +++ i = args->nannot; +++ args->nannot += idx_end - idx_beg + 1; +++ column = (int*)realloc(column,args->nannot*sizeof(*column)); +++ types = (int*)realloc(types,args->nannot*sizeof(*types)); +++ for (j=idx_beg; j<=idx_end; j++) +++ { +++ if ( j >= args->nfield ) error("The index is too big: %d\n", j); +++ column[i] = j; +++ types[i] = type; +++ i++; +++ } +++ if ( !tmp ) break; +++ ep++; +++ } +++ args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot)); +++ int len = args->annot_prefix ? 
strlen(args->annot_prefix) : 0; +++ for (i=0; inannot; i++) +++ { +++ annot_t *ann = &args->annot[i]; +++ ann->type = types[i]; +++ ann->idx = j = column[i]; +++ ann->field = strdup(args->field[j]); +++ int clen = strlen(args->field[j]); +++ ann->tag = (char*)malloc(clen+len+1); +++ if ( len ) memcpy(ann->tag,args->annot_prefix,len); +++ memcpy(ann->tag+len,ann->field,clen); +++ ann->tag[len+clen] = 0; +++ args->kstr.l = 0; +++ char *type = "String"; +++ if ( ann->type==BCF_HT_REAL ) type = "Float"; +++ else if ( ann->type==BCF_HT_INT ) type = "Integer"; +++ else if ( ann->type==BCF_HT_FLAG ) type = "Flag"; +++ ksprintf(&args->kstr,"##INFO=",type); +++ bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); +++ } +++ free(column); +++ free(types); +++ +++ if ( bcf_hdr_sync(args->hdr_out)<0 ) +++ error_errno("[%s] Failed to update header", __func__); +++ } +++ if ( args->format_str ) +++ { +++ if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag); +++ args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); +++ if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); +++ } +++ if ( args->filter_str ) +++ { +++ int max_unpack = args->convert ? 
convert_max_unpack(args->convert) : 0; +++ args->filter = filter_init(args->hdr_out, args->filter_str); +++ max_unpack |= filter_max_unpack(args->filter); +++ args->sr->max_unpack = max_unpack; +++ if ( max_unpack & BCF_UN_FMT ) +++ convert_set_option(args->convert, subset_samples, &args->smpl_pass); +++ } +++ +++ // Severity scale +++ args->csq2severity = khash_str2int_init(); +++ int severity = 0; +++ str.l = 0; +++ if ( args->severity ) +++ { +++ kstring_t tmp = {0,0,0}; +++ htsFile *fp = hts_open(args->severity,"r"); +++ if ( !fp ) error("Cannot read %s\n", args->severity); +++ while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) +++ { +++ kputs(tmp.s, &str); +++ kputc('\n', &str); +++ } +++ free(tmp.s); +++ } +++ else +++ kputs(default_severity(),&str); +++ ep = str.s; +++ while ( *ep ) +++ { +++ if ( *ep=='#' ) +++ { +++ while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } +++ if ( !*ep ) break; +++ ep++; +++ continue; +++ } +++ char *bp = ep; +++ while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } +++ char tmp = *ep; +++ *ep = 0; +++ args->nscale++; +++ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); +++ args->scale[args->nscale-1] = strdup(bp); +++ if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) +++ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); +++ if ( !tmp ) break; +++ if ( tmp=='\n' ) severity++; +++ ep++; +++ while ( *ep && isspace(*ep) ) ep++; +++ } +++ free(str.s); +++ +++ // Transcript and/or consequence selection +++ if ( !args->select ) args->select = "all:any"; +++ cols_t *cols = cols_split(args->select, NULL, ':'); +++ char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; +++ char *sel_csq = cols->n==2 && cols->off[1][0] ? 
cols->off[1] : "any"; +++ if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; +++ else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; +++ else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; +++ else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); +++ if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups +++ else +++ { +++ int len = strlen(sel_csq); +++ int severity, modifier = '='; +++ if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } +++ else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } +++ if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) +++ error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); +++ if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } +++ else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } +++ else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } +++ } +++ cols_destroy(cols); +++ +++ // The 'CANONICAL' column to look up severity, its name is hardwired for now +++ if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) +++ error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); +++} +++static void destroy_data(args_t *args) +++{ +++ free(args->farr); +++ free(args->iarr); +++ free(args->kstr.s); +++ free(args->column_str); +++ free(args->format_str); +++ cols_destroy(args->cols_csq); +++ cols_destroy(args->cols_tr); +++ int i; +++ for (i=0; inscale; i++) free(args->scale[i]); +++ free(args->scale); +++ for (i=0; infield; i++) free(args->field[i]); +++ free(args->field); +++ for (i=0; inannot; i++) +++ { +++ 
annot_t *ann = &args->annot[i]; +++ free(ann->field); +++ free(ann->tag); +++ free(ann->str.s); +++ } +++ free(args->annot); +++ if ( args->field2idx ) khash_str2int_destroy(args->field2idx); +++ if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity); +++ bcf_sr_destroy(args->sr); +++ bcf_hdr_destroy(args->hdr_out); +++ free(args->csq_str); +++ if ( args->filter ) filter_destroy(args->filter); +++ if ( args->convert ) convert_destroy(args->convert); +++ if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); +++ if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); +++ free(args); +++} +++static void list_header(args_t *args) +++{ +++ int i; +++ for (i=0; infield; i++) printf("%d\t%s\n", i,args->field[i]); +++} +++ +++static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match) +++{ +++ *min_severity = INT_MAX; +++ *max_severity = -1; +++ char *ep = csq; +++ while ( *ep ) +++ { +++ char *bp = ep; +++ while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; } +++ char tmp = *ep; +++ *ep = 0; +++ +++ int i, severity = -1; +++ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) +++ { +++ for (i=0; inscale; i++) +++ if ( strstr(bp,args->scale[i]) ) break; +++ +++ if ( i!=args->nscale ) +++ khash_str2int_get(args->csq2severity, args->scale[i], &severity); +++ else +++ severity = args->nscale + 1; +++ +++ args->nscale++; +++ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); +++ args->scale[args->nscale-1] = strdup(bp); +++ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); +++ if ( i==args->nscale ) +++ fprintf(stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale); +++ +++ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) error("FIXME: failed 
to look up the consequence \"%s\"\n", bp); +++ } +++ if ( exact_match < 0 ) +++ { +++ if ( *min_severity > severity ) *min_severity = severity; +++ if ( *max_severity < severity ) *max_severity = severity; +++ } +++ else +++ { +++ if ( severity==exact_match ) +++ { +++ *min_severity = *max_severity = severity; +++ *ep = tmp; +++ return; +++ } +++ } +++ +++ if ( !tmp ) break; +++ *ep = tmp; +++ ep++; +++ } +++} +++ +++static int csq_severity_pass(args_t *args, char *csq) +++{ +++ if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1; +++ +++ int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1; +++ csq_to_severity(args, csq, &min_severity, &max_severity, exact_match); +++ if ( max_severity < args->min_severity ) return 0; +++ if ( min_severity > args->max_severity ) return 0; +++ return 1; +++} +++ +++static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! +++{ +++ int i; +++ for (i=0; in; i++) +++ { +++ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->primary_id >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n); +++ if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i; +++ } +++ return -1; +++} +++static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! +++{ +++ int i, max_severity = -1, imax_severity = 0; +++ for (i=0; in; i++) +++ { +++ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->csq_idx >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. 
%d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); +++ char *csq = args->cols_csq->off[args->csq_idx]; +++ +++ int min, max; +++ csq_to_severity(args, csq, &min, &max, -1); +++ if ( max_severity < max ) { imax_severity = i; max_severity = max; } +++ } +++ return imax_severity; +++} +++static void annot_reset(annot_t *annot, int nannot) +++{ +++ int i; +++ for (i=0; istr.l ) kputc(',',&ann->str); +++ kputs(value, &ann->str); +++} +++static inline void parse_array_real(char *str, float **arr, int *marr, int *narr) +++{ +++ char *bp = str, *ep; +++ float *ptr = *arr; +++ int i, n = 1, m = *marr; +++ for (i=0; *bp; bp++) +++ if ( *bp == ',' ) n++; +++ +++ hts_expand(float*,n,m,ptr); +++ +++ i = 0; +++ bp = str; +++ while ( *bp ) +++ { +++ ptr[i] = strtod(bp, &ep); +++ if ( bp==ep ) +++ bcf_float_set_missing(ptr[i]); +++ i++; +++ while ( *ep && *ep!=',' ) ep++; +++ bp = *ep ? ep + 1 : ep; +++ } +++ *narr = i; +++ *marr = m; +++ *arr = ptr; +++} +++static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr) +++{ +++ char *bp = str, *ep; +++ int32_t *ptr = *arr; +++ int i, n = 1, m = *marr; +++ for (i=0; *bp; bp++) +++ if ( *bp == ',' ) n++; +++ +++ hts_expand(int32_t*,n,m,ptr); +++ +++ i = 0; +++ bp = str; +++ while ( *bp ) +++ { +++ ptr[i] = strtol(bp, &ep, 10); +++ if ( bp==ep ) +++ ptr[i] = bcf_int32_missing; +++ i++; +++ while ( *ep && *ep!=',' ) ep++; +++ bp = *ep ? 
ep + 1 : ep; +++ } +++ *narr = i; +++ *marr = m; +++ *arr = ptr; +++} +++static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing) +++{ +++ int i, updated = 0; +++ for (i=0; inannot; i++) +++ { +++ annot_t *ann = &args->annot[i]; +++ if ( !ann->str.l ) continue; +++ if ( ann->type==BCF_HT_REAL ) +++ { +++ parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr); +++ bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr); +++ } +++ else if ( ann->type==BCF_HT_INT ) +++ { +++ parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr); +++ bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr); +++ } +++ else +++ bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s); +++ updated++; +++ } +++ if ( args->filter ) +++ { +++ int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass); +++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; +++ if ( !pass ) return; +++ } +++ if ( args->format_str ) +++ { +++ if ( args->nannot ) +++ { +++ if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing +++ } +++ else +++ { +++ if ( !severity_pass ) return; // request to print only non-CSQ tags at sites that pass severity +++ } +++ +++ args->kstr.l = 0; +++ convert_line(args->convert, rec, &args->kstr); +++ if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l ) +++ error("Failed to write to %s\n", args->output_fname); +++ return; +++ } +++ if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 ) +++ error("Failed to write to %s\n", args->output_fname); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); +++ if ( len<=0 ) return; +++ +++ args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); +++ +++ int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1; 
+++ if ( args->select_tr==SELECT_TR_PRIMARY ) +++ { +++ itr_min = itr_max = get_primary_transcript(args, rec, args->cols_tr); +++ if ( itr_min<0 ) itr_max = itr_min - 1; +++ } +++ else if ( args->select_tr==SELECT_TR_WORST ) +++ itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr); +++ +++ annot_reset(args->annot, args->nannot); +++ int severity_pass = 0; // consequence severity requested via the -s option (BCF record may be output but not annotated) +++ int all_missing = 1; // transcripts with all requested annotations missing will be discarded if -f was given +++ static int too_few_fields_warned = 0; +++ for (i=itr_min; i<=itr_max; i++) +++ { +++ args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->csq_idx >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); +++ +++ char *csq = args->cols_csq->off[args->csq_idx]; +++ if ( !csq_severity_pass(args, csq) ) continue; +++ severity_pass = 1; +++ +++ for (j=0; jnannot; j++) +++ { +++ annot_t *ann = &args->annot[j]; +++ if ( ann->idx >= args->cols_csq->n ) +++ { +++ if ( !too_few_fields_warned ) +++ { +++ fprintf(stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. 
This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ too_few_fields_warned = 1; +++ } +++ annot_append(ann, "."); +++ continue; +++ } +++ +++ if ( !*args->cols_csq->off[ann->idx] ) +++ annot_append(ann, "."); // missing value +++ else +++ { +++ annot_append(ann, args->cols_csq->off[ann->idx]); +++ all_missing = 0; +++ } +++ } +++ +++ if ( args->duplicate ) +++ { +++ filter_and_output(args, rec, severity_pass, all_missing); +++ annot_reset(args->annot, args->nannot); +++ all_missing = 1; +++ severity_pass = 0; +++ } +++ } +++ if ( !severity_pass && args->drop_sites ) return; +++ if ( !args->duplicate ) +++ filter_and_output(args, rec, severity_pass, all_missing); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ args->output_type = FT_VCF; +++ args->vep_tag = "CSQ"; +++ static struct option loptions[] = +++ { +++ {"drop-sites",no_argument,0,'x'}, +++ {"all-fields",no_argument,0,'A'}, +++ {"duplicate",no_argument,0,'d'}, +++ {"format",required_argument,0,'f'}, +++ {"annotation",required_argument,0,'a'}, +++ {"annot-prefix",required_argument,0,'p'}, +++ {"columns",required_argument,0,'c'}, +++ {"select",required_argument,0,'s'}, +++ {"severity",required_argument,0,'S'}, +++ {"list",no_argument,0,'l'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:x",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'A': +++ if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t"; +++ else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " 
"; +++ else args->all_fields_delim = optarg; +++ break; +++ case 'x': args->drop_sites = 1; break; +++ case 'd': args->duplicate = 1; break; +++ case 'f': args->format_str = strdup(optarg); break; +++ case 'a': args->vep_tag = optarg; break; +++ case 'p': args->annot_prefix = optarg; break; +++ case 'c': args->column_str = strdup(optarg); break; +++ case 'S': args->severity = optarg; break; +++ case 's': args->select = optarg; break; +++ case 'l': args->list_hdr = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n"); +++ if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n"); +++ if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity()); +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ if ( 
args->list_hdr ) +++ list_header(args); +++ else +++ { +++ if ( !args->format_str && !args->column_str ) +++ { +++ if ( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY ) +++ error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n"); +++ else if ( !args->drop_sites ) +++ error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n"); +++ } +++ +++ if ( args->format_str ) +++ args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu"); +++ else +++ { +++ args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); +++ if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); +++ } +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ } +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/split-vep.c.pysam.c ++@@ -0,0 +1,936 @@ +++#include "bcftools.pysam.h" +++ +++/* The MIT License +++ +++ Copyright (c) 2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. 
+++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../bcftools.h" +++#include "../filter.h" +++#include "../convert.h" +++#include "../cols.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define SELECT_TR_ALL 0 +++#define SELECT_TR_WORST 1 +++#define SELECT_TR_PRIMARY 2 +++#define SELECT_CSQ_ANY -1 +++ +++typedef struct +++{ +++ char *field; // the name of the VEP field, e.g. Consequence,Gene,etc. 
+++ char *tag; // the name of the VCF tag: the annot_t.field with the -p prefix +++ int idx; // 0-based index within the VEP annotation string +++ int type; // annotation type, one of the BCF_HT_* types +++ kstring_t str; // annotation value, ready to pass to bcf_update_info_* +++} +++annot_t; +++ +++typedef struct +++{ +++ convert_t *convert; +++ filter_t *filter; +++ int argc, filter_logic, regions_is_file, targets_is_file, list_hdr; +++ kstring_t kstr; +++ char *filter_str, +++ *vep_tag; // the --annotation INFO tag to process +++ char **argv, *output_fname, *fname, *regions, *targets, *format_str; +++ int output_type; +++ htsFile *fh_vcf; +++ BGZF *fh_bgzf; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr, *hdr_out; +++ int nfield; // number of all available VEP fields +++ char **field; // list of all available VEP fields +++ int nannot; // number of requested fields +++ annot_t *annot; // requested fields +++ int nscale; // number of items in the severity scale +++ char **scale; // severity scale (list) +++ int ncsq_str; // the length of csq_str allocated by bcf_get_info_string() +++ char *csq_str; // the current bcf_get_info_string() result +++ int csq_idx, // the index of the Consequence field; for the --select CSQ option +++ primary_id; // the index of the CANONICAL field; for the --select TR option +++ char *severity, // the --severity scale option +++ *select, // the --select option +++ *column_str, // the --columns option +++ *annot_prefix; // the --annot-prefix option +++ void *field2idx, // VEP field name to index, used in initialization +++ *csq2severity; // consequence type to severity score +++ cols_t *cols_tr, // the current CSQ tag split into transcripts +++ *cols_csq; // the current CSQ transcript split into fields +++ int min_severity, max_severity; // ignore consequences outside this severity range +++ int drop_sites; // the -x, --drop-sites option +++ int select_tr; // one of SELECT_TR_* +++ uint8_t *smpl_pass; // for filtering at sample level, used with 
-f +++ int duplicate; // the -d, --duplicate option is set +++ char *all_fields_delim; // the -A, --all-fields option is set +++ float *farr; // helper arrays for bcf_update_* functions +++ int32_t *iarr; +++ int niarr,miarr, nfarr,mfarr; +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Query structured annotations such as the CSQ created by VEP.\n"; +++} +++ +++static const char *default_severity(void) +++{ +++ return +++ "# Default consequence substrings ordered in ascending order by severity.\n" +++ "# Consequences with the same severity can be put on the same line in arbitrary order.\n" +++ "intergenic\n" +++ "downstream upstream\n" +++ "intron\n" +++ "non_coding\n" +++ "regulatory\n" +++ "5_prime_utr 3_prime_utr\n" +++ "stop_retained start_retained synonymous\n" +++ "splice_region\n" +++ "coding_sequence\n" +++ "missense\n" +++ "inframe\n" +++ "exon_loss\n" +++ "disruptive\n" +++ "splice_acceptor splice_donor\n" +++ "start_lost stop_lost stop_gained frameshift\n"; +++} +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n" +++ " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n" +++ "Usage: bcftools +split-vep [Plugin Options]\n" +++ "Plugin options:\n" +++ " -a, --annotation STR INFO annotation to parse [CSQ]\n" +++ " -A, --all-fields DELIM Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n" +++ " filtering expression using the output field delimiter DELIM. This can be\n" +++ " \"tab\", \"space\" or an arbitrary string.\n" +++ " -c, --columns LIST[:type] Extract the fields listed either as indexes or names. 
The default type\n" +++ " of the new annotation is String but can be also Integer/Int or Float/Real.\n" +++ " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n" +++ " as comma-separated fields on a single line\n" +++ " -f, --format Formatting expression for non-VCF/BCF output, same as `bcftools query -f`\n" +++ " -l, --list Parse the VCF header and list the annotation fields\n" +++ " -p, --annot-prefix Prefix of INFO annotations to be created after splitting the CSQ string\n" +++ " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence. (See also the -x switch.)\n" +++ " TR, transcript: worst,primary(*),all [all]\n" +++ " CSQ, consequence: any,missense,missense+,etc [any]\n" +++ " (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n" +++ " -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n" +++ " the default scale\n" +++ " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n" +++ " This switch is intended for use with VCF/BCF output (i.e. 
-f not given).\n" +++ "Common options:\n" +++ " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" +++ " -i, --include EXPR Include sites and samples for which the expression is true\n" +++ " -o, --output FILE Output file name [bcftools_stdout]\n" +++ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF or text, v: uncompressed VCF or text [v]\n" +++ " -r, --regions REG Restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE Restrict to regions listed in a file\n" +++ " -t, --targets REG Similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" +++ "\n" +++ "Examples:\n" +++ " # List available fields of the INFO/CSQ annotation\n" +++ " bcftools +split-vep -l file.vcf.gz\n" +++ "\n" +++ " # List the default severity scale\n" +++ " bcftools +split-vep -S -\n" +++ "\n" +++ " # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n" +++ " # INFO annotations starting with the prefix \"vep\". 
For brevity, the columns can\n" +++ " # be given also as 0-based indexes\n" +++ " bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n" +++ " bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n" +++ "\n" +++ " # Same as above but use the text output of the \"bcftools query\" format\n" +++ " bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n" +++ "\n" +++ " # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n" +++ " bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n" +++ "\n" +++ " # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n" +++ " # numeric filtering can be used.\n" +++ " bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n" +++ "\n" +++ " # Similar to above, but add the annotation only if the consequence severity is missense\n" +++ " # or equivalent. In order to drop sites with different consequences completely, we add\n" +++ " # the -x switch. See the online documentation referenced above for more examples.\n" +++ " bcftools +split-vep -c gnomAD_AF:Float -s :missense file.vcf.gz\n" +++ " bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n" +++ "\n"; +++} +++ +++static void expand_csq_expression(args_t *args, kstring_t *str) +++{ +++ if ( !args->all_fields_delim ) return; +++ +++ str->l = 0; +++ kputc('%',str); +++ kputs(args->vep_tag,str); +++ char *ptr = strstr(args->format_str,str->s); +++ if ( !ptr ) return; +++ char *end = ptr + str->l, tmp = *end; +++ if ( isalnum(tmp) || tmp=='_' || tmp=='.' 
) return; +++ *end = 0; +++ +++ str->l = 0; +++ kputsn(args->format_str, ptr - args->format_str, str); +++ +++ int i; +++ for (i=0; infield; i++) +++ { +++ if ( i>0 ) kputs(args->all_fields_delim, str); +++ kputc('%', str); +++ kputs(args->field[i], str); +++ } +++ +++ *end = tmp; +++ kputs(end, str); +++ +++ free(args->format_str); +++ args->format_str = str->s; +++ str->l = str->m = 0; +++ str->s = NULL; +++} +++ +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ args->hdr_out = bcf_hdr_dup(args->hdr); +++ +++ // Parse the header CSQ line, must contain Description with "Format: ..." 
declaration +++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL); +++ if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag); +++ int ret = bcf_hrec_find_key(hrec, "Description"); +++ if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag); +++ char *format = strstr(hrec->vals[ret], "Format: "); +++ if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]); +++ format += 8; +++ char *ep = format; +++ while ( *ep ) +++ { +++ char *bp = ep; +++ while ( *ep && *ep!='|' ) ep++; +++ char tmp = *ep; +++ *ep = 0; +++ args->nfield++; +++ args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field)); +++ args->field[args->nfield-1] = strdup(bp); +++ if ( !tmp ) break; +++ ep++; +++ } +++ if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); +++ int len = strlen(args->field[args->nfield-1]); +++ if ( args->field[args->nfield-1][len-1]=='"' ) args->field[args->nfield-1][len-1] = 0; // remove the trailing doublequote character +++ args->field2idx = khash_str2int_init(); +++ int i,j; +++ for (i=0; infield; i++) +++ { +++ if ( khash_str2int_has_key(args->field2idx, args->field[i]) ) +++ { +++ fprintf(bcftools_stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]); +++ continue; +++ } +++ khash_str2int_set(args->field2idx, args->field[i], i); +++ } +++ +++ // Create a text output as with `bcftools query -f`. 
For this we need to determine the fields to be extracted +++ // from the formatting expression +++ kstring_t str = {0,0,0}; +++ if ( args->format_str && !args->column_str ) +++ { +++ // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present +++ if ( args->all_fields_delim ) expand_csq_expression(args, &str); +++ +++ for (i=0; infield; i++) +++ { +++ str.l = 0; +++ kputc('%',&str); +++ kputs(args->field[i],&str); +++ char end, *ptr = args->format_str; +++ while ( ptr ) +++ { +++ ptr = strstr(ptr,str.s); +++ if ( !ptr ) break; +++ end = ptr[str.l]; +++ if ( isalnum(end) || end=='_' || end=='.' ) +++ { +++ ptr++; +++ continue; +++ } +++ break; +++ } +++ if ( !ptr ) continue; +++ ptr[str.l] = 0; +++ int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); +++ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) +++ fprintf(bcftools_stderr,"Note: ambigous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); +++ +++ int olen = args->column_str ? 
strlen(args->column_str) : 0; +++ int nlen = strlen(ptr) - 1; +++ args->column_str = (char*)realloc(args->column_str, olen + nlen + 2); +++ if ( olen ) +++ { +++ memcpy(args->column_str+olen,",",1); +++ olen++; +++ } +++ memcpy(args->column_str+olen,ptr+1,nlen); +++ args->column_str[olen+nlen] = 0; +++ +++ ptr[str.l] = end; +++ } +++ } +++ +++ // The "Consequence" column to look up severity, its name is hardwired for now +++ if ( khash_str2int_get(args->field2idx,"Consequence",&args->csq_idx)!=0 ) +++ error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); +++ +++ // Columns to extract: given as names, 0-based indexes or ranges of indexes +++ if ( args->column_str ) +++ { +++ int *column = NULL; +++ int *types = NULL; +++ ep = args->column_str; +++ while ( *ep ) +++ { +++ char *tp, *bp = ep; +++ while ( *ep && *ep!=',' ) ep++; +++ char tmp = *ep; +++ *ep = 0; +++ int type = BCF_HT_STR; +++ int idx_beg, idx_end; +++ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 ) +++ idx_end = idx_beg; +++ else if ( (tp=strrchr(bp,':')) ) +++ { +++ *tp = 0; +++ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 ) +++ { +++ *tp = ':'; +++ error("No such column: \"%s\"\n", bp); +++ } +++ idx_end = idx_beg; +++ *tp = ':'; +++ if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR; +++ else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL; +++ else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT; +++ else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG; +++ else error("The type \"%s\" (or column \"%s\"?) 
not recognised\n", tp+1,bp); +++ } +++ else +++ { +++ char *mp; +++ idx_beg = strtol(bp,&mp,10); +++ if ( !*mp ) idx_end = idx_beg; +++ else if ( *mp=='-' ) +++ idx_end = strtol(mp+1,&mp,10); +++ if ( *mp ) +++ { +++ if ( *mp==':' ) +++ { +++ idx_end = idx_beg; +++ if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR; +++ else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL; +++ else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT; +++ else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG; +++ else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp); +++ } +++ else +++ error("No such column: \"%s\"\n", bp); +++ } +++ } +++ +++ i = args->nannot; +++ args->nannot += idx_end - idx_beg + 1; +++ column = (int*)realloc(column,args->nannot*sizeof(*column)); +++ types = (int*)realloc(types,args->nannot*sizeof(*types)); +++ for (j=idx_beg; j<=idx_end; j++) +++ { +++ if ( j >= args->nfield ) error("The index is too big: %d\n", j); +++ column[i] = j; +++ types[i] = type; +++ i++; +++ } +++ if ( !tmp ) break; +++ ep++; +++ } +++ args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot)); +++ int len = args->annot_prefix ? 
strlen(args->annot_prefix) : 0; +++ for (i=0; inannot; i++) +++ { +++ annot_t *ann = &args->annot[i]; +++ ann->type = types[i]; +++ ann->idx = j = column[i]; +++ ann->field = strdup(args->field[j]); +++ int clen = strlen(args->field[j]); +++ ann->tag = (char*)malloc(clen+len+1); +++ if ( len ) memcpy(ann->tag,args->annot_prefix,len); +++ memcpy(ann->tag+len,ann->field,clen); +++ ann->tag[len+clen] = 0; +++ args->kstr.l = 0; +++ char *type = "String"; +++ if ( ann->type==BCF_HT_REAL ) type = "Float"; +++ else if ( ann->type==BCF_HT_INT ) type = "Integer"; +++ else if ( ann->type==BCF_HT_FLAG ) type = "Flag"; +++ ksprintf(&args->kstr,"##INFO=",type); +++ bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); +++ } +++ free(column); +++ free(types); +++ +++ if ( bcf_hdr_sync(args->hdr_out)<0 ) +++ error_errno("[%s] Failed to update header", __func__); +++ } +++ if ( args->format_str ) +++ { +++ if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag); +++ args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); +++ if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); +++ } +++ if ( args->filter_str ) +++ { +++ int max_unpack = args->convert ? 
convert_max_unpack(args->convert) : 0; +++ args->filter = filter_init(args->hdr_out, args->filter_str); +++ max_unpack |= filter_max_unpack(args->filter); +++ args->sr->max_unpack = max_unpack; +++ if ( max_unpack & BCF_UN_FMT ) +++ convert_set_option(args->convert, subset_samples, &args->smpl_pass); +++ } +++ +++ // Severity scale +++ args->csq2severity = khash_str2int_init(); +++ int severity = 0; +++ str.l = 0; +++ if ( args->severity ) +++ { +++ kstring_t tmp = {0,0,0}; +++ htsFile *fp = hts_open(args->severity,"r"); +++ if ( !fp ) error("Cannot read %s\n", args->severity); +++ while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) +++ { +++ kputs(tmp.s, &str); +++ kputc('\n', &str); +++ } +++ free(tmp.s); +++ } +++ else +++ kputs(default_severity(),&str); +++ ep = str.s; +++ while ( *ep ) +++ { +++ if ( *ep=='#' ) +++ { +++ while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } +++ if ( !*ep ) break; +++ ep++; +++ continue; +++ } +++ char *bp = ep; +++ while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } +++ char tmp = *ep; +++ *ep = 0; +++ args->nscale++; +++ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); +++ args->scale[args->nscale-1] = strdup(bp); +++ if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) +++ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); +++ if ( !tmp ) break; +++ if ( tmp=='\n' ) severity++; +++ ep++; +++ while ( *ep && isspace(*ep) ) ep++; +++ } +++ free(str.s); +++ +++ // Transcript and/or consequence selection +++ if ( !args->select ) args->select = "all:any"; +++ cols_t *cols = cols_split(args->select, NULL, ':'); +++ char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; +++ char *sel_csq = cols->n==2 && cols->off[1][0] ? 
cols->off[1] : "any"; +++ if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; +++ else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; +++ else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; +++ else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); +++ if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups +++ else +++ { +++ int len = strlen(sel_csq); +++ int severity, modifier = '='; +++ if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } +++ else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } +++ if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) +++ error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); +++ if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } +++ else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } +++ else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } +++ } +++ cols_destroy(cols); +++ +++ // The 'CANONICAL' column to look up severity, its name is hardwired for now +++ if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) +++ error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); +++} +++static void destroy_data(args_t *args) +++{ +++ free(args->farr); +++ free(args->iarr); +++ free(args->kstr.s); +++ free(args->column_str); +++ free(args->format_str); +++ cols_destroy(args->cols_csq); +++ cols_destroy(args->cols_tr); +++ int i; +++ for (i=0; inscale; i++) free(args->scale[i]); +++ free(args->scale); +++ for (i=0; infield; i++) free(args->field[i]); +++ free(args->field); +++ for (i=0; inannot; i++) +++ { +++ 
annot_t *ann = &args->annot[i]; +++ free(ann->field); +++ free(ann->tag); +++ free(ann->str.s); +++ } +++ free(args->annot); +++ if ( args->field2idx ) khash_str2int_destroy(args->field2idx); +++ if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity); +++ bcf_sr_destroy(args->sr); +++ bcf_hdr_destroy(args->hdr_out); +++ free(args->csq_str); +++ if ( args->filter ) filter_destroy(args->filter); +++ if ( args->convert ) convert_destroy(args->convert); +++ if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); +++ if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); +++ free(args); +++} +++static void list_header(args_t *args) +++{ +++ int i; +++ for (i=0; infield; i++) fprintf(bcftools_stdout, "%d\t%s\n", i,args->field[i]); +++} +++ +++static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match) +++{ +++ *min_severity = INT_MAX; +++ *max_severity = -1; +++ char *ep = csq; +++ while ( *ep ) +++ { +++ char *bp = ep; +++ while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; } +++ char tmp = *ep; +++ *ep = 0; +++ +++ int i, severity = -1; +++ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) +++ { +++ for (i=0; inscale; i++) +++ if ( strstr(bp,args->scale[i]) ) break; +++ +++ if ( i!=args->nscale ) +++ khash_str2int_get(args->csq2severity, args->scale[i], &severity); +++ else +++ severity = args->nscale + 1; +++ +++ args->nscale++; +++ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); +++ args->scale[args->nscale-1] = strdup(bp); +++ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); +++ if ( i==args->nscale ) +++ fprintf(bcftools_stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale); +++ +++ if ( khash_str2int_get(args->csq2severity, bp, 
&severity)!=0 ) error("FIXME: failed to look up the consequence \"%s\"\n", bp); +++ } +++ if ( exact_match < 0 ) +++ { +++ if ( *min_severity > severity ) *min_severity = severity; +++ if ( *max_severity < severity ) *max_severity = severity; +++ } +++ else +++ { +++ if ( severity==exact_match ) +++ { +++ *min_severity = *max_severity = severity; +++ *ep = tmp; +++ return; +++ } +++ } +++ +++ if ( !tmp ) break; +++ *ep = tmp; +++ ep++; +++ } +++} +++ +++static int csq_severity_pass(args_t *args, char *csq) +++{ +++ if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1; +++ +++ int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1; +++ csq_to_severity(args, csq, &min_severity, &max_severity, exact_match); +++ if ( max_severity < args->min_severity ) return 0; +++ if ( min_severity > args->max_severity ) return 0; +++ return 1; +++} +++ +++static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! +++{ +++ int i; +++ for (i=0; in; i++) +++ { +++ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->primary_id >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n); +++ if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i; +++ } +++ return -1; +++} +++static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! +++{ +++ int i, max_severity = -1, imax_severity = 0; +++ for (i=0; in; i++) +++ { +++ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->csq_idx >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. 
%d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); +++ char *csq = args->cols_csq->off[args->csq_idx]; +++ +++ int min, max; +++ csq_to_severity(args, csq, &min, &max, -1); +++ if ( max_severity < max ) { imax_severity = i; max_severity = max; } +++ } +++ return imax_severity; +++} +++static void annot_reset(annot_t *annot, int nannot) +++{ +++ int i; +++ for (i=0; istr.l ) kputc(',',&ann->str); +++ kputs(value, &ann->str); +++} +++static inline void parse_array_real(char *str, float **arr, int *marr, int *narr) +++{ +++ char *bp = str, *ep; +++ float *ptr = *arr; +++ int i, n = 1, m = *marr; +++ for (i=0; *bp; bp++) +++ if ( *bp == ',' ) n++; +++ +++ hts_expand(float*,n,m,ptr); +++ +++ i = 0; +++ bp = str; +++ while ( *bp ) +++ { +++ ptr[i] = strtod(bp, &ep); +++ if ( bp==ep ) +++ bcf_float_set_missing(ptr[i]); +++ i++; +++ while ( *ep && *ep!=',' ) ep++; +++ bp = *ep ? ep + 1 : ep; +++ } +++ *narr = i; +++ *marr = m; +++ *arr = ptr; +++} +++static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr) +++{ +++ char *bp = str, *ep; +++ int32_t *ptr = *arr; +++ int i, n = 1, m = *marr; +++ for (i=0; *bp; bp++) +++ if ( *bp == ',' ) n++; +++ +++ hts_expand(int32_t*,n,m,ptr); +++ +++ i = 0; +++ bp = str; +++ while ( *bp ) +++ { +++ ptr[i] = strtol(bp, &ep, 10); +++ if ( bp==ep ) +++ ptr[i] = bcf_int32_missing; +++ i++; +++ while ( *ep && *ep!=',' ) ep++; +++ bp = *ep ? 
ep + 1 : ep; +++ } +++ *narr = i; +++ *marr = m; +++ *arr = ptr; +++} +++static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing) +++{ +++ int i, updated = 0; +++ for (i=0; inannot; i++) +++ { +++ annot_t *ann = &args->annot[i]; +++ if ( !ann->str.l ) continue; +++ if ( ann->type==BCF_HT_REAL ) +++ { +++ parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr); +++ bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr); +++ } +++ else if ( ann->type==BCF_HT_INT ) +++ { +++ parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr); +++ bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr); +++ } +++ else +++ bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s); +++ updated++; +++ } +++ if ( args->filter ) +++ { +++ int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass); +++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; +++ if ( !pass ) return; +++ } +++ if ( args->format_str ) +++ { +++ if ( args->nannot ) +++ { +++ if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing +++ } +++ else +++ { +++ if ( !severity_pass ) return; // request to print only non-CSQ tags at sites that pass severity +++ } +++ +++ args->kstr.l = 0; +++ convert_line(args->convert, rec, &args->kstr); +++ if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l ) +++ error("Failed to write to %s\n", args->output_fname); +++ return; +++ } +++ if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 ) +++ error("Failed to write to %s\n", args->output_fname); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); +++ if ( len<=0 ) return; +++ +++ args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); +++ +++ int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1; 
+++ if ( args->select_tr==SELECT_TR_PRIMARY ) +++ { +++ itr_min = itr_max = get_primary_transcript(args, rec, args->cols_tr); +++ if ( itr_min<0 ) itr_max = itr_min - 1; +++ } +++ else if ( args->select_tr==SELECT_TR_WORST ) +++ itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr); +++ +++ annot_reset(args->annot, args->nannot); +++ int severity_pass = 0; // consequence severity requested via the -s option (BCF record may be output but not annotated) +++ int all_missing = 1; // transcripts with all requested annotations missing will be discarded if -f was given +++ static int too_few_fields_warned = 0; +++ for (i=itr_min; i<=itr_max; i++) +++ { +++ args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|'); +++ if ( args->csq_idx >= args->cols_csq->n ) +++ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); +++ +++ char *csq = args->cols_csq->off[args->csq_idx]; +++ if ( !csq_severity_pass(args, csq) ) continue; +++ severity_pass = 1; +++ +++ for (j=0; jnannot; j++) +++ { +++ annot_t *ann = &args->annot[j]; +++ if ( ann->idx >= args->cols_csq->n ) +++ { +++ if ( !too_few_fields_warned ) +++ { +++ fprintf(bcftools_stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. 
This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ too_few_fields_warned = 1; +++ } +++ annot_append(ann, "."); +++ continue; +++ } +++ +++ if ( !*args->cols_csq->off[ann->idx] ) +++ annot_append(ann, "."); // missing value +++ else +++ { +++ annot_append(ann, args->cols_csq->off[ann->idx]); +++ all_missing = 0; +++ } +++ } +++ +++ if ( args->duplicate ) +++ { +++ filter_and_output(args, rec, severity_pass, all_missing); +++ annot_reset(args->annot, args->nannot); +++ all_missing = 1; +++ severity_pass = 0; +++ } +++ } +++ if ( !severity_pass && args->drop_sites ) return; +++ if ( !args->duplicate ) +++ filter_and_output(args, rec, severity_pass, all_missing); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ args->output_type = FT_VCF; +++ args->vep_tag = "CSQ"; +++ static struct option loptions[] = +++ { +++ {"drop-sites",no_argument,0,'x'}, +++ {"all-fields",no_argument,0,'A'}, +++ {"duplicate",no_argument,0,'d'}, +++ {"format",required_argument,0,'f'}, +++ {"annotation",required_argument,0,'a'}, +++ {"annot-prefix",required_argument,0,'p'}, +++ {"columns",required_argument,0,'c'}, +++ {"select",required_argument,0,'s'}, +++ {"severity",required_argument,0,'S'}, +++ {"list",no_argument,0,'l'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:x",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 'A': +++ if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t"; +++ else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " 
"; +++ else args->all_fields_delim = optarg; +++ break; +++ case 'x': args->drop_sites = 1; break; +++ case 'd': args->duplicate = 1; break; +++ case 'f': args->format_str = strdup(optarg); break; +++ case 'a': args->vep_tag = optarg; break; +++ case 'p': args->annot_prefix = optarg; break; +++ case 'c': args->column_str = strdup(optarg); break; +++ case 'S': args->severity = optarg; break; +++ case 's': args->select = optarg; break; +++ case 'l': args->list_hdr = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ } +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n"); +++ if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n"); +++ if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity()); +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ init_data(args); +++ +++ if ( 
args->list_hdr ) +++ list_header(args); +++ else +++ { +++ if ( !args->format_str && !args->column_str ) +++ { +++ if ( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY ) +++ error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n"); +++ else if ( !args->drop_sites ) +++ error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n"); +++ } +++ +++ if ( args->format_str ) +++ args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu"); +++ else +++ { +++ args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); +++ if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); +++ } +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ } +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- python-pysam.orig/bcftools/plugins/split.c +++++ python-pysam/bcftools/plugins/split.c ++@@ -178,26 +178,6 @@ ++ if ( !nsmpl ) error("No samples to split: %s\n", args->fname); ++ args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh)); ++ args->bnames = set_file_base_names(args); ++- kstring_t str = {0,0,0}; ++- for (i=0; ibnames[i] ) continue; ++- str.l = 0; ++- kputs(args->output_dir, &str); ++- if ( str.s[str.l-1] != '/' ) kputc('/', &str); ++- int k, l = str.l; ++- kputs(args->bnames[i], &str); ++- for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); ++- else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); ++- else kputs(".vcf", &str); ++- args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); ++- if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno)); ++- bcf_hdr_nsamples(args->hdr_out) = 1; ++- args->hdr_out->samples[0] = args->bnames[i]; ++- bcf_hdr_write(args->fh[i], args->hdr_out); ++- } ++- free(str.s); ++ ++ // parse tags ++ int is_info = 0, is_fmt = 0; 
++@@ -235,6 +215,57 @@ ++ { ++ args->keep_info = args->keep_fmt = 1; ++ } +++ if ( !args->keep_fmt && !args->nfmt_tags ) args->keep_fmt = 1; +++ if ( !args->keep_info || args->ninfo_tags || args->nfmt_tags ) +++ { +++ int j; +++ for (j=args->hdr_out->nhrec-1; j>=0; j--) +++ { +++ bcf_hrec_t *hrec = args->hdr_out->hrec[j]; +++ if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue; +++ int k = bcf_hrec_find_key(hrec,"ID"); +++ assert( k>=0 ); // this should always be true for valid VCFs +++ int remove = 0; +++ if ( hrec->type==BCF_HL_INFO && (!args->keep_info || args->ninfo_tags) ) +++ { +++ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); +++ if ( !args->keep_info || id >= args->ninfo_tags || !args->info_tags[id] ) remove = 1; +++ } +++ if ( hrec->type==BCF_HL_FMT && args->nfmt_tags ) +++ { +++ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); +++ if ( id >= args->nfmt_tags || !args->fmt_tags[id] ) remove = 1; +++ } +++ if ( remove ) +++ { +++ char *str = strdup(hrec->vals[k]); +++ bcf_hdr_remove(args->hdr_out,hrec->type,str); +++ free(str); +++ } +++ } +++ if ( bcf_hdr_sync(args->hdr_out)!=0 ) error("Failed to update the VCF header\n"); +++ } +++ +++ kstring_t str = {0,0,0}; +++ for (i=0; ibnames[i] ) continue; +++ str.l = 0; +++ kputs(args->output_dir, &str); +++ if ( str.s[str.l-1] != '/' ) kputc('/', &str); +++ int k, l = str.l; +++ kputs(args->bnames[i], &str); +++ for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); +++ else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); +++ else kputs(".vcf", &str); +++ args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); +++ if ( args->fh[i] == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); +++ bcf_hdr_nsamples(args->hdr_out) = 1; +++ args->hdr_out->samples[0] = args->bnames[i]; +++ if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s); +++ } +++ 
free(str.s); ++ } ++ static void destroy_data(args_t *args) ++ { ++@@ -245,7 +276,7 @@ ++ int i, nsmpl = bcf_hdr_nsamples(args->hdr_in); ++ for (i=0; ifh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed!\n"); +++ if ( args->fh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed .. %s\n",args->bnames[i]); ++ free(args->bnames[i]); ++ } ++ free(args->bnames); ++@@ -307,7 +338,7 @@ ++ { ++ bcf_fmt_t *fmt = &src->d.fmt[i]; ++ int id = fmt->id; ++- if ( !args->keep_fmt && !args->fmt_tags[id] ) continue; +++ if ( !args->keep_fmt && (id>=args->nfmt_tags || !args->fmt_tags[id]) ) continue; ++ ++ bcf_enc_int1(&tmp, id); ++ bcf_enc_size(&tmp, fmt->n, fmt->type); ++@@ -343,7 +374,7 @@ ++ } ++ if ( !out ) out = rec_set_info(args, rec); ++ rec_set_format(args, rec, i, out); ++- bcf_write(args->fh[i], args->hdr_out, out); +++ if ( bcf_write(args->fh[i], args->hdr_out, out)!=0 ) error("[%s] Error: failed to write the record\n", __func__); ++ } ++ if ( out ) bcf_destroy(out); ++ } ++--- python-pysam.orig/bcftools/plugins/split.c.pysam.c +++++ python-pysam/bcftools/plugins/split.c.pysam.c ++@@ -180,26 +180,6 @@ ++ if ( !nsmpl ) error("No samples to split: %s\n", args->fname); ++ args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh)); ++ args->bnames = set_file_base_names(args); ++- kstring_t str = {0,0,0}; ++- for (i=0; ibnames[i] ) continue; ++- str.l = 0; ++- kputs(args->output_dir, &str); ++- if ( str.s[str.l-1] != '/' ) kputc('/', &str); ++- int k, l = str.l; ++- kputs(args->bnames[i], &str); ++- for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); ++- else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); ++- else kputs(".vcf", &str); ++- args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); ++- if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno)); ++- bcf_hdr_nsamples(args->hdr_out) = 1; ++- args->hdr_out->samples[0] = args->bnames[i]; ++- bcf_hdr_write(args->fh[i], args->hdr_out); ++- } ++- 
free(str.s); ++ ++ // parse tags ++ int is_info = 0, is_fmt = 0; ++@@ -237,6 +217,57 @@ ++ { ++ args->keep_info = args->keep_fmt = 1; ++ } +++ if ( !args->keep_fmt && !args->nfmt_tags ) args->keep_fmt = 1; +++ if ( !args->keep_info || args->ninfo_tags || args->nfmt_tags ) +++ { +++ int j; +++ for (j=args->hdr_out->nhrec-1; j>=0; j--) +++ { +++ bcf_hrec_t *hrec = args->hdr_out->hrec[j]; +++ if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue; +++ int k = bcf_hrec_find_key(hrec,"ID"); +++ assert( k>=0 ); // this should always be true for valid VCFs +++ int remove = 0; +++ if ( hrec->type==BCF_HL_INFO && (!args->keep_info || args->ninfo_tags) ) +++ { +++ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); +++ if ( !args->keep_info || id >= args->ninfo_tags || !args->info_tags[id] ) remove = 1; +++ } +++ if ( hrec->type==BCF_HL_FMT && args->nfmt_tags ) +++ { +++ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); +++ if ( id >= args->nfmt_tags || !args->fmt_tags[id] ) remove = 1; +++ } +++ if ( remove ) +++ { +++ char *str = strdup(hrec->vals[k]); +++ bcf_hdr_remove(args->hdr_out,hrec->type,str); +++ free(str); +++ } +++ } +++ if ( bcf_hdr_sync(args->hdr_out)!=0 ) error("Failed to update the VCF header\n"); +++ } +++ +++ kstring_t str = {0,0,0}; +++ for (i=0; ibnames[i] ) continue; +++ str.l = 0; +++ kputs(args->output_dir, &str); +++ if ( str.s[str.l-1] != '/' ) kputc('/', &str); +++ int k, l = str.l; +++ kputs(args->bnames[i], &str); +++ for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); +++ else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); +++ else kputs(".vcf", &str); +++ args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); +++ if ( args->fh[i] == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); +++ bcf_hdr_nsamples(args->hdr_out) = 1; +++ args->hdr_out->samples[0] = args->bnames[i]; +++ if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 ) error("[%s] 
Error: cannot write the header to %s\n", __func__,str.s); +++ } +++ free(str.s); ++ } ++ static void destroy_data(args_t *args) ++ { ++@@ -247,7 +278,7 @@ ++ int i, nsmpl = bcf_hdr_nsamples(args->hdr_in); ++ for (i=0; ifh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed!\n"); +++ if ( args->fh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed .. %s\n",args->bnames[i]); ++ free(args->bnames[i]); ++ } ++ free(args->bnames); ++@@ -309,7 +340,7 @@ ++ { ++ bcf_fmt_t *fmt = &src->d.fmt[i]; ++ int id = fmt->id; ++- if ( !args->keep_fmt && !args->fmt_tags[id] ) continue; +++ if ( !args->keep_fmt && (id>=args->nfmt_tags || !args->fmt_tags[id]) ) continue; ++ ++ bcf_enc_int1(&tmp, id); ++ bcf_enc_size(&tmp, fmt->n, fmt->type); ++@@ -345,7 +376,7 @@ ++ } ++ if ( !out ) out = rec_set_info(args, rec); ++ rec_set_format(args, rec, i, out); ++- bcf_write(args->fh[i], args->hdr_out, out); +++ if ( bcf_write(args->fh[i], args->hdr_out, out)!=0 ) error("[%s] Error: failed to write the record\n", __func__); ++ } ++ if ( out ) bcf_destroy(out); ++ } ++--- python-pysam.orig/bcftools/plugins/tag2tag.c +++++ python-pysam/bcftools/plugins/tag2tag.c ++@@ -26,6 +26,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include "bcftools.h" ++@@ -217,8 +218,8 @@ ++ } ++ ++ if ( j!=nals*(nals+1)/2 ) ++- error("Wrong number of GP values for diploid genotype at %s:%d, expected %d, found %d\n", ++- bcf_seqname(in_hdr,rec),rec->pos+1, nals*(nals+1)/2,j); +++ error("Wrong number of GP values for diploid genotype at %s:%"PRId64", expected %d, found %d\n", +++ bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1, nals*(nals+1)/2,j); ++ ++ if (ptr[jmax] < 1-thresh) ++ { ++--- python-pysam.orig/bcftools/plugins/tag2tag.c.pysam.c +++++ python-pysam/bcftools/plugins/tag2tag.c.pysam.c ++@@ -28,6 +28,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include "bcftools.h" ++@@ -219,8 +220,8 @@ ++ } ++ ++ if ( j!=nals*(nals+1)/2 ) ++- 
error("Wrong number of GP values for diploid genotype at %s:%d, expected %d, found %d\n", ++- bcf_seqname(in_hdr,rec),rec->pos+1, nals*(nals+1)/2,j); +++ error("Wrong number of GP values for diploid genotype at %s:%"PRId64", expected %d, found %d\n", +++ bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1, nals*(nals+1)/2,j); ++ ++ if (ptr[jmax] < 1-thresh) ++ { ++--- /dev/null +++++ python-pysam/bcftools/plugins/trio-dnm.c ++@@ -0,0 +1,444 @@ +++/* The MIT License +++ +++ Copyright (c) 2018-2019 Genome Research Ltd. +++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? 
+++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ int idx[3]; // VCF sample index for child, father, mother +++ int pass; // do all three pass the filters? +++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, regions_is_file, targets_is_file, output_type; +++ char *filter_str; +++ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; +++ htsFile *out_fh; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr, *hdr_out; +++ trio_t *trio; +++ int has_fmt_ad; +++ int ntrio, mtrio; +++ int32_t *pl, *ad, *dnm_qual, *vaf; // input FMT/PL and AD values, output DNM and VAF +++ int mpl, mad; +++ double min_score; +++ double *aprob; // proband's allele probabilities +++ double *pl3; // normalized PLs converted to probs for proband,father,mother +++ int maprob, mpl3, midx, *idx, force_ad; +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Screen variants for possible de-novo mutations in trios.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Screen variants for possible de-novo mutations in trios\n" +++ "Usage: bcftools +trio-dnm [Plugin Options]\n" +++ "Plugin options:\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" +++ " -o, --output FILE output file name [stdout]\n" +++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -p, --pfm P,F,M sample names of proband, father, and mother\n" +++ " -P, --ped FILE PED file\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n" +++ "Example:\n" +++ " # Annotate VCF with FORMAT/DNM, run for a single trio\n" +++ " bcftools +trio-dnm -p proband,father,mother file.bcf\n" +++ "\n" +++ " # Same as above, but read the trio(s) from a PED file\n" +++ " bcftools +trio-dnm -P file.ped file.bcf\n" +++ "\n" +++ " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" +++ " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" +++ "\n"; +++} +++ +++static int cmp_trios(const void *_a, const void *_b) +++{ +++ trio_t *a = (trio_t *) _a; +++ trio_t *b = (trio_t *) _b; +++ int i; +++ int amin = a->idx[0]; +++ for (i=1; i<3; i++) +++ if ( amin > a->idx[i] ) amin = a->idx[i]; +++ int bmin = b->idx[0]; +++ for (i=1; i<3; i++) +++ if ( bmin > b->idx[i] ) bmin = b->idx[i]; +++ if ( amin < bmin ) return -1; +++ if ( amin > bmin ) return 1; +++ return 0; +++} +++static void parse_ped(args_t *args, char *fname) +++{ +++ htsFile *fp = hts_open(fname, "r"); +++ if ( !fp ) error("Could not read: %s\n", fname); +++ +++ kstring_t str = {0,0,0}; +++ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); +++ +++ int moff = 0, *off = NULL; 
+++ do +++ { +++ // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment +++ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 +++ int ncols = ksplit_core(str.s,0,&moff,&off); +++ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); +++ +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); +++ if ( father<0 ) continue; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); +++ if ( mother<0 ) continue; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); +++ if ( child<0 ) continue; +++ +++ args->ntrio++; +++ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); +++ trio_t *trio = &args->trio[args->ntrio-1]; +++ trio->idx[iFATHER] = father; +++ trio->idx[iMOTHER] = mother; +++ trio->idx[iCHILD] = child; +++ } +++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); +++ +++ fprintf(stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); +++ +++ // sort the sample by index so that they are accessed more or less sequentially +++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); +++ +++ free(str.s); +++ free(off); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); +++} +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ int id; +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ fprintf(stderr, "Warning: the tag FORMAT/AD is not present in %s, the output tag FORMAT/VAF will not be added\n", args->fname); +++ else +++ args->has_fmt_ad = 1; +++ +++ args->hdr_out = bcf_hdr_dup(args->hdr); +++ bcf_hdr_append(args->hdr_out, "##FORMAT="); +++ if ( args->has_fmt_ad ) +++ bcf_hdr_append(args->hdr_out, "##FORMAT="); +++ +++ int i, n = 0; +++ char **list; +++ if ( args->pfm ) +++ { +++ args->ntrio = 1; +++ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); +++ list = hts_readlist(args->pfm, 0, &n); +++ if ( n!=3 ) error("Expected three sample names with -t\n"); +++ args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); +++ args->trio[0].idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); +++ args->trio[0].idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); +++ for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); +++ free(list[i]); +++ } +++ free(list); +++ } +++ else +++ { +++ parse_ped(args,args->ped_fname); +++ if ( !args->ntrio ) error("No 
complete trio present\n"); +++ } +++ +++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ args->dnm_qual = (int32_t*) malloc(sizeof(*args->dnm_qual)*bcf_hdr_nsamples(args->hdr)); +++ args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); +++} +++static void destroy_data(args_t *args) +++{ +++ free(args->pl3); +++ free(args->aprob); +++ free(args->idx); +++ free(args->dnm_qual); +++ free(args->vaf); +++ free(args->trio); +++ free(args->pl); +++ free(args->ad); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); +++ bcf_hdr_destroy(args->hdr_out); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static float process_trio(args_t *args, int nals, double *pl[3], int npl, int *al0, int *al1) +++{ +++ assert( nals>1 ); +++ +++ // determine the two most likely proband's alleles +++ int i,j,k = 0,tmp; +++ +++ hts_expand(int,nals,args->midx,args->idx); +++ hts_expand(double,nals,args->maprob,args->aprob); +++ for (i=0; iaprob[i] = 0; +++ for (i=0; iaprob[i] += pl[iCHILD][k]; +++ args->aprob[j] += pl[iCHILD][k]; +++ k++; +++ } +++ } +++ +++ // sort in descendent order +++ double *arr = args->aprob; +++ int *idx = args->idx; +++ for (i=0; i0 && arr[idx[j]] > arr[idx[j-1]]; j--) +++ tmp = idx[j], idx[j] = idx[j-1], idx[j-1] = tmp; +++ +++ if ( idx[0] < idx[1] ) { *al0 = idx[0]; *al1 = idx[1]; } +++ else { *al0 = idx[1]; *al1 = idx[0]; } +++ +++ // Calculate the probability of inheriting the 00, 01, and 11 genotype. 
For DNM they all will be small +++ int k00 = bcf_alleles2gt(idx[0],idx[0]); +++ int k01 = bcf_alleles2gt(idx[0],idx[1]); +++ int k11 = bcf_alleles2gt(idx[1],idx[1]); +++ double pd00 = pl[iCHILD][k00] * (pl[iFATHER][k00] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]); +++ double pd11 = pl[iCHILD][k11] * (pl[iFATHER][k11] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]); +++ double pd01 = pl[iCHILD][k01] * (pl[iFATHER][k00] * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]) + pl[iFATHER][k11] * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]) +++ + 0.5*pl[iFATHER][k01] * (pl[iMOTHER][k00] + pl[iMOTHER][k01] + pl[iMOTHER][k11])); +++ +++ double max = pd01; +++ if ( max < pd00 ) max = pd00; +++ if ( max < pd11 ) max = pd11; +++ return fabs(4.3429 * log(max)); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ if ( rec->n_allele==1 ) +++ { +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ return; +++ } +++ static int n_ad_warned = 0; +++ int nret, nsmpl = bcf_hdr_nsamples(args->hdr), n_ad = args->has_fmt_ad; +++ if ( n_ad ) +++ { +++ nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); +++ if ( nret<=0 ) n_ad = 0; +++ else +++ { +++ n_ad = nret / nsmpl; +++ if ( nret != nsmpl * rec->n_allele ) +++ { +++ if ( !n_ad_warned ) +++ { +++ hts_log_warning("Incorrect number of fields for FORMAT/AD at %s:%"PRId64". 
This warning is printed only once", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ n_ad_warned = 1; +++ } +++ if ( !args->force_ad ) n_ad = 0; +++ } +++ } +++ } +++ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); +++ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ int npl1 = nret/nsmpl; +++ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) +++ error("fixme: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); +++ hts_expand(double,3*npl1,args->mpl3,args->pl3); +++ int i, j, k, al0, al1, write_dnm = 0, ad_set = 0; +++ for (i=0; idnm_qual[i] = bcf_int32_missing; +++ for (i=0; intrio; i++) +++ { +++ double *ppl[3]; +++ for (j=0; j<3; j++) +++ { +++ int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; +++ double *dst = ppl[j] = args->pl3 + j*npl1; +++ double sum = 0; +++ for (k=0; kn_allele, ppl, npl1, &al0, &al1); +++ if ( score >= args->min_score ) +++ { +++ write_dnm = 1; +++ args->dnm_qual[ args->trio[i].idx[iCHILD] ] = score; +++ } +++ +++ if ( n_ad ) +++ { +++ if ( al0 < n_ad && al1 < n_ad ) +++ { +++ ad_set = 1; +++ for (j=0; j<3; j++) +++ { +++ int32_t *src = args->ad + n_ad * args->trio[i].idx[j]; +++ args->vaf[ args->trio[i].idx[j] ] = src[al0]+src[al1] ? 
round(src[al1]*100./(src[al0]+src[al1])) : 0; +++ } +++ } +++ else +++ for (j=0; j<3; j++) args->vaf[ args->trio[i].idx[j] ] = bcf_int32_missing; +++ } +++ } +++ if ( write_dnm ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,rec,"DNM",args->dnm_qual,nsmpl)!=0 ) +++ error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ if ( ad_set ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,rec,"VAF",args->vaf,nsmpl)!=0 ) +++ error("Failed to write FORMAT/VAF at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ } +++ } +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"force-AD",no_argument,0,1}, +++ {"min-score",required_argument,0,'m'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"ped",required_argument,NULL,'P'}, +++ {"pfm",required_argument,NULL,'p'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ char *tmp; +++ while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 1 : args->force_ad = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 
'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ }; +++ break; +++ case 'P': args->ped_fname = optarg; break; +++ case 'p': args->pfm = optarg; break; +++ case 'm': args->min_score = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --min-score %s\n", optarg); +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); +++ if ( args->ped_fname && args->pfm ) error("Expected only -p or -P option, not both\n"); +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/trio-dnm.c.pysam.c ++@@ -0,0 +1,446 @@ +++#include "bcftools.pysam.h" +++ +++/* The MIT License +++ +++ Copyright (c) 2018-2019 Genome Research Ltd. 
+++ +++ Author: Petr Danecek +++ +++ Permission is hereby granted, free of charge, to any person obtaining a copy +++ of this software and associated documentation files (the "Software"), to deal +++ in the Software without restriction, including without limitation the rights +++ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++ copies of the Software, and to permit persons to whom the Software is +++ furnished to do so, subject to the following conditions: +++ +++ The above copyright notice and this permission notice shall be included in +++ all copies or substantial portions of the Software. +++ +++ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++ THE SOFTWARE. +++ +++ */ +++ +++#include +++#include +++#include +++#include +++#include // for isatty +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "bcftools.h" +++#include "filter.h" +++ +++ +++// Logic of the filters: include or exclude sites which match the filters? +++#define FLT_INCLUDE 1 +++#define FLT_EXCLUDE 2 +++ +++#define iCHILD 0 +++#define iFATHER 1 +++#define iMOTHER 2 +++ +++typedef struct +++{ +++ int idx[3]; // VCF sample index for child, father, mother +++ int pass; // do all three pass the filters? 
+++} +++trio_t; +++ +++typedef struct +++{ +++ int argc, filter_logic, regions_is_file, targets_is_file, output_type; +++ char *filter_str; +++ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; +++ htsFile *out_fh; +++ bcf_srs_t *sr; +++ bcf_hdr_t *hdr, *hdr_out; +++ trio_t *trio; +++ int has_fmt_ad; +++ int ntrio, mtrio; +++ int32_t *pl, *ad, *dnm_qual, *vaf; // input FMT/PL and AD values, output DNM and VAF +++ int mpl, mad; +++ double min_score; +++ double *aprob; // proband's allele probabilities +++ double *pl3; // normalized PLs converted to probs for proband,father,mother +++ int maprob, mpl3, midx, *idx, force_ad; +++} +++args_t; +++ +++args_t args; +++ +++const char *about(void) +++{ +++ return "Screen variants for possible de-novo mutations in trios.\n"; +++} +++ +++static const char *usage_text(void) +++{ +++ return +++ "\n" +++ "About: Screen variants for possible de-novo mutations in trios\n" +++ "Usage: bcftools +trio-dnm [Plugin Options]\n" +++ "Plugin options:\n" +++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" +++ " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" +++ " -i, --include EXPR include sites and samples for which the expression is true\n" +++ " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" +++ " -o, --output FILE output file name [bcftools_stdout]\n" +++ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" +++ " -p, --pfm P,F,M sample names of proband, father, and mother\n" +++ " -P, --ped FILE PED file\n" +++ " -r, --regions REG restrict to comma-separated list of regions\n" +++ " -R, --regions-file FILE restrict to regions listed in a file\n" +++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" +++ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" +++ "\n" +++ "Example:\n" +++ " # Annotate VCF with FORMAT/DNM, run for a single trio\n" +++ " bcftools +trio-dnm -p proband,father,mother file.bcf\n" +++ "\n" +++ " # Same as above, but read the trio(s) from a PED file\n" +++ " bcftools +trio-dnm -P file.ped file.bcf\n" +++ "\n" +++ " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" +++ " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" +++ "\n"; +++} +++ +++static int cmp_trios(const void *_a, const void *_b) +++{ +++ trio_t *a = (trio_t *) _a; +++ trio_t *b = (trio_t *) _b; +++ int i; +++ int amin = a->idx[0]; +++ for (i=1; i<3; i++) +++ if ( amin > a->idx[i] ) amin = a->idx[i]; +++ int bmin = b->idx[0]; +++ for (i=1; i<3; i++) +++ if ( bmin > b->idx[i] ) bmin = b->idx[i]; +++ if ( amin < bmin ) return -1; +++ if ( amin > bmin ) return 1; +++ return 0; +++} +++static void parse_ped(args_t *args, char *fname) +++{ +++ htsFile *fp = hts_open(fname, "r"); +++ if ( !fp ) error("Could not read: %s\n", fname); +++ +++ kstring_t str = {0,0,0}; +++ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); +++ +++ int moff = 0, *off 
= NULL; +++ do +++ { +++ // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment +++ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 +++ int ncols = ksplit_core(str.s,0,&moff,&off); +++ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); +++ +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); +++ if ( father<0 ) continue; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); +++ if ( mother<0 ) continue; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); +++ if ( child<0 ) continue; +++ +++ args->ntrio++; +++ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); +++ trio_t *trio = &args->trio[args->ntrio-1]; +++ trio->idx[iFATHER] = father; +++ trio->idx[iMOTHER] = mother; +++ trio->idx[iCHILD] = child; +++ } +++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); +++ +++ fprintf(bcftools_stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); +++ +++ // sort the sample by index so that they are accessed more or less sequentially +++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); +++ +++ free(str.s); +++ free(off); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); +++} +++static void init_data(args_t *args) +++{ +++ args->sr = bcf_sr_init(); +++ if ( args->regions ) +++ { +++ args->sr->require_index = 1; +++ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); +++ } +++ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); +++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); +++ args->hdr = bcf_sr_get_header(args->sr,0); +++ +++ int id; +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); +++ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) +++ fprintf(bcftools_stderr, "Warning: the tag FORMAT/AD is not present in %s, the output tag FORMAT/VAF will not be added\n", args->fname); +++ else +++ args->has_fmt_ad = 1; +++ +++ args->hdr_out = bcf_hdr_dup(args->hdr); +++ bcf_hdr_append(args->hdr_out, "##FORMAT="); +++ if ( args->has_fmt_ad ) +++ bcf_hdr_append(args->hdr_out, "##FORMAT="); +++ +++ int i, n = 0; +++ char **list; +++ if ( args->pfm ) +++ { +++ args->ntrio = 1; +++ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); +++ list = hts_readlist(args->pfm, 0, &n); +++ if ( n!=3 ) error("Expected three sample names with -t\n"); +++ args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); +++ args->trio[0].idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); +++ args->trio[0].idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); +++ for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); +++ free(list[i]); +++ } +++ free(list); +++ } +++ else +++ { +++ parse_ped(args,args->ped_fname); +++ if ( !args->ntrio ) 
error("No complete trio present\n"); +++ } +++ +++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); +++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ +++ args->dnm_qual = (int32_t*) malloc(sizeof(*args->dnm_qual)*bcf_hdr_nsamples(args->hdr)); +++ args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); +++} +++static void destroy_data(args_t *args) +++{ +++ free(args->pl3); +++ free(args->aprob); +++ free(args->idx); +++ free(args->dnm_qual); +++ free(args->vaf); +++ free(args->trio); +++ free(args->pl); +++ free(args->ad); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); +++ bcf_hdr_destroy(args->hdr_out); +++ bcf_sr_destroy(args->sr); +++ free(args); +++} +++static float process_trio(args_t *args, int nals, double *pl[3], int npl, int *al0, int *al1) +++{ +++ assert( nals>1 ); +++ +++ // determine the two most likely proband's alleles +++ int i,j,k = 0,tmp; +++ +++ hts_expand(int,nals,args->midx,args->idx); +++ hts_expand(double,nals,args->maprob,args->aprob); +++ for (i=0; iaprob[i] = 0; +++ for (i=0; iaprob[i] += pl[iCHILD][k]; +++ args->aprob[j] += pl[iCHILD][k]; +++ k++; +++ } +++ } +++ +++ // sort in descendent order +++ double *arr = args->aprob; +++ int *idx = args->idx; +++ for (i=0; i0 && arr[idx[j]] > arr[idx[j-1]]; j--) +++ tmp = idx[j], idx[j] = idx[j-1], idx[j-1] = tmp; +++ +++ if ( idx[0] < idx[1] ) { *al0 = idx[0]; *al1 = idx[1]; } +++ else { *al0 = idx[1]; *al1 = idx[0]; } +++ +++ // Calculate the probability of inheriting the 00, 01, and 11 genotype. 
For DNM they all will be small +++ int k00 = bcf_alleles2gt(idx[0],idx[0]); +++ int k01 = bcf_alleles2gt(idx[0],idx[1]); +++ int k11 = bcf_alleles2gt(idx[1],idx[1]); +++ double pd00 = pl[iCHILD][k00] * (pl[iFATHER][k00] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]); +++ double pd11 = pl[iCHILD][k11] * (pl[iFATHER][k11] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]); +++ double pd01 = pl[iCHILD][k01] * (pl[iFATHER][k00] * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]) + pl[iFATHER][k11] * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]) +++ + 0.5*pl[iFATHER][k01] * (pl[iMOTHER][k00] + pl[iMOTHER][k01] + pl[iMOTHER][k11])); +++ +++ double max = pd01; +++ if ( max < pd00 ) max = pd00; +++ if ( max < pd11 ) max = pd11; +++ return fabs(4.3429 * log(max)); +++} +++static void process_record(args_t *args, bcf1_t *rec) +++{ +++ if ( rec->n_allele==1 ) +++ { +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ return; +++ } +++ static int n_ad_warned = 0; +++ int nret, nsmpl = bcf_hdr_nsamples(args->hdr), n_ad = args->has_fmt_ad; +++ if ( n_ad ) +++ { +++ nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); +++ if ( nret<=0 ) n_ad = 0; +++ else +++ { +++ n_ad = nret / nsmpl; +++ if ( nret != nsmpl * rec->n_allele ) +++ { +++ if ( !n_ad_warned ) +++ { +++ hts_log_warning("Incorrect number of fields for FORMAT/AD at %s:%"PRId64". 
This warning is printed only once", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ n_ad_warned = 1; +++ } +++ if ( !args->force_ad ) n_ad = 0; +++ } +++ } +++ } +++ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); +++ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ int npl1 = nret/nsmpl; +++ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) +++ error("fixme: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); +++ hts_expand(double,3*npl1,args->mpl3,args->pl3); +++ int i, j, k, al0, al1, write_dnm = 0, ad_set = 0; +++ for (i=0; idnm_qual[i] = bcf_int32_missing; +++ for (i=0; intrio; i++) +++ { +++ double *ppl[3]; +++ for (j=0; j<3; j++) +++ { +++ int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; +++ double *dst = ppl[j] = args->pl3 + j*npl1; +++ double sum = 0; +++ for (k=0; kn_allele, ppl, npl1, &al0, &al1); +++ if ( score >= args->min_score ) +++ { +++ write_dnm = 1; +++ args->dnm_qual[ args->trio[i].idx[iCHILD] ] = score; +++ } +++ +++ if ( n_ad ) +++ { +++ if ( al0 < n_ad && al1 < n_ad ) +++ { +++ ad_set = 1; +++ for (j=0; j<3; j++) +++ { +++ int32_t *src = args->ad + n_ad * args->trio[i].idx[j]; +++ args->vaf[ args->trio[i].idx[j] ] = src[al0]+src[al1] ? 
round(src[al1]*100./(src[al0]+src[al1])) : 0; +++ } +++ } +++ else +++ for (j=0; j<3; j++) args->vaf[ args->trio[i].idx[j] ] = bcf_int32_missing; +++ } +++ } +++ if ( write_dnm ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,rec,"DNM",args->dnm_qual,nsmpl)!=0 ) +++ error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ if ( ad_set ) +++ { +++ if ( bcf_update_format_int32(args->hdr_out,rec,"VAF",args->vaf,nsmpl)!=0 ) +++ error("Failed to write FORMAT/VAF at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); +++ } +++ } +++ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); +++} +++ +++int run(int argc, char **argv) +++{ +++ args_t *args = (args_t*) calloc(1,sizeof(args_t)); +++ args->argc = argc; args->argv = argv; +++ args->output_fname = "-"; +++ static struct option loptions[] = +++ { +++ {"force-AD",no_argument,0,1}, +++ {"min-score",required_argument,0,'m'}, +++ {"include",required_argument,0,'i'}, +++ {"exclude",required_argument,0,'e'}, +++ {"output",required_argument,NULL,'o'}, +++ {"output-type",required_argument,NULL,'O'}, +++ {"ped",required_argument,NULL,'P'}, +++ {"pfm",required_argument,NULL,'p'}, +++ {"regions",1,0,'r'}, +++ {"regions-file",1,0,'R'}, +++ {"targets",1,0,'t'}, +++ {"targets-file",1,0,'T'}, +++ {NULL,0,NULL,0} +++ }; +++ int c; +++ char *tmp; +++ while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:",loptions,NULL)) >= 0) +++ { +++ switch (c) +++ { +++ case 1 : args->force_ad = 1; break; +++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; +++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; +++ case 't': args->targets = optarg; break; +++ case 'T': args->targets = optarg; args->targets_is_file = 1; break; +++ case 'r': args->regions = optarg; break; +++ case 
'R': args->regions = optarg; args->regions_is_file = 1; break; +++ case 'o': args->output_fname = optarg; break; +++ case 'O': +++ switch (optarg[0]) { +++ case 'b': args->output_type = FT_BCF_GZ; break; +++ case 'u': args->output_type = FT_BCF; break; +++ case 'z': args->output_type = FT_VCF_GZ; break; +++ case 'v': args->output_type = FT_VCF; break; +++ default: error("The output type \"%s\" not recognised\n", optarg); +++ }; +++ break; +++ case 'P': args->ped_fname = optarg; break; +++ case 'p': args->pfm = optarg; break; +++ case 'm': args->min_score = strtod(optarg,&tmp); +++ if ( *tmp ) error("Could not parse: --min-score %s\n", optarg); +++ break; +++ case 'h': +++ case '?': +++ default: error("%s", usage_text()); break; +++ } +++ } +++ if ( optind==argc ) +++ { +++ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin +++ else { error("%s", usage_text()); } +++ } +++ else if ( optind+1!=argc ) error("%s", usage_text()); +++ else args->fname = argv[optind]; +++ +++ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); +++ if ( args->ped_fname && args->pfm ) error("Expected only -p or -P option, not both\n"); +++ +++ init_data(args); +++ +++ while ( bcf_sr_next_line(args->sr) ) +++ process_record(args, bcf_sr_get_line(args->sr,0)); +++ +++ destroy_data(args); +++ +++ return 0; +++} ++--- python-pysam.orig/bcftools/plugins/trio-stats.c +++++ python-pysam/bcftools/plugins/trio-stats.c ++@@ -1,6 +1,6 @@ ++ /* The MIT License ++ ++- Copyright (c) 2018 Genome Research Ltd. +++ Copyright (c) 2018-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -26,14 +26,17 @@ ++ ++ #include ++ #include +++#include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ #include "filter.h" ++ ++@@ -46,6 +49,9 @@ ++ #define iFATHER 1 ++ #define iMOTHER 2 ++ +++#define VERBOSE_MENDEL 1 +++#define VERBOSE_TRANSMITTED 2 +++ ++ typedef struct ++ { ++ int idx[3]; // VCF sample index for father, mother and child ++@@ -58,11 +64,13 @@ ++ uint32_t ++ npass, // number of genotypes passing the filter ++ nnon_ref, // number of non-reference genotypes ++- nmendel_err, // number of mendelian errors +++ nmendel_err, // number of DNMs / mendelian errors ++ nnovel, // a singleton allele, but observed only in the child. Counted as mendel_err as well. ++ nsingleton, // het mother or father different from everyone else ++- ndoubleton, // het mother+child or father+child different from everyone else ++- nts, ntv; // number of transitions and transversions +++ ndoubleton, // het mother+child or father+child different from everyone else (transmitted alleles) +++ nts, ntv, // number of transitions and transversions +++ ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen) +++ ndnm_hom; // number of homozygous DNMs / mendelian errors ++ } ++ trio_stats_t; ++ ++@@ -76,18 +84,33 @@ ++ ++ typedef struct ++ { +++ kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio? 
+++ uint32_t +++ nalt, // number of all alternate trios +++ nsd, // number of singleton or doubleton trios +++ *idx; // indexes of the singleton and doubleon trios +++} +++alt_trios_t; // for one alt allele +++ +++typedef struct +++{ +++ int max_alt_trios; // maximum number of alternate trios [1] +++ int malt_trios; +++ alt_trios_t *alt_trios; ++ int argc, filter_logic, regions_is_file, targets_is_file; ++ int nflt_str; ++ char *filter_str, **flt_str; ++- char **argv, *ped_fname, *output_fname, *fname, *regions, *targets; +++ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; ++ bcf_srs_t *sr; ++ bcf_hdr_t *hdr; ++ trio_t *trio; ++ int ntrio, mtrio; ++ flt_stats_t *filters; ++ int nfilters; ++- int32_t *gt_arr, *ac, *ac_trio; ++- int mgt_arr, mac, mac_trio; +++ int32_t *gt_arr, *ac, *ac_trio, *dnm_als; +++ int mgt_arr, mac, mac_trio, mdnm_als; +++ int verbose; +++ FILE *fp_out; ++ } ++ args_t; ++ ++@@ -106,10 +129,14 @@ ++ " a range of values simultaneously\n" ++ "Usage: bcftools +trio-stats [Plugin Options]\n" ++ "Plugin options:\n" +++ " -a, --alt-trios INT for transmission rate consider only sites with at most this\n" +++ " many alternate trios, 0 for unlimited [0]\n" +++ " -d, --debug TYPE comma-separted list of features: {mendel-errors,transmitted}\n" ++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" ++ " -i, --include EXPR include sites and samples for which the expression is true\n" ++ " -o, --output FILE output file name [stdout]\n" ++ " -p, --ped FILE PED file\n" +++ " -P, --pfm P,F,M sample names of proband, father, and mother\n" ++ " -r, --regions REG restrict to comma-separated list of regions\n" ++ " -R, --regions-file FILE restrict to regions listed in a file\n" ++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" ++@@ -169,13 +196,14 @@ ++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); ++ ++ fprintf(stderr,"Identified %d complete trios in the VCF file\n", 
args->ntrio); +++ if ( !args->ntrio ) error("No complete trio identified\n"); ++ ++ // sort the sample by index so that they are accessed more or less sequentially ++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); ++ ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); ++ } ++ ++ static void parse_filters(args_t *args) ++@@ -231,7 +259,33 @@ ++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ ++- parse_ped(args, args->ped_fname); +++ if ( args->ped_fname ) +++ parse_ped(args, args->ped_fname); +++ else +++ { +++ args->ntrio = 1; +++ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); +++ int ibeg, iend = 0; +++ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; +++ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); +++ args->pfm[iend] = 0; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm); +++ if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm); +++ args->pfm[iend] = ','; +++ ibeg = ++iend; +++ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; +++ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); +++ args->pfm[iend] = 0; +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); +++ if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); +++ args->pfm[iend] = ','; +++ ibeg = ++iend; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); +++ if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); +++ args->trio[0].idx[iFATHER] = father; +++ args->trio[0].idx[iMOTHER] = mother; +++ args->trio[0].idx[iCHILD] = child; +++ } ++ parse_filters(args); ++ ++ int i; ++@@ -261,6 +315,66 @@ ++ } ++ for (i=0; infilters; i++) ++ args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t)); +++ +++ args->fp_out = !args->output_fname || 
!strcmp("-",args->output_fname) ? stdout : fopen(args->output_fname,"w"); +++ if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname); +++ fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n"); +++ fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n"); +++ fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n"); +++ i = 0; +++ fprintf(args->fp_out,"# %d) filter id\n", ++i); +++ fprintf(args->fp_out,"# %d) child\n", ++i); +++ fprintf(args->fp_out,"# %d) father\n", ++i); +++ fprintf(args->fp_out,"# %d) mother\n", ++i); +++ fprintf(args->fp_out,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of DNMs/Mendelian errors\n", ++i); +++ fprintf(args->fp_out,"# %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transmitted trio singletons (one alternate allele present in one parent and the child)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not 
sites)\n", ++i); +++ fprintf(args->fp_out, "CMD\t%s", args->argv[0]); +++ for (i=1; iargc; i++) fprintf(args->fp_out, " %s",args->argv[i]); +++ fprintf(args->fp_out, "\n"); +++} +++static void alt_trios_reset(args_t *args, int nals) +++{ +++ int i; +++ hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios); +++ for (i=0; ialt_trios[i]; +++ if ( !tr->idx ) +++ { +++ tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio); +++ tr->sd_bset = kbs_init(args->ntrio); +++ } +++ else +++ kbs_clear(tr->sd_bset); +++ tr->nsd = 0; +++ tr->nalt = 0; +++ } +++} +++static void alt_trios_destroy(args_t *args) +++{ +++ if ( !args->max_alt_trios ) return; +++ int i; +++ for (i=0; imalt_trios; i++) +++ { +++ free(args->alt_trios[i].idx); +++ kbs_destroy(args->alt_trios[i].sd_bset); +++ } +++ free(args->alt_trios); +++} +++static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton) +++{ +++ alt_trios_t *tr = &args->alt_trios[ial]; +++ if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd); +++ tr->idx[ tr->nsd++ ] = itrio; ++ } ++ static void destroy_data(args_t *args) ++ { ++@@ -275,64 +389,47 @@ ++ for (i=0; inflt_str; i++) free(args->flt_str[i]); ++ free(args->flt_str); ++ bcf_sr_destroy(args->sr); +++ alt_trios_destroy(args); ++ free(args->trio); ++ free(args->ac); ++ free(args->ac_trio); ++ free(args->gt_arr); +++ free(args->dnm_als); +++ if ( fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "stdout" : args->output_fname); ++ free(args); ++ } ++ static void report_stats(args_t *args) ++ { ++ int i = 0,j; ++- FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? 
stdout : fopen(args->output_fname,"w"); ++- if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); ++- fprintf(fh,"# CMD line shows the command line used to generate this output\n"); ++- fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); ++- fprintf(fh,"# FLT* lines report numbers for every threshold and every trio:\n"); ++- fprintf(fh,"# %d) filter id\n", ++i); ++- fprintf(fh,"# %d) child\n", ++i); ++- fprintf(fh,"# %d) father\n", ++i); ++- fprintf(fh,"# %d) mother\n", ++i); ++- fprintf(fh,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); ++- fprintf(fh,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); ++- fprintf(fh,"# %d) number of Mendelian errors\n", ++i); ++- fprintf(fh,"# %d) number of novel singleton alleles in the child (counted also as a Mendelian error)\n", ++i); ++- fprintf(fh,"# %d) number of untransmitted singletons, present only in one parent\n", ++i); ++- fprintf(fh,"# %d) number of transmitted singletons, present only in one parent and the child\n", ++i); ++- fprintf(fh,"# %d) number of transitions, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh,"# %d) number of transversions, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh,"# %d) overall ts/tv, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh, "CMD\t%s", args->argv[0]); ++- for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); ++- fprintf(fh, "\n"); ++ for (i=0; infilters; i++) ++ { ++ flt_stats_t *flt = &args->filters[i]; ++- fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); +++ fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr); ++ } ++ for (i=0; infilters; i++) ++ { ++ flt_stats_t *flt = &args->filters[i]; ++ for (j=0; jntrio; j++) ++ { ++- fprintf(fh,"FLT%d", i); ++- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); ++- 
fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); ++- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); +++ fprintf(args->fp_out,"FLT%d", i); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); ++ trio_stats_t *stats = &flt->stats[j]; ++- fprintf(fh,"\t%d", stats->npass); ++- fprintf(fh,"\t%d", stats->nnon_ref); ++- fprintf(fh,"\t%d", stats->nmendel_err); ++- fprintf(fh,"\t%d", stats->nnovel); ++- fprintf(fh,"\t%d", stats->nsingleton); ++- fprintf(fh,"\t%d", stats->ndoubleton); ++- fprintf(fh,"\t%d", stats->nts); ++- fprintf(fh,"\t%d", stats->ntv); ++- fprintf(fh,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); ++- fprintf(fh,"\n"); +++ fprintf(args->fp_out,"\t%d", stats->npass); +++ fprintf(args->fp_out,"\t%d", stats->nnon_ref); +++ fprintf(args->fp_out,"\t%d", stats->nmendel_err); +++ fprintf(args->fp_out,"\t%d", stats->nnovel); +++ fprintf(args->fp_out,"\t%d", stats->nsingleton); +++ fprintf(args->fp_out,"\t%d", stats->ndoubleton); +++ fprintf(args->fp_out,"\t%d", stats->nts); +++ fprintf(args->fp_out,"\t%d", stats->ntv); +++ fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); +++ fprintf(args->fp_out,"\t%d", stats->ndnm_hom); +++ fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent); +++ fprintf(args->fp_out,"\n"); ++ } ++ } ++- if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"stdout" : args->output_fname); ++ } ++ ++ static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) ++@@ -406,6 +503,7 @@ ++ hts_expand(int, rec->n_allele, args->mac, args->ac); ++ if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return; ++ hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio); +++ hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als); ++ ++ // Get the genotypes ++ int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); ++@@ -420,6 +518,9 @@ ++ for (i=1; in_allele; i++) ++ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } ++ +++ // number of non-reference trios +++ if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele); +++ ++ // Run the stats ++ for (i=0; intrio; i++) ++ { ++@@ -441,8 +542,7 @@ ++ for (j=0; j<6; j++) ++ { ++ if ( als[j]==star_allele ) { has_star_allele = 1; continue; } ++- if ( als[j]==0 ) continue; ++- has_nonref = 1; +++ if ( als[j]!=0 ) has_nonref = 1; ++ args->ac_trio[ als[j] ]++; ++ } ++ if ( !has_nonref ) continue; // only ref or * in this trio ++@@ -457,7 +557,7 @@ ++ { ++ if ( als[j]==0 || als[j]==star_allele ) continue; ++ if ( als[j] >= rec->n_allele ) ++- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); +++ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); ++ if ( rec->d.allele[als[j]][1] ) continue; ++ ++ int alt = bcf_acgt2int(rec->d.allele[als[j]][0]); ++@@ -473,21 +573,111 @@ ++ if ( has_star_allele ) continue; ++ ++ // Detect mendelian errors ++- int mendel_ok = (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ? 
1 : 0; ++- if ( !mendel_ok ) mendel_ok = (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ? 1 : 0; ++- if ( !mendel_ok ) stats->nmendel_err++; +++ int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0; +++ int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0; +++ if ( !a0F || !a1M ) +++ { +++ int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0; +++ int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0; +++ if ( !a0M || !a1F ) +++ { +++ stats->nmendel_err++; +++ +++ int dnm_hom = 0; +++ if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; } +++ +++ int culprit; // neglecting the unlikely possibility of alt het 1/2 DNM genotype +++ if ( !a0F && !a0M ) culprit = als_child[0]; +++ else if ( !a1F && !a1M ) culprit = als_child[1]; +++ else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0]; +++ else culprit = als_child[1]; +++ +++ int dnm_recurrent = 0; +++ if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; } +++ +++ if ( args->verbose & VERBOSE_MENDEL ) +++ fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]], +++ dnm_hom ? "HOM" : "-", +++ dnm_recurrent ? "RECURRENT" : "-" +++ ); +++ } +++ } ++ ++ // Is this a singleton, doubleton, neither? 
++- for (j=1; jn_allele; j++) +++ for (j=0; jn_allele; j++) ++ { ++- if ( args->ac_trio[j]==1 && args->ac[j]==1 ) // singleton (in parent) or novel (in child) +++ if ( !args->ac_trio[j] ) continue; +++ if ( args->max_alt_trios ) args->alt_trios[j].nalt++; +++ +++ if ( args->ac_trio[j]==1 ) // singleton (in parent) or novel (in child) ++ { ++ if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++; ++- else stats->nsingleton++; +++ else +++ { +++ if ( !args->max_alt_trios ) +++ { +++ stats->nsingleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]] +++ ); +++ } +++ else alt_trios_add(args, i,j,1); +++ } +++ } +++ else if ( args->ac_trio[j]==2 ) // possibly a doubleton +++ { +++ if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue; +++ if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue; +++ if ( !args->max_alt_trios ) +++ { +++ stats->ndoubleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]] +++ ); +++ } +++ else alt_trios_add(args, i,j,0); ++ } ++- else if ( args->ac_trio[j]==2 && args->ac[j]==2 ) // possibly a doubleton +++ } +++ } +++ if ( args->max_alt_trios ) +++ { +++ for (j=0; jn_allele; j++) +++ { +++ alt_trios_t *tr = &args->alt_trios[j]; +++ if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue; +++ for (i=0; insd; i++) ++ { ++- if ( (als_child[0]==j || als_child[1]==j) && (als_child[0]!=j || als_child[1]!=j) ) 
stats->ndoubleton++; +++ int itr = tr->idx[i]; +++ trio_stats_t *stats = &flt->stats[itr]; +++ if ( kbs_exists(tr->sd_bset,i) ) +++ { +++ stats->nsingleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[itr].idx[iCHILD]], +++ args->hdr->samples[args->trio[itr].idx[iFATHER]], +++ args->hdr->samples[args->trio[itr].idx[iMOTHER]] +++ ); +++ } +++ else +++ { +++ stats->ndoubleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[itr].idx[iCHILD]], +++ args->hdr->samples[args->trio[itr].idx[iFATHER]], +++ args->hdr->samples[args->trio[itr].idx[iMOTHER]] +++ ); +++ } ++ } ++ } ++ } ++@@ -500,10 +690,13 @@ ++ args->output_fname = "-"; ++ static struct option loptions[] = ++ { +++ {"debug",required_argument,0,'d'}, +++ {"alt-trios",required_argument,0,'a'}, ++ {"include",required_argument,0,'i'}, ++ {"exclude",required_argument,0,'e'}, ++ {"output",required_argument,NULL,'o'}, ++ {"ped",required_argument,NULL,'p'}, +++ {"pfm",required_argument,NULL,'P'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++ {"targets",1,0,'t'}, ++@@ -511,10 +704,25 @@ ++ {NULL,0,NULL,0} ++ }; ++ int c, i; ++- while ((c = getopt_long(argc, argv, "p:o:s:i:e:r:R:t:T:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { +++ case 'd': +++ { +++ int n; +++ char **tmp = hts_readlist(optarg, 0, &n); +++ for(i=0; iverbose |= VERBOSE_MENDEL; +++ else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED; +++ else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]); +++ free(tmp[i]); +++ } +++ free(tmp); +++ break; +++ } +++ case 'a': args->max_alt_trios 
= atoi(optarg); break; ++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 't': args->targets = optarg; break; ++@@ -523,6 +731,7 @@ ++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'p': args->ped_fname = optarg; break; +++ case 'P': args->pfm = optarg; break; ++ case 'h': ++ case '?': ++ default: error("%s", usage_text()); break; ++@@ -536,7 +745,7 @@ ++ else if ( optind+1!=argc ) error("%s", usage_text()); ++ else args->fname = argv[optind]; ++ ++- if ( !args->ped_fname ) error("Missing the -p, --ped option\n"); +++ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); ++ ++ init_data(args); ++ ++--- python-pysam.orig/bcftools/plugins/trio-stats.c.pysam.c +++++ python-pysam/bcftools/plugins/trio-stats.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* The MIT License ++ ++- Copyright (c) 2018 Genome Research Ltd. +++ Copyright (c) 2018-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -28,14 +28,17 @@ ++ ++ #include ++ #include +++#include ++ #include ++ #include // for isatty +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ #include "filter.h" ++ ++@@ -48,6 +51,9 @@ ++ #define iFATHER 1 ++ #define iMOTHER 2 ++ +++#define VERBOSE_MENDEL 1 +++#define VERBOSE_TRANSMITTED 2 +++ ++ typedef struct ++ { ++ int idx[3]; // VCF sample index for father, mother and child ++@@ -60,11 +66,13 @@ ++ uint32_t ++ npass, // number of genotypes passing the filter ++ nnon_ref, // number of non-reference genotypes ++- nmendel_err, // number of mendelian errors +++ nmendel_err, // number of DNMs / mendelian errors ++ nnovel, // a singleton allele, but observed only in the child. Counted as mendel_err as well. 
++ nsingleton, // het mother or father different from everyone else ++- ndoubleton, // het mother+child or father+child different from everyone else ++- nts, ntv; // number of transitions and transversions +++ ndoubleton, // het mother+child or father+child different from everyone else (transmitted alleles) +++ nts, ntv, // number of transitions and transversions +++ ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen) +++ ndnm_hom; // number of homozygous DNMs / mendelian errors ++ } ++ trio_stats_t; ++ ++@@ -78,18 +86,33 @@ ++ ++ typedef struct ++ { +++ kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio? +++ uint32_t +++ nalt, // number of all alternate trios +++ nsd, // number of singleton or doubleton trios +++ *idx; // indexes of the singleton and doubleon trios +++} +++alt_trios_t; // for one alt allele +++ +++typedef struct +++{ +++ int max_alt_trios; // maximum number of alternate trios [1] +++ int malt_trios; +++ alt_trios_t *alt_trios; ++ int argc, filter_logic, regions_is_file, targets_is_file; ++ int nflt_str; ++ char *filter_str, **flt_str; ++- char **argv, *ped_fname, *output_fname, *fname, *regions, *targets; +++ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; ++ bcf_srs_t *sr; ++ bcf_hdr_t *hdr; ++ trio_t *trio; ++ int ntrio, mtrio; ++ flt_stats_t *filters; ++ int nfilters; ++- int32_t *gt_arr, *ac, *ac_trio; ++- int mgt_arr, mac, mac_trio; +++ int32_t *gt_arr, *ac, *ac_trio, *dnm_als; +++ int mgt_arr, mac, mac_trio, mdnm_als; +++ int verbose; +++ FILE *fp_out; ++ } ++ args_t; ++ ++@@ -108,10 +131,14 @@ ++ " a range of values simultaneously\n" ++ "Usage: bcftools +trio-stats [Plugin Options]\n" ++ "Plugin options:\n" +++ " -a, --alt-trios INT for transmission rate consider only sites with at most this\n" +++ " many alternate trios, 0 for unlimited [0]\n" +++ " -d, --debug TYPE comma-separted list of features: 
{mendel-errors,transmitted}\n" ++ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" ++ " -i, --include EXPR include sites and samples for which the expression is true\n" ++ " -o, --output FILE output file name [bcftools_stdout]\n" ++ " -p, --ped FILE PED file\n" +++ " -P, --pfm P,F,M sample names of proband, father, and mother\n" ++ " -r, --regions REG restrict to comma-separated list of regions\n" ++ " -R, --regions-file FILE restrict to regions listed in a file\n" ++ " -t, --targets REG similar to -r but streams rather than index-jumps\n" ++@@ -171,13 +198,14 @@ ++ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); ++ ++ fprintf(bcftools_stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); +++ if ( !args->ntrio ) error("No complete trio identified\n"); ++ ++ // sort the sample by index so that they are accessed more or less sequentially ++ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); ++ ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); ++ } ++ ++ static void parse_filters(args_t *args) ++@@ -233,7 +261,33 @@ ++ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); ++ args->hdr = bcf_sr_get_header(args->sr,0); ++ ++- parse_ped(args, args->ped_fname); +++ if ( args->ped_fname ) +++ parse_ped(args, args->ped_fname); +++ else +++ { +++ args->ntrio = 1; +++ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); +++ int ibeg, iend = 0; +++ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; +++ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); +++ args->pfm[iend] = 0; +++ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm); +++ if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm); +++ args->pfm[iend] = ','; +++ ibeg = ++iend; +++ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; +++ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); +++ args->pfm[iend] = 0; +++ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); +++ if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); +++ args->pfm[iend] = ','; +++ ibeg = ++iend; +++ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); +++ if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); +++ args->trio[0].idx[iFATHER] = father; +++ args->trio[0].idx[iMOTHER] = mother; +++ args->trio[0].idx[iCHILD] = child; +++ } ++ parse_filters(args); ++ ++ int i; ++@@ -263,6 +317,66 @@ ++ } ++ for (i=0; infilters; i++) ++ args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t)); +++ +++ args->fp_out = !args->output_fname || !strcmp("-",args->output_fname) ? 
bcftools_stdout : fopen(args->output_fname,"w"); +++ if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname); +++ fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n"); +++ fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n"); +++ fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n"); +++ i = 0; +++ fprintf(args->fp_out,"# %d) filter id\n", ++i); +++ fprintf(args->fp_out,"# %d) child\n", ++i); +++ fprintf(args->fp_out,"# %d) father\n", ++i); +++ fprintf(args->fp_out,"# %d) mother\n", ++i); +++ fprintf(args->fp_out,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of DNMs/Mendelian errors\n", ++i); +++ fprintf(args->fp_out,"# %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transmitted trio singletons (one alternate allele present in one parent and the child)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i); +++ fprintf(args->fp_out,"# %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i); +++ fprintf(args->fp_out,"# %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not sites)\n", ++i); +++ 
fprintf(args->fp_out, "CMD\t%s", args->argv[0]); +++ for (i=1; iargc; i++) fprintf(args->fp_out, " %s",args->argv[i]); +++ fprintf(args->fp_out, "\n"); +++} +++static void alt_trios_reset(args_t *args, int nals) +++{ +++ int i; +++ hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios); +++ for (i=0; ialt_trios[i]; +++ if ( !tr->idx ) +++ { +++ tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio); +++ tr->sd_bset = kbs_init(args->ntrio); +++ } +++ else +++ kbs_clear(tr->sd_bset); +++ tr->nsd = 0; +++ tr->nalt = 0; +++ } +++} +++static void alt_trios_destroy(args_t *args) +++{ +++ if ( !args->max_alt_trios ) return; +++ int i; +++ for (i=0; imalt_trios; i++) +++ { +++ free(args->alt_trios[i].idx); +++ kbs_destroy(args->alt_trios[i].sd_bset); +++ } +++ free(args->alt_trios); +++} +++static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton) +++{ +++ alt_trios_t *tr = &args->alt_trios[ial]; +++ if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd); +++ tr->idx[ tr->nsd++ ] = itrio; ++ } ++ static void destroy_data(args_t *args) ++ { ++@@ -277,64 +391,47 @@ ++ for (i=0; inflt_str; i++) free(args->flt_str[i]); ++ free(args->flt_str); ++ bcf_sr_destroy(args->sr); +++ alt_trios_destroy(args); ++ free(args->trio); ++ free(args->ac); ++ free(args->ac_trio); ++ free(args->gt_arr); +++ free(args->dnm_als); +++ if ( fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "bcftools_stdout" : args->output_fname); ++ free(args); ++ } ++ static void report_stats(args_t *args) ++ { ++ int i = 0,j; ++- FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? 
bcftools_stdout : fopen(args->output_fname,"w"); ++- if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); ++- fprintf(fh,"# CMD line shows the command line used to generate this output\n"); ++- fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); ++- fprintf(fh,"# FLT* lines report numbers for every threshold and every trio:\n"); ++- fprintf(fh,"# %d) filter id\n", ++i); ++- fprintf(fh,"# %d) child\n", ++i); ++- fprintf(fh,"# %d) father\n", ++i); ++- fprintf(fh,"# %d) mother\n", ++i); ++- fprintf(fh,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); ++- fprintf(fh,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); ++- fprintf(fh,"# %d) number of Mendelian errors\n", ++i); ++- fprintf(fh,"# %d) number of novel singleton alleles in the child (counted also as a Mendelian error)\n", ++i); ++- fprintf(fh,"# %d) number of untransmitted singletons, present only in one parent\n", ++i); ++- fprintf(fh,"# %d) number of transmitted singletons, present only in one parent and the child\n", ++i); ++- fprintf(fh,"# %d) number of transitions, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh,"# %d) number of transversions, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh,"# %d) overall ts/tv, all ALT alleles present in the trio are considered\n", ++i); ++- fprintf(fh, "CMD\t%s", args->argv[0]); ++- for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); ++- fprintf(fh, "\n"); ++ for (i=0; infilters; i++) ++ { ++ flt_stats_t *flt = &args->filters[i]; ++- fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); +++ fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr); ++ } ++ for (i=0; infilters; i++) ++ { ++ flt_stats_t *flt = &args->filters[i]; ++ for (j=0; jntrio; j++) ++ { ++- fprintf(fh,"FLT%d", i); ++- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); ++- 
fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); ++- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); +++ fprintf(args->fp_out,"FLT%d", i); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); +++ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); ++ trio_stats_t *stats = &flt->stats[j]; ++- fprintf(fh,"\t%d", stats->npass); ++- fprintf(fh,"\t%d", stats->nnon_ref); ++- fprintf(fh,"\t%d", stats->nmendel_err); ++- fprintf(fh,"\t%d", stats->nnovel); ++- fprintf(fh,"\t%d", stats->nsingleton); ++- fprintf(fh,"\t%d", stats->ndoubleton); ++- fprintf(fh,"\t%d", stats->nts); ++- fprintf(fh,"\t%d", stats->ntv); ++- fprintf(fh,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); ++- fprintf(fh,"\n"); +++ fprintf(args->fp_out,"\t%d", stats->npass); +++ fprintf(args->fp_out,"\t%d", stats->nnon_ref); +++ fprintf(args->fp_out,"\t%d", stats->nmendel_err); +++ fprintf(args->fp_out,"\t%d", stats->nnovel); +++ fprintf(args->fp_out,"\t%d", stats->nsingleton); +++ fprintf(args->fp_out,"\t%d", stats->ndoubleton); +++ fprintf(args->fp_out,"\t%d", stats->nts); +++ fprintf(args->fp_out,"\t%d", stats->ntv); +++ fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); +++ fprintf(args->fp_out,"\t%d", stats->ndnm_hom); +++ fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent); +++ fprintf(args->fp_out,"\n"); ++ } ++ } ++- if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"bcftools_stdout" : args->output_fname); ++ } ++ ++ static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) ++@@ -408,6 +505,7 @@ ++ hts_expand(int, rec->n_allele, args->mac, args->ac); ++ if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return; ++ hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio); +++ hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als); ++ ++ // Get the genotypes ++ int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); ++@@ -422,6 +520,9 @@ ++ for (i=1; in_allele; i++) ++ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } ++ +++ // number of non-reference trios +++ if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele); +++ ++ // Run the stats ++ for (i=0; intrio; i++) ++ { ++@@ -443,8 +544,7 @@ ++ for (j=0; j<6; j++) ++ { ++ if ( als[j]==star_allele ) { has_star_allele = 1; continue; } ++- if ( als[j]==0 ) continue; ++- has_nonref = 1; +++ if ( als[j]!=0 ) has_nonref = 1; ++ args->ac_trio[ als[j] ]++; ++ } ++ if ( !has_nonref ) continue; // only ref or * in this trio ++@@ -459,7 +559,7 @@ ++ { ++ if ( als[j]==0 || als[j]==star_allele ) continue; ++ if ( als[j] >= rec->n_allele ) ++- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); +++ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); ++ if ( rec->d.allele[als[j]][1] ) continue; ++ ++ int alt = bcf_acgt2int(rec->d.allele[als[j]][0]); ++@@ -475,21 +575,111 @@ ++ if ( has_star_allele ) continue; ++ ++ // Detect mendelian errors ++- int mendel_ok = (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ? 
1 : 0; ++- if ( !mendel_ok ) mendel_ok = (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ? 1 : 0; ++- if ( !mendel_ok ) stats->nmendel_err++; +++ int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0; +++ int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0; +++ if ( !a0F || !a1M ) +++ { +++ int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0; +++ int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0; +++ if ( !a0M || !a1F ) +++ { +++ stats->nmendel_err++; +++ +++ int dnm_hom = 0; +++ if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; } +++ +++ int culprit; // neglecting the unlikely possibility of alt het 1/2 DNM genotype +++ if ( !a0F && !a0M ) culprit = als_child[0]; +++ else if ( !a1F && !a1M ) culprit = als_child[1]; +++ else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0]; +++ else culprit = als_child[1]; +++ +++ int dnm_recurrent = 0; +++ if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; } +++ +++ if ( args->verbose & VERBOSE_MENDEL ) +++ fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]], +++ dnm_hom ? "HOM" : "-", +++ dnm_recurrent ? "RECURRENT" : "-" +++ ); +++ } +++ } ++ ++ // Is this a singleton, doubleton, neither? 
++- for (j=1; jn_allele; j++) +++ for (j=0; jn_allele; j++) ++ { ++- if ( args->ac_trio[j]==1 && args->ac[j]==1 ) // singleton (in parent) or novel (in child) +++ if ( !args->ac_trio[j] ) continue; +++ if ( args->max_alt_trios ) args->alt_trios[j].nalt++; +++ +++ if ( args->ac_trio[j]==1 ) // singleton (in parent) or novel (in child) ++ { ++ if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++; ++- else stats->nsingleton++; +++ else +++ { +++ if ( !args->max_alt_trios ) +++ { +++ stats->nsingleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]] +++ ); +++ } +++ else alt_trios_add(args, i,j,1); +++ } +++ } +++ else if ( args->ac_trio[j]==2 ) // possibly a doubleton +++ { +++ if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue; +++ if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue; +++ if ( !args->max_alt_trios ) +++ { +++ stats->ndoubleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[i].idx[iCHILD]], +++ args->hdr->samples[args->trio[i].idx[iFATHER]], +++ args->hdr->samples[args->trio[i].idx[iMOTHER]] +++ ); +++ } +++ else alt_trios_add(args, i,j,0); ++ } ++- else if ( args->ac_trio[j]==2 && args->ac[j]==2 ) // possibly a doubleton +++ } +++ } +++ if ( args->max_alt_trios ) +++ { +++ for (j=0; jn_allele; j++) +++ { +++ alt_trios_t *tr = &args->alt_trios[j]; +++ if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue; +++ for (i=0; insd; i++) ++ { ++- if ( (als_child[0]==j || als_child[1]==j) && (als_child[0]!=j || als_child[1]!=j) ) 
stats->ndoubleton++; +++ int itr = tr->idx[i]; +++ trio_stats_t *stats = &flt->stats[itr]; +++ if ( kbs_exists(tr->sd_bset,i) ) +++ { +++ stats->nsingleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[itr].idx[iCHILD]], +++ args->hdr->samples[args->trio[itr].idx[iFATHER]], +++ args->hdr->samples[args->trio[itr].idx[iMOTHER]] +++ ); +++ } +++ else +++ { +++ stats->ndoubleton++; +++ if ( args->verbose & VERBOSE_TRANSMITTED ) +++ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, +++ args->hdr->samples[args->trio[itr].idx[iCHILD]], +++ args->hdr->samples[args->trio[itr].idx[iFATHER]], +++ args->hdr->samples[args->trio[itr].idx[iMOTHER]] +++ ); +++ } ++ } ++ } ++ } ++@@ -502,10 +692,13 @@ ++ args->output_fname = "-"; ++ static struct option loptions[] = ++ { +++ {"debug",required_argument,0,'d'}, +++ {"alt-trios",required_argument,0,'a'}, ++ {"include",required_argument,0,'i'}, ++ {"exclude",required_argument,0,'e'}, ++ {"output",required_argument,NULL,'o'}, ++ {"ped",required_argument,NULL,'p'}, +++ {"pfm",required_argument,NULL,'P'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++ {"targets",1,0,'t'}, ++@@ -513,10 +706,25 @@ ++ {NULL,0,NULL,0} ++ }; ++ int c, i; ++- while ((c = getopt_long(argc, argv, "p:o:s:i:e:r:R:t:T:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { +++ case 'd': +++ { +++ int n; +++ char **tmp = hts_readlist(optarg, 0, &n); +++ for(i=0; iverbose |= VERBOSE_MENDEL; +++ else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED; +++ else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]); +++ free(tmp[i]); +++ } +++ free(tmp); +++ break; +++ } +++ case 'a': args->max_alt_trios 
= atoi(optarg); break; ++ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 't': args->targets = optarg; break; ++@@ -525,6 +733,7 @@ ++ case 'R': args->regions = optarg; args->regions_is_file = 1; break; ++ case 'o': args->output_fname = optarg; break; ++ case 'p': args->ped_fname = optarg; break; +++ case 'P': args->pfm = optarg; break; ++ case 'h': ++ case '?': ++ default: error("%s", usage_text()); break; ++@@ -538,7 +747,7 @@ ++ else if ( optind+1!=argc ) error("%s", usage_text()); ++ else args->fname = argv[optind]; ++ ++- if ( !args->ped_fname ) error("Missing the -p, --ped option\n"); +++ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); ++ ++ init_data(args); ++ ++--- python-pysam.orig/bcftools/plugins/trio-switch-rate.c +++++ python-pysam/bcftools/plugins/trio-switch-rate.c ++@@ -141,7 +141,7 @@ ++ khash_str2int_destroy(pop2i); ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++--- python-pysam.orig/bcftools/plugins/trio-switch-rate.c.pysam.c +++++ python-pysam/bcftools/plugins/trio-switch-rate.c.pysam.c ++@@ -143,7 +143,7 @@ ++ khash_str2int_destroy(pop2i); ++ free(str.s); ++ free(off); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); ++ } ++ ++ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ++--- /dev/null +++++ python-pysam/bcftools/plugins/variantkey-hex.c ++@@ -0,0 +1,136 @@ +++/* plugins/variantkey-hex.c -- Generate unsorted VariantKey lookup tables files in hexadecimal format. +++ +++ Copyright (C) 2017-2018 GENOMICS plc. 
+++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. 
*/ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../variantkey.h" +++ +++const char *FILE_VKRS = "vkrs.unsorted.hex"; +++const char *FILE_RSVK = "rsvk.unsorted.hex"; +++const char *FILE_NRVK = "nrvk.unsorted.tsv"; +++ +++FILE *fp_vkrs; // VariantKey -> rsID +++FILE *fp_rsvk; // rsID -> VariantKey +++FILE *fp_nrvk; // VariantKey non-reversible map (maps VariantKey to REF and ALT) +++ +++static uint64_t numvar; // number of variants +++static uint64_t nrv; // number of non-reversible variants +++ +++bcf_hdr_t *in_hdr; +++ +++const char *about(void) +++{ +++ return "Generate VariantKey index files\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Generate unsorted VariantKey lookup tables files in hexadecimal format.\n" +++ "Usage: bcftools +variantkey-hex [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +variantkey-hex in.vcf\n" +++ "\n"; +++} +++ +++// Called once at startup, allows to initialize local variables. +++// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. +++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ in_hdr = in; +++ numvar = 0; +++ char path[1024]; +++ char dir[1024] = "./"; +++ if (argc > 1) +++ { +++ strcpy(dir, argv[1]); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_VKRS); +++ fp_vkrs = fopen(path, "w"); +++ if (!fp_vkrs) +++ { +++ fprintf(stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_RSVK); +++ fp_rsvk = fopen(path, "w"); +++ if (!fp_rsvk) +++ { +++ fprintf(stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_NRVK); +++ fp_nrvk = fopen(path, "w"); +++ if (!fp_nrvk) +++ { +++ fprintf(stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ return 1; +++} +++ +++// Called for each VCF record. 
Return rec to output the line or NULL to suppress output. +++bcf1_t *process(bcf1_t *rec) +++{ +++ int len_ref = strlen(rec->d.allele[0]); +++ int len_alt = strlen(rec->d.allele[1]); +++ uint64_t vk = variantkey( +++ in_hdr->id[BCF_DT_CTG][rec->rid].key, +++ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), +++ rec->pos, +++ rec->d.allele[0], +++ len_ref, +++ rec->d.allele[1], +++ len_alt); +++ char *ptr = rec->d.id; +++ ptr += 2; // remove 'rs' +++ uint32_t rs = (uint32_t)strtoul(ptr, NULL, 10); +++ fprintf(fp_vkrs, "%016" PRIx64 "\t%08" PRIx32 "\n", vk, rs); // map VariantKey to rsID +++ fprintf(fp_rsvk, "%08" PRIx32 "\t%016" PRIx64 "\n", rs, vk); // map rsID to VariantKey +++ if (vk & 1) +++ { +++ // map VariantKey to REF and ALT +++ fprintf(fp_nrvk, "%016" PRIx64 "\t%s\t%s\n", vk, rec->d.allele[0], rec->d.allele[1]); +++ nrv++; +++ } +++ numvar++; +++ return NULL; +++} +++ +++void destroy(void) +++{ +++ fclose(fp_vkrs); +++ fclose(fp_rsvk); +++ printf("VariantKeys: %" PRIu64 "\n", numvar); +++ printf("Non-reversible VariantKeys: %" PRIu64 "\n", nrv); +++} ++--- /dev/null +++++ python-pysam/bcftools/plugins/variantkey-hex.c.pysam.c ++@@ -0,0 +1,138 @@ +++#include "bcftools.pysam.h" +++ +++/* plugins/variantkey-hex.c -- Generate unsorted VariantKey lookup tables files in hexadecimal format. +++ +++ Copyright (C) 2017-2018 GENOMICS plc. +++ +++ Author: Nicola Asuni +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. 
+++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "../variantkey.h" +++ +++const char *FILE_VKRS = "vkrs.unsorted.hex"; +++const char *FILE_RSVK = "rsvk.unsorted.hex"; +++const char *FILE_NRVK = "nrvk.unsorted.tsv"; +++ +++FILE *fp_vkrs; // VariantKey -> rsID +++FILE *fp_rsvk; // rsID -> VariantKey +++FILE *fp_nrvk; // VariantKey non-reversible map (maps VariantKey to REF and ALT) +++ +++static uint64_t numvar; // number of variants +++static uint64_t nrv; // number of non-reversible variants +++ +++bcf_hdr_t *in_hdr; +++ +++const char *about(void) +++{ +++ return "Generate VariantKey index files\n"; +++} +++ +++const char *usage(void) +++{ +++ return +++ "\n" +++ "About: Generate unsorted VariantKey lookup tables files in hexadecimal format.\n" +++ "Usage: bcftools +variantkey-hex [General Options] \n" +++ "Options:\n" +++ " run \"bcftools plugin\" for a list of common options\n" +++ "\n" +++ "Example:\n" +++ " bcftools +variantkey-hex in.vcf\n" +++ "\n"; +++} +++ +++// Called once at startup, allows to initialize local variables. +++// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. 
+++int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) +++{ +++ in_hdr = in; +++ numvar = 0; +++ char path[1024]; +++ char dir[1024] = "./"; +++ if (argc > 1) +++ { +++ strcpy(dir, argv[1]); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_VKRS); +++ fp_vkrs = fopen(path, "w"); +++ if (!fp_vkrs) +++ { +++ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_RSVK); +++ fp_rsvk = fopen(path, "w"); +++ if (!fp_rsvk) +++ { +++ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ strcpy(path, dir); +++ strcat(path, FILE_NRVK); +++ fp_nrvk = fopen(path, "w"); +++ if (!fp_nrvk) +++ { +++ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); +++ } +++ return 1; +++} +++ +++// Called for each VCF record. Return rec to output the line or NULL to suppress output. +++bcf1_t *process(bcf1_t *rec) +++{ +++ int len_ref = strlen(rec->d.allele[0]); +++ int len_alt = strlen(rec->d.allele[1]); +++ uint64_t vk = variantkey( +++ in_hdr->id[BCF_DT_CTG][rec->rid].key, +++ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), +++ rec->pos, +++ rec->d.allele[0], +++ len_ref, +++ rec->d.allele[1], +++ len_alt); +++ char *ptr = rec->d.id; +++ ptr += 2; // remove 'rs' +++ uint32_t rs = (uint32_t)strtoul(ptr, NULL, 10); +++ fprintf(fp_vkrs, "%016" PRIx64 "\t%08" PRIx32 "\n", vk, rs); // map VariantKey to rsID +++ fprintf(fp_rsvk, "%08" PRIx32 "\t%016" PRIx64 "\n", rs, vk); // map rsID to VariantKey +++ if (vk & 1) +++ { +++ // map VariantKey to REF and ALT +++ fprintf(fp_nrvk, "%016" PRIx64 "\t%s\t%s\n", vk, rec->d.allele[0], rec->d.allele[1]); +++ nrv++; +++ } +++ numvar++; +++ return NULL; +++} +++ +++void destroy(void) +++{ +++ fclose(fp_vkrs); +++ fclose(fp_rsvk); +++ fprintf(bcftools_stdout, "VariantKeys: %" PRIu64 "\n", numvar); +++ fprintf(bcftools_stdout, "Non-reversible VariantKeys: %" PRIu64 "\n", nrv); +++} ++--- python-pysam.orig/bcftools/regidx.c +++++ python-pysam/bcftools/regidx.c ++@@ 
-262,7 +262,11 @@ ++ } ++ ++ free(str.s); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) +++ { +++ fprintf(stderr,"[%s] Error: close failed .. %s\n", __func__,fname); +++ goto error; +++ } ++ return idx; ++ ++ error: ++@@ -392,12 +396,11 @@ ++ { ++ int iend = iBIN(end); ++ if ( iend > list->nidx ) iend = list->nidx; ++- for (i=ibeg; iidx[i] ) break; ++- if ( i==iend ) return 0; +++ if ( i>iend ) return 0; ++ i = list->idx[i]; ++ } ++- ++ for (ireg=i-1; iregnreg; ireg++) ++ { ++ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region ++--- python-pysam.orig/bcftools/regidx.c.pysam.c +++++ python-pysam/bcftools/regidx.c.pysam.c ++@@ -264,7 +264,11 @@ ++ } ++ ++ free(str.s); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) +++ { +++ fprintf(bcftools_stderr,"[%s] Error: close failed .. %s\n", __func__,fname); +++ goto error; +++ } ++ return idx; ++ ++ error: ++@@ -394,12 +398,11 @@ ++ { ++ int iend = iBIN(end); ++ if ( iend > list->nidx ) iend = list->nidx; ++- for (i=ibeg; iidx[i] ) break; ++- if ( i==iend ) return 0; +++ if ( i>iend ) return 0; ++ i = list->idx[i]; ++ } ++- ++ for (ireg=i-1; iregnreg; ireg++) ++ { ++ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region ++--- python-pysam.orig/bcftools/regidx.h +++++ python-pysam/bcftools/regidx.h ++@@ -33,14 +33,14 @@ ++ // and for working example see test/test-regidx.c. 
++ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); ++ ++- // Query overlap with chr:from-to +++ // Query overlap with chr:beg-end (beg,end are 1-based coordinates) ++ regitr_t *itr = regitr_init(idx); ++- if ( regidx_overlap(idx, chr,from,to, itr) ) printf("There is an overlap!\n"); +++ if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); ++ ++ while ( regitr_overlap(itr) ) ++ { ++- printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, ++- itr->beg, itr->end, regitr_payload(itr,char*)); +++ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end, +++ itr->beg+1, itr->end+1, regitr_payload(itr,char*)); ++ } ++ ++ regidx_destroy(idx); ++@@ -53,7 +53,7 @@ ++ regitr_t *itr = regitr_init(idx); ++ ++ while ( regitr_loop(itr) ) ++- printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg, itr->end); +++ printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg+1, itr->end+1); ++ ++ regidx_destroy(idx); ++ regitr_destroy(itr); ++--- python-pysam.orig/bcftools/reheader.c +++++ python-pysam/bcftools/reheader.c ++@@ -33,17 +33,23 @@ ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#endif ++ #include ++ #include ++ #include // for hts_get_bgzfp() ++ #include ++ #include +++#include +++#include ++ #include "bcftools.h" ++ #include "khash_str2str.h" ++ ++ typedef struct _args_t ++ { ++ char **argv, *fname, *samples_fname, *header_fname, *output_fname; +++ char *fai_fname, *rm_tmpfile; ++ htsFile *fp; ++ htsFormat type; ++ htsThreadPool *threads; ++@@ -51,6 +57,158 @@ ++ } ++ args_t; ++ +++static inline int is_escaped(const char *min, const char *str) +++{ +++ int n = 0; +++ while ( --str>=min && *str=='\\' ) n++; +++ return n%2; +++} +++static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_seen, kstring_t *dst) +++{ +++ kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; +++ char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= +++ char *end = q; +++ int nopen 
= 1, chr_len = 0; +++ while ( *end && *end!='\n' ) end++; +++ while ( *q && *q!='\n' && nopen>0 ) +++ { +++ p = ++q; +++ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } +++ // ^[A-Za-z_][0-9A-Za-z_.]*$ +++ if (p==q && *q && (isalpha(*q) || *q=='_')) +++ { +++ q++; +++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; +++ } +++ int n = q-p; +++ int m = 0; +++ while ( *q && (*q==' ' || *q=='\t') ) { q++; m++; } +++ if ( *q!='=' || !n ) +++ { +++ char *x = q; +++ while ( *x && *x!='\n' ) x++; +++ *x = '\0'; +++ error("Could not parse the line: %s [%s][%s]\n", line,p,q); +++ } +++ key.l = 0; +++ kputsn(p,q-p-m,&key); +++ p = ++q; +++ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } +++ int quoted = *p=='"' ? 1 : 0; +++ if ( quoted ) p++, q++; +++ while ( *q && *q != '\n' ) +++ { +++ if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } +++ else +++ { +++ if ( *q=='<' ) nopen++; +++ if ( *q=='>' ) nopen--; +++ if ( !nopen ) break; +++ if ( *q==',' && nopen==1 ) break; +++ } +++ q++; +++ } +++ char *r = q; +++ while ( r > p && r[-1] == ' ' ) r--; +++ val.l = 0; +++ kputsn(p,r-p,&val); +++ if ( quoted && *q=='"' ) q++; +++ if ( *q=='>' ) { nopen--; q++; } +++ if ( !strcmp("length",key.s) ) continue; +++ if ( !strcmp("ID",key.s) ) +++ { +++ if ( khash_str2int_has_key(chr_seen,val.s) ) continue; +++ chr_len = faidx_seq_len(fai, val.s); +++ if ( chr_len==-1 ) +++ { +++ free(val.s); free(key.s); free(tmp.s); +++ return end; // the sequence is not in fai, remove +++ } +++ chr_name = strdup(val.s); +++ khash_str2int_inc(chr_seen, chr_name); +++ continue; +++ } +++ kputc(',',&tmp); +++ kputs(key.s,&tmp); +++ kputc('=',&tmp); +++ if ( quoted ) kputc('"',&tmp); +++ kputs(val.s,&tmp); +++ if ( quoted ) kputc('"',&tmp); +++ } +++ if ( !chr_name ) return end; +++ ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? 
tmp.s : ""); +++ free(key.s); free(val.s); free(tmp.s); +++ return q; +++} +++static void update_from_fai(args_t *args) +++{ +++ if ( !strcmp("-",args->fname) ) +++ error("Cannot use the --fai option when reading from standard input.\n"); +++ +++ faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); +++ if ( !fai ) error("Could not parse %s\n", args->fai_fname); +++#ifdef _WIN32 +++ char tmp_path[MAX_PATH]; +++ int ret = GetTempPath(MAX_PATH, tmp_path); +++ if (!ret || ret > MAX_PATH) +++ error("Could not get the path to the temporary folder\n"); +++ if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) +++ error("Full path to the temporary folder is too long\n"); +++ strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); +++ args->rm_tmpfile = strdup(tmp_path); +++#else +++ args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX"); +++#endif +++ int fd = mkstemp(args->rm_tmpfile); +++ if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile); +++ +++ // get a template header: either from the original VCF or from --header +++ char *ori_hdr_fname = args->header_fname ? 
args->header_fname : args->fname; +++ htsFile *fp = hts_open(ori_hdr_fname,"r"); +++ if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); +++ if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname); +++ hts_close(fp); // no need to check the return status here +++ +++ // put the header in a text buffer +++ kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0}; +++ bcf_hdr_format(hdr, 0, &hdr_txt_ori); +++ bcf_hdr_destroy(hdr); +++ +++ // update the existing contig lines and remove lines not present in the fai file +++ void *chr_seen = khash_str2int_init(); +++ char *tmp, *beg = hdr_txt_ori.s; +++ while ( beg && *beg ) +++ { +++ tmp = strstr(beg, "\n##contig=<"); +++ if ( !tmp ) break; +++ kputsn(beg, tmp-beg+1, &hdr_txt_new); +++ size_t l_prev = hdr_txt_new.l; +++ beg = copy_and_update_contig_line(fai,tmp+1,chr_seen, &hdr_txt_new); +++ if ( l_prev==hdr_txt_new.l ) hdr_txt_new.l--; // nothing was added, remove the newline +++ } +++ if ( !beg || !(tmp=strstr(beg,"\n#CHROM")) ) error("Failed to parse the header, #CHROM not found\n"); +++ kputsn(beg, tmp-beg+1, &hdr_txt_new); +++ +++ // add any new contig lines +++ int i, n = faidx_nseq(fai); +++ for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); +++ } +++ kputs(tmp+1,&hdr_txt_new); +++ +++ if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile); +++ if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile); +++ args->header_fname = args->rm_tmpfile; +++ +++ free(hdr_txt_ori.s); +++ free(hdr_txt_new.s); +++ fai_destroy(fai); +++ khash_str2int_destroy_free(chr_seen); +++} +++ ++ static void read_header_file(char *fname, kstring_t *hdr) ++ { ++ kstring_t tmp = {0,0,0}; ++@@ -313,8 +471,8 @@ ++ kputc('\n',&fp->line); ++ if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); ++ } ++- 
hts_close(fp); ++- close(out); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); +++ if ( close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ } ++ ++ static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) ++@@ -346,12 +504,14 @@ ++ if ( j>=0 ) ++ { ++ j = atoi(src_hrec->vals[j]); ++- hrec_add_idx(tmp, j); +++ if (hrec_add_idx(tmp, j) < 0) +++ error_errno("[%s] Failed to add IDX header", __func__); ++ } ++ bcf_hdr_add_hrec(out, tmp); ++ } ++ } ++- bcf_hdr_sync(out); +++ if (bcf_hdr_sync(out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ for (i=0; inhrec; i++) ++ { ++ // finally add new structured fields ++@@ -375,11 +535,10 @@ ++ ++ if ( args->n_threads > 0 ) ++ { ++- args->threads = calloc(1, sizeof(*args->threads)); +++ args->threads = (htsThreadPool *) calloc(1, sizeof(htsThreadPool)); ++ if ( !args->threads ) error("Could not allocate memory\n"); ++ if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n"); ++- BGZF *bgzf = hts_get_bgzfp(fp); ++- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); +++ hts_set_thread_pool(fp, args->threads); ++ } ++ ++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); ++@@ -410,11 +569,8 @@ ++ htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); ++ if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno)); ++ if ( args->threads ) ++- { ++- BGZF *bgzf = hts_get_bgzfp(fp_out); ++- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); ++- } ++- bcf_hdr_write(fp_out, hdr_out); +++ hts_set_thread_pool(fp_out, args->threads); +++ if ( bcf_hdr_write(fp_out, hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname ? 
args->output_fname : "standard output"); ++ ++ bcf1_t *rec = bcf_init(); ++ while ( bcf_read(fp, hdr, rec)==0 ) ++@@ -459,13 +615,13 @@ ++ if ( i!=rec->n_fmt ) ++ error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); ++ ++- bcf_write(fp_out,hdr_out,rec); +++ if ( bcf_write(fp_out,hdr_out,rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname ? args->output_fname : "standard output"); ++ } ++ bcf_destroy(rec); ++ ++ free(htxt.s); ++- hts_close(fp_out); ++- hts_close(fp); +++ if ( hts_close(fp_out)!=0 ) error("[%s] Error: failed to close the file %s\n",__func__,args->output_fname ? args->output_fname : "standard output"); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); ++ bcf_hdr_destroy(hdr_out); ++ bcf_hdr_destroy(hdr); ++ if ( args->threads ) ++@@ -483,10 +639,21 @@ ++ fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Options:\n"); +++ fprintf(stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); ++ fprintf(stderr, " -h, --header new header\n"); ++ fprintf(stderr, " -o, --output write output to a file [standard output]\n"); ++ fprintf(stderr, " -s, --samples new sample names\n"); ++- fprintf(stderr, " --threads number of extra compression threads (BCF only) [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); +++ fprintf(stderr, "\n"); +++ fprintf(stderr, "Example:\n"); +++ fprintf(stderr, " # Write out the header to be modified\n"); +++ fprintf(stderr, " bcftools view -h old.bcf > header.txt\n"); +++ fprintf(stderr, "\n"); +++ fprintf(stderr, " # Edit the header using your favorite text editor\n"); +++ fprintf(stderr, " vi header.txt\n"); +++ fprintf(stderr, "\n"); +++ fprintf(stderr, " # Reheader the file\n"); +++ fprintf(stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++ 
} ++@@ -499,21 +666,23 @@ ++ ++ static struct option loptions[] = ++ { +++ {"fai",1,0,'f'}, ++ {"output",1,0,'o'}, ++ {"header",1,0,'h'}, ++ {"samples",1,0,'s'}, ++ {"threads",1,NULL,1}, ++ {0,0,0,0} ++ }; ++- while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 1 : args->n_threads = strtol(optarg, 0, 0); break; +++ case 'f': args->fai_fname = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 's': args->samples_fname = optarg; break; ++ case 'h': args->header_fname = optarg; break; ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -525,11 +694,12 @@ ++ } ++ else args->fname = argv[optind]; ++ +++ if ( args->fai_fname ) update_from_fai(args); ++ if ( !args->samples_fname && !args->header_fname ) usage(args); ++ if ( !args->fname ) usage(args); ++ ++ args->fp = hts_open(args->fname,"r"); ++- if ( !args->fp ) error("Failed to open: %s\n", args->fname); +++ if ( !args->fp ) error("Failed to read from %s\n", !strcmp("-",args->fname)?"standard input":args->fname); ++ args->type = *hts_get_format(args->fp); ++ ++ if ( args->type.format==vcf ) ++@@ -542,6 +712,11 @@ ++ else ++ reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); ++ +++ if ( args->rm_tmpfile ) +++ { +++ unlink(args->rm_tmpfile); +++ free(args->rm_tmpfile); +++ } ++ free(args); ++ return 0; ++ } ++--- python-pysam.orig/bcftools/reheader.c.pysam.c +++++ python-pysam/bcftools/reheader.c.pysam.c ++@@ -35,17 +35,23 @@ ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#endif ++ #include ++ #include ++ #include // for hts_get_bgzfp() ++ #include ++ #include +++#include +++#include ++ #include "bcftools.h" ++ #include "khash_str2str.h" ++ ++ typedef struct _args_t ++ { ++ char **argv, *fname, *samples_fname, *header_fname, *output_fname; +++ char 
*fai_fname, *rm_tmpfile; ++ htsFile *fp; ++ htsFormat type; ++ htsThreadPool *threads; ++@@ -53,6 +59,158 @@ ++ } ++ args_t; ++ +++static inline int is_escaped(const char *min, const char *str) +++{ +++ int n = 0; +++ while ( --str>=min && *str=='\\' ) n++; +++ return n%2; +++} +++static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_seen, kstring_t *dst) +++{ +++ kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; +++ char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= +++ char *end = q; +++ int nopen = 1, chr_len = 0; +++ while ( *end && *end!='\n' ) end++; +++ while ( *q && *q!='\n' && nopen>0 ) +++ { +++ p = ++q; +++ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } +++ // ^[A-Za-z_][0-9A-Za-z_.]*$ +++ if (p==q && *q && (isalpha(*q) || *q=='_')) +++ { +++ q++; +++ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; +++ } +++ int n = q-p; +++ int m = 0; +++ while ( *q && (*q==' ' || *q=='\t') ) { q++; m++; } +++ if ( *q!='=' || !n ) +++ { +++ char *x = q; +++ while ( *x && *x!='\n' ) x++; +++ *x = '\0'; +++ error("Could not parse the line: %s [%s][%s]\n", line,p,q); +++ } +++ key.l = 0; +++ kputsn(p,q-p-m,&key); +++ p = ++q; +++ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } +++ int quoted = *p=='"' ? 
1 : 0; +++ if ( quoted ) p++, q++; +++ while ( *q && *q != '\n' ) +++ { +++ if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } +++ else +++ { +++ if ( *q=='<' ) nopen++; +++ if ( *q=='>' ) nopen--; +++ if ( !nopen ) break; +++ if ( *q==',' && nopen==1 ) break; +++ } +++ q++; +++ } +++ char *r = q; +++ while ( r > p && r[-1] == ' ' ) r--; +++ val.l = 0; +++ kputsn(p,r-p,&val); +++ if ( quoted && *q=='"' ) q++; +++ if ( *q=='>' ) { nopen--; q++; } +++ if ( !strcmp("length",key.s) ) continue; +++ if ( !strcmp("ID",key.s) ) +++ { +++ if ( khash_str2int_has_key(chr_seen,val.s) ) continue; +++ chr_len = faidx_seq_len(fai, val.s); +++ if ( chr_len==-1 ) +++ { +++ free(val.s); free(key.s); free(tmp.s); +++ return end; // the sequence is not in fai, remove +++ } +++ chr_name = strdup(val.s); +++ khash_str2int_inc(chr_seen, chr_name); +++ continue; +++ } +++ kputc(',',&tmp); +++ kputs(key.s,&tmp); +++ kputc('=',&tmp); +++ if ( quoted ) kputc('"',&tmp); +++ kputs(val.s,&tmp); +++ if ( quoted ) kputc('"',&tmp); +++ } +++ if ( !chr_name ) return end; +++ ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? 
tmp.s : ""); +++ free(key.s); free(val.s); free(tmp.s); +++ return q; +++} +++static void update_from_fai(args_t *args) +++{ +++ if ( !strcmp("-",args->fname) ) +++ error("Cannot use the --fai option when reading from standard input.\n"); +++ +++ faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); +++ if ( !fai ) error("Could not parse %s\n", args->fai_fname); +++#ifdef _WIN32 +++ char tmp_path[MAX_PATH]; +++ int ret = GetTempPath(MAX_PATH, tmp_path); +++ if (!ret || ret > MAX_PATH) +++ error("Could not get the path to the temporary folder\n"); +++ if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) +++ error("Full path to the temporary folder is too long\n"); +++ strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); +++ args->rm_tmpfile = strdup(tmp_path); +++#else +++ args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX"); +++#endif +++ int fd = mkstemp(args->rm_tmpfile); +++ if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile); +++ +++ // get a template header: either from the original VCF or from --header +++ char *ori_hdr_fname = args->header_fname ? 
args->header_fname : args->fname; +++ htsFile *fp = hts_open(ori_hdr_fname,"r"); +++ if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); +++ if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname); +++ hts_close(fp); // no need to check the return status here +++ +++ // put the header in a text buffer +++ kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0}; +++ bcf_hdr_format(hdr, 0, &hdr_txt_ori); +++ bcf_hdr_destroy(hdr); +++ +++ // update the existing contig lines and remove lines not present in the fai file +++ void *chr_seen = khash_str2int_init(); +++ char *tmp, *beg = hdr_txt_ori.s; +++ while ( beg && *beg ) +++ { +++ tmp = strstr(beg, "\n##contig=<"); +++ if ( !tmp ) break; +++ kputsn(beg, tmp-beg+1, &hdr_txt_new); +++ size_t l_prev = hdr_txt_new.l; +++ beg = copy_and_update_contig_line(fai,tmp+1,chr_seen, &hdr_txt_new); +++ if ( l_prev==hdr_txt_new.l ) hdr_txt_new.l--; // nothing was added, remove the newline +++ } +++ if ( !beg || !(tmp=strstr(beg,"\n#CHROM")) ) error("Failed to parse the header, #CHROM not found\n"); +++ kputsn(beg, tmp-beg+1, &hdr_txt_new); +++ +++ // add any new contig lines +++ int i, n = faidx_nseq(fai); +++ for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); +++ } +++ kputs(tmp+1,&hdr_txt_new); +++ +++ if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile); +++ if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile); +++ args->header_fname = args->rm_tmpfile; +++ +++ free(hdr_txt_ori.s); +++ free(hdr_txt_new.s); +++ fai_destroy(fai); +++ khash_str2int_destroy_free(chr_seen); +++} +++ ++ static void read_header_file(char *fname, kstring_t *hdr) ++ { ++ kstring_t tmp = {0,0,0}; ++@@ -315,8 +473,8 @@ ++ kputc('\n',&fp->line); ++ if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); ++ } ++- 
hts_close(fp); ++- close(out); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); +++ if ( close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ } ++ ++ static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) ++@@ -348,12 +506,14 @@ ++ if ( j>=0 ) ++ { ++ j = atoi(src_hrec->vals[j]); ++- hrec_add_idx(tmp, j); +++ if (hrec_add_idx(tmp, j) < 0) +++ error_errno("[%s] Failed to add IDX header", __func__); ++ } ++ bcf_hdr_add_hrec(out, tmp); ++ } ++ } ++- bcf_hdr_sync(out); +++ if (bcf_hdr_sync(out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ for (i=0; inhrec; i++) ++ { ++ // finally add new structured fields ++@@ -377,11 +537,10 @@ ++ ++ if ( args->n_threads > 0 ) ++ { ++- args->threads = calloc(1, sizeof(*args->threads)); +++ args->threads = (htsThreadPool *) calloc(1, sizeof(htsThreadPool)); ++ if ( !args->threads ) error("Could not allocate memory\n"); ++ if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n"); ++- BGZF *bgzf = hts_get_bgzfp(fp); ++- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); +++ hts_set_thread_pool(fp, args->threads); ++ } ++ ++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); ++@@ -412,11 +571,8 @@ ++ htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); ++ if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno)); ++ if ( args->threads ) ++- { ++- BGZF *bgzf = hts_get_bgzfp(fp_out); ++- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); ++- } ++- bcf_hdr_write(fp_out, hdr_out); +++ hts_set_thread_pool(fp_out, args->threads); +++ if ( bcf_hdr_write(fp_out, hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname ? 
args->output_fname : "standard output"); ++ ++ bcf1_t *rec = bcf_init(); ++ while ( bcf_read(fp, hdr, rec)==0 ) ++@@ -461,13 +617,13 @@ ++ if ( i!=rec->n_fmt ) ++ error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); ++ ++- bcf_write(fp_out,hdr_out,rec); +++ if ( bcf_write(fp_out,hdr_out,rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname ? args->output_fname : "standard output"); ++ } ++ bcf_destroy(rec); ++ ++ free(htxt.s); ++- hts_close(fp_out); ++- hts_close(fp); +++ if ( hts_close(fp_out)!=0 ) error("[%s] Error: failed to close the file %s\n",__func__,args->output_fname ? args->output_fname : "standard output"); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); ++ bcf_hdr_destroy(hdr_out); ++ bcf_hdr_destroy(hdr); ++ if ( args->threads ) ++@@ -485,10 +641,21 @@ ++ fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Options:\n"); +++ fprintf(bcftools_stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); ++ fprintf(bcftools_stderr, " -h, --header new header\n"); ++ fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); ++ fprintf(bcftools_stderr, " -s, --samples new sample names\n"); ++- fprintf(bcftools_stderr, " --threads number of extra compression threads (BCF only) [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); +++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, "Example:\n"); +++ fprintf(bcftools_stderr, " # Write out the header to be modified\n"); +++ fprintf(bcftools_stderr, " bcftools view -h old.bcf > header.txt\n"); +++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, " # Edit the header using your favorite text editor\n"); +++ fprintf(bcftools_stderr, " vi header.txt\n"); +++ fprintf(bcftools_stderr, "\n"); +++ 
fprintf(bcftools_stderr, " # Reheader the file\n"); +++ fprintf(bcftools_stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++@@ -501,21 +668,23 @@ ++ ++ static struct option loptions[] = ++ { +++ {"fai",1,0,'f'}, ++ {"output",1,0,'o'}, ++ {"header",1,0,'h'}, ++ {"samples",1,0,'s'}, ++ {"threads",1,NULL,1}, ++ {0,0,0,0} ++ }; ++- while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) ++ { ++ switch (c) ++ { ++ case 1 : args->n_threads = strtol(optarg, 0, 0); break; +++ case 'f': args->fai_fname = optarg; break; ++ case 'o': args->output_fname = optarg; break; ++ case 's': args->samples_fname = optarg; break; ++ case 'h': args->header_fname = optarg; break; ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -527,11 +696,12 @@ ++ } ++ else args->fname = argv[optind]; ++ +++ if ( args->fai_fname ) update_from_fai(args); ++ if ( !args->samples_fname && !args->header_fname ) usage(args); ++ if ( !args->fname ) usage(args); ++ ++ args->fp = hts_open(args->fname,"r"); ++- if ( !args->fp ) error("Failed to open: %s\n", args->fname); +++ if ( !args->fp ) error("Failed to read from %s\n", !strcmp("-",args->fname)?"standard input":args->fname); ++ args->type = *hts_get_format(args->fp); ++ ++ if ( args->type.format==vcf ) ++@@ -544,6 +714,11 @@ ++ else ++ reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); ++ +++ if ( args->rm_tmpfile ) +++ { +++ unlink(args->rm_tmpfile); +++ free(args->rm_tmpfile); +++ } ++ free(args); ++ return 0; ++ } ++--- python-pysam.orig/bcftools/smpl_ilist.c +++++ python-pysam/bcftools/smpl_ilist.c ++@@ -22,15 +22,29 @@ ++ THE SOFTWARE. 
++ */ ++ +++#include ++ #include "bcftools.h" ++ #include "smpl_ilist.h" ++ ++ void smpl_ilist_destroy(smpl_ilist_t *smpl) ++ { +++ int i; +++ if ( smpl->pair ) +++ { +++ for (i=0; in; i++) free(smpl->pair[i]); +++ free(smpl->pair); +++ } ++ free(smpl->idx); ++ free(smpl); ++ } ++ +++static inline int is_space_or_escaped(const char *min, const char *str) +++{ +++ if ( !isspace(*str) ) return 0; +++ int n = 0; +++ while ( --str>=min && *str=='\\' ) n++; +++ return n%2 ? 0 : 1; +++} ++ smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) ++ { ++ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); ++@@ -44,32 +58,63 @@ ++ return smpl; ++ } ++ +++ int negate = sample_list[0]=='^' ? 1 : 0; ++ int nlist; ++- char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); +++ char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); ++ if ( !list ) error("Could not parse %s\n", sample_list); ++ ++ // preserve the VCF order ++ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); +++ char **pair = NULL; ++ for (i=0; i=0 ) +++ char *smpl1 = list[i]; +++ char *smpl2 = NULL; +++ +++ char *ptr = list[i]; +++ while ( *ptr && !is_space_or_escaped(list[i], ptr) ) ptr++; +++ if ( *ptr ) +++ { +++ *ptr = 0; +++ smpl2 = ptr+1; +++ } +++ +++ char *smpl_name = flags&SMPL_PAIR2 && smpl2 ? 
smpl2 : smpl1; +++ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, smpl_name); +++ if ( idx<0 ) ++ { ++- tmp[idx] = 1; ++- smpl->n++; +++ if ( !(flags&SMPL_STRICT) ) +++ { +++ if ( flags&SMPL_VERBOSE ) fprintf(stderr,"No such sample: \"%s\"\n",smpl_name); +++ continue; +++ } +++ error("No such sample: \"%s\"\n", smpl_name); ++ } ++- else if ( flags&SMPL_STRICT ) ++- error("No such sample: %s\n", list[i]); +++ +++ tmp[idx] = 1; +++ if ( smpl2 ) +++ { +++ if ( !pair ) pair = (char**)calloc(bcf_hdr_nsamples(hdr),sizeof(char*)); +++ if ( flags&SMPL_PAIR2 ) pair[idx] = strdup(smpl1); +++ else if ( flags&SMPL_PAIR1 ) pair[idx] = strdup(smpl2); +++ } +++ smpl->n++; ++ } ++ ++- if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; +++ if ( negate ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; ++ smpl->idx = (int*) malloc(sizeof(int)*smpl->n); ++ ++ int j = 0; ++- if ( sample_list[0]!='^' ) +++ if ( !negate ) ++ { +++ if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); ++ for (i=0; iidx[j++] = i; +++ { +++ if ( !tmp[i] ) continue; +++ smpl->idx[j] = i; +++ if ( pair && pair[i] ) smpl->pair[j] = pair[i]; +++ j++; +++ } ++ } ++ else ++ { ++@@ -78,6 +123,7 @@ ++ } ++ ++ free(tmp); +++ free(pair); ++ for (i=0; i ++ #include "bcftools.h" ++ #include "smpl_ilist.h" ++ ++ void smpl_ilist_destroy(smpl_ilist_t *smpl) ++ { +++ int i; +++ if ( smpl->pair ) +++ { +++ for (i=0; in; i++) free(smpl->pair[i]); +++ free(smpl->pair); +++ } ++ free(smpl->idx); ++ free(smpl); ++ } ++ +++static inline int is_space_or_escaped(const char *min, const char *str) +++{ +++ if ( !isspace(*str) ) return 0; +++ int n = 0; +++ while ( --str>=min && *str=='\\' ) n++; +++ return n%2 ? 0 : 1; +++} ++ smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) ++ { ++ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); ++@@ -46,32 +60,63 @@ ++ return smpl; ++ } ++ +++ int negate = sample_list[0]=='^' ? 
1 : 0; ++ int nlist; ++- char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); +++ char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); ++ if ( !list ) error("Could not parse %s\n", sample_list); ++ ++ // preserve the VCF order ++ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); +++ char **pair = NULL; ++ for (i=0; i=0 ) +++ char *smpl1 = list[i]; +++ char *smpl2 = NULL; +++ +++ char *ptr = list[i]; +++ while ( *ptr && !is_space_or_escaped(list[i], ptr) ) ptr++; +++ if ( *ptr ) +++ { +++ *ptr = 0; +++ smpl2 = ptr+1; +++ } +++ +++ char *smpl_name = flags&SMPL_PAIR2 && smpl2 ? smpl2 : smpl1; +++ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, smpl_name); +++ if ( idx<0 ) ++ { ++- tmp[idx] = 1; ++- smpl->n++; +++ if ( !(flags&SMPL_STRICT) ) +++ { +++ if ( flags&SMPL_VERBOSE ) fprintf(bcftools_stderr,"No such sample: \"%s\"\n",smpl_name); +++ continue; +++ } +++ error("No such sample: \"%s\"\n", smpl_name); ++ } ++- else if ( flags&SMPL_STRICT ) ++- error("No such sample: %s\n", list[i]); +++ +++ tmp[idx] = 1; +++ if ( smpl2 ) +++ { +++ if ( !pair ) pair = (char**)calloc(bcf_hdr_nsamples(hdr),sizeof(char*)); +++ if ( flags&SMPL_PAIR2 ) pair[idx] = strdup(smpl1); +++ else if ( flags&SMPL_PAIR1 ) pair[idx] = strdup(smpl2); +++ } +++ smpl->n++; ++ } ++ ++- if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; +++ if ( negate ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; ++ smpl->idx = (int*) malloc(sizeof(int)*smpl->n); ++ ++ int j = 0; ++- if ( sample_list[0]!='^' ) +++ if ( !negate ) ++ { +++ if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); ++ for (i=0; iidx[j++] = i; +++ { +++ if ( !tmp[i] ) continue; +++ smpl->idx[j] = i; +++ if ( pair && pair[i] ) smpl->pair[j] = pair[i]; +++ j++; +++ } ++ } ++ else ++ { ++@@ -80,6 +125,7 @@ ++ } ++ ++ free(tmp); +++ free(pair); ++ for (i=0; i ++ ++-#define SMPL_NONE 0 // flexible error recovery ++-#define SMPL_STRICT 1 // samples 
must exist +++#define SMPL_NONE 0 // flexible error recovery +++#define SMPL_STRICT 1 // samples must exist +++#define SMPL_SINGLE 2 // single sample expected +++#define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr +++#define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr +++#define SMPL_VERBOSE 16 // print warnings ++ ++ typedef struct ++ { ++- int *idx; // index to bcf_hdr_t.samples +++ char **pair; // the other sample in the pair +++ int *idx; // index to bcf_hdr_t.samples; the first (SMPL_SINGLE|SMPL_PAIR1) or second sample (SMPL_PAIR2) ++ int n; ++ } ++ smpl_ilist_t; ++--- python-pysam.orig/bcftools/tabix.c +++++ python-pysam/bcftools/tabix.c ++@@ -27,6 +27,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ ++@@ -84,7 +85,6 @@ ++ { ++ // auto-detect file type by file name ++ int l = strlen(argv[optind]); ++- int strcasecmp(const char *s1, const char *s2); ++ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; ++ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; ++ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; ++--- python-pysam.orig/bcftools/tabix.c.pysam.c +++++ python-pysam/bcftools/tabix.c.pysam.c ++@@ -29,6 +29,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ ++@@ -86,7 +87,6 @@ ++ { ++ // auto-detect file type by file name ++ int l = strlen(argv[optind]); ++- int strcasecmp(const char *s1, const char *s2); ++ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; ++ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; ++ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; ++--- python-pysam.orig/bcftools/test/test-regidx.c +++++ python-pysam/bcftools/test/test-regidx.c ++@@ -32,6 +32,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "regidx.h" 
++ ++@@ -225,6 +226,54 @@ ++ regidx_destroy(idx); ++ free(str.s); ++ } +++void test_explicit(char *tgt, char *qry, char *exp) +++{ +++ regidx_t *idx = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); +++ +++ char *beg = tgt, *end, *exp_ori = exp; +++ kstring_t str = {0,0,0}; +++ while ( *beg ) +++ { +++ end = tgt; +++ while ( *end && *end!=';' ) end++; +++ str.l = 0; +++ kputsn(beg, end-beg, &str); +++ debug("insert: %s\n", str.s); +++ if ( regidx_insert(idx,str.s)!=0 ) error("insert failed: %s\n", str.s); +++ beg = *end ? end + 1 : end; +++ } +++ +++ beg = qry; +++ while ( *beg ) +++ { +++ end = qry; +++ while ( *end && *end!=';' ) end++; +++ str.l = 0; +++ kputsn(beg, end-beg, &str); +++ beg = *end ? end + 1 : end; +++ +++ char *chr_beg, *chr_end; +++ uint32_t reg_beg, reg_end; +++ if ( regidx_parse_reg(str.s, &chr_beg, &chr_end, ®_beg, ®_end, NULL, NULL)!=0 ) error("could not parse: %s in %s\n", str.s, qry); +++ chr_end[1] = 0; +++ int hit = regidx_overlap(idx,chr_beg,reg_beg,reg_end,NULL); +++ if ( *exp=='1' ) +++ { +++ if ( !hit ) error("query failed, there should be a hit .. %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ debug("ok: overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ } +++ else if ( *exp=='0' ) +++ { +++ if ( hit ) error("query failed, there should be no hit .. 
%s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ debug("ok: no overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ } +++ else error("could not parse: %s\n", exp_ori); +++ exp++; +++ } +++ +++ free(str.s); +++ regidx_destroy(idx); +++} ++ ++ void create_line_bed(char *line, char *chr, int start, int end) ++ { ++@@ -259,6 +308,11 @@ ++ set_line(line,chr,start,end); ++ debug("insert: %s", line); ++ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); +++ +++ start = 20000*i; end = start + 2000; +++ set_line(line,chr,start,end); +++ debug("insert: %s", line); +++ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); ++ } ++ ++ regitr_t *itr = regitr_init(idx); ++@@ -311,6 +365,19 @@ ++ } ++ if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); ++ +++ // fully contained interval, one hit +++ start = 20000*i - 5000; end = 20000*i + 3000; +++ set_line(line,chr,start,end); +++ if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); +++ debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); +++ nhit = 0; +++ while ( regitr_overlap(itr) ) +++ { +++ if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); +++ debug("\t %d-%d\n",itr->beg+1,itr->end+1); +++ nhit++; +++ } +++ if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); ++ } ++ regitr_destroy(itr); ++ regidx_destroy(idx); ++@@ -363,6 +430,9 @@ ++ info("Testing custom payload\n"); ++ test_custom_payload(); ++ +++ info("Testing cases encountered in past\n"); +++ test_explicit("12:2064519-2064763","12:2064488-2067434","1"); +++ ++ int i, ntest = 1000, nreg = 50; ++ srandom(seed); ++ info("%d randomized tests, %d regions per test. 
Random seed is %d\n", ntest,nreg,seed); ++--- python-pysam.orig/bcftools/test/test-regidx.c.pysam.c +++++ python-pysam/bcftools/test/test-regidx.c.pysam.c ++@@ -34,6 +34,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "regidx.h" ++ ++@@ -227,6 +228,54 @@ ++ regidx_destroy(idx); ++ free(str.s); ++ } +++void test_explicit(char *tgt, char *qry, char *exp) +++{ +++ regidx_t *idx = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); +++ +++ char *beg = tgt, *end, *exp_ori = exp; +++ kstring_t str = {0,0,0}; +++ while ( *beg ) +++ { +++ end = tgt; +++ while ( *end && *end!=';' ) end++; +++ str.l = 0; +++ kputsn(beg, end-beg, &str); +++ debug("insert: %s\n", str.s); +++ if ( regidx_insert(idx,str.s)!=0 ) error("insert failed: %s\n", str.s); +++ beg = *end ? end + 1 : end; +++ } +++ +++ beg = qry; +++ while ( *beg ) +++ { +++ end = qry; +++ while ( *end && *end!=';' ) end++; +++ str.l = 0; +++ kputsn(beg, end-beg, &str); +++ beg = *end ? end + 1 : end; +++ +++ char *chr_beg, *chr_end; +++ uint32_t reg_beg, reg_end; +++ if ( regidx_parse_reg(str.s, &chr_beg, &chr_end, ®_beg, ®_end, NULL, NULL)!=0 ) error("could not parse: %s in %s\n", str.s, qry); +++ chr_end[1] = 0; +++ int hit = regidx_overlap(idx,chr_beg,reg_beg,reg_end,NULL); +++ if ( *exp=='1' ) +++ { +++ if ( !hit ) error("query failed, there should be a hit .. %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ debug("ok: overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ } +++ else if ( *exp=='0' ) +++ { +++ if ( hit ) error("query failed, there should be no hit .. 
%s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ debug("ok: no overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); +++ } +++ else error("could not parse: %s\n", exp_ori); +++ exp++; +++ } +++ +++ free(str.s); +++ regidx_destroy(idx); +++} ++ ++ void create_line_bed(char *line, char *chr, int start, int end) ++ { ++@@ -261,6 +310,11 @@ ++ set_line(line,chr,start,end); ++ debug("insert: %s", line); ++ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); +++ +++ start = 20000*i; end = start + 2000; +++ set_line(line,chr,start,end); +++ debug("insert: %s", line); +++ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); ++ } ++ ++ regitr_t *itr = regitr_init(idx); ++@@ -313,6 +367,19 @@ ++ } ++ if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); ++ +++ // fully contained interval, one hit +++ start = 20000*i - 5000; end = 20000*i + 3000; +++ set_line(line,chr,start,end); +++ if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); +++ debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); +++ nhit = 0; +++ while ( regitr_overlap(itr) ) +++ { +++ if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); +++ debug("\t %d-%d\n",itr->beg+1,itr->end+1); +++ nhit++; +++ } +++ if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); ++ } ++ regitr_destroy(itr); ++ regidx_destroy(idx); ++@@ -365,6 +432,9 @@ ++ info("Testing custom payload\n"); ++ test_custom_payload(); ++ +++ info("Testing cases encountered in past\n"); +++ test_explicit("12:2064519-2064763","12:2064488-2067434","1"); +++ ++ int i, ntest = 1000, nreg = 50; ++ srandom(seed); ++ info("%d randomized tests, %d regions per test. 
Random seed is %d\n", ntest,nreg,seed); ++--- /dev/null +++++ python-pysam/bcftools/variantkey.h ++@@ -0,0 +1,583 @@ +++// VariantKey +++// +++// variantkey.h +++// +++// @category Libraries +++// @author Nicola Asuni +++// @copyright 2017-2018 GENOMICS plc +++// @license MIT (see LICENSE) +++// @link https://github.com/genomicsplc/variantkey +++// +++// LICENSE +++// +++// Copyright (c) 2017-2018 GENOMICS plc +++// +++// Permission is hereby granted, free of charge, to any person obtaining a copy +++// of this software and associated documentation files (the "Software"), to deal +++// in the Software without restriction, including without limitation the rights +++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++// copies of the Software, and to permit persons to whom the Software is +++// furnished to do so, subject to the following conditions: +++// +++// The above copyright notice and this permission notice shall be included in +++// all copies or substantial portions of the Software. +++// +++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +++// THE SOFTWARE. +++ +++/** +++ * @file variantkey.h +++ * @brief VariantKey main functions. +++ * +++ * The functions provided here allows to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants. +++ * The VariantKey is sortable for chromosome and position, +++ * and it is also fully reversible for variants with up to 11 bases between Reference and Alternate alleles. 
+++ * It can be used to sort, search and match variant-based data easily and very quickly. +++ */ +++ +++#ifndef VARIANTKEY_H +++#define VARIANTKEY_H +++ +++#include +++#include +++#include +++#include "hex.h" +++ +++#define VKMASK_CHROM 0xF800000000000000 //!< VariantKey binary mask for CHROM [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ] +++#define VKMASK_POS 0x07FFFFFF80000000 //!< VariantKey binary mask for POS [ 00000111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ] +++#define VKMASK_CHROMPOS 0xFFFFFFFF80000000 //!< VariantKey binary mask for CHROM+POS [ 11111111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ] +++#define VKMASK_REFALT 0x000000007FFFFFFF //!< VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ] +++#define VKSHIFT_CHROM 59 //!< CHROM LSB position from the VariantKey LSB +++#define VKSHIFT_POS 31 //!< POS LSB position from the VariantKey LSB +++ +++/** +++ * VariantKey struct. +++ * Contains the numerically encoded VariantKey components (CHROM, POS, REF+ALT). +++ */ +++typedef struct variantkey_t +++{ +++ uint8_t chrom; //!< Chromosome encoded number (only the LSB 5 bit are used) +++ uint32_t pos; //!< Reference position, with the first base having position 0 (only the LSB 28 bit are used) +++ uint32_t refalt; //!< Code for Reference and Alternate allele (only the LSB 31 bits are used) +++} variantkey_t; +++ +++/** +++ * Struct containing the minimum and maximum VariantKey values for range searches. +++ */ +++typedef struct vkrange_t +++{ +++ uint64_t min; //!< Minimum VariantKey value for any given REF+ALT encoding +++ uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding +++} vkrange_t; +++ +++/** @brief Returns chromosome numerical encoding. +++ * +++ * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. 
+++ * @param size Length of the chrom string, excluding the terminating null byte. +++ * +++ * @return CHROM code +++ */ +++static inline uint8_t encode_chrom(const char *chrom, size_t size) +++{ +++ // X > 23 ; Y > 24 ; M > 25 +++ static const uint8_t onecharmap[] = +++ { +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ /* M X Y */ +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,23,24, 0, 0, 0, 0, 0, 0, +++ /* m x y */ +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,23,24, 0, 0, 0, 0, 0, 0, +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +++ }; +++ // remove "chr" prefix +++ if ((size > 3) +++ && ((chrom[0] == 'c') || (chrom[0] == 'C')) +++ && ((chrom[1] == 'h') || (chrom[1] == 'H')) +++ && ((chrom[2] == 'r') || (chrom[2] == 'R'))) +++ { +++ chrom += 3; +++ size -= 3; +++ } +++ if (size == 0) +++ { +++ return 0; +++ } +++ if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number +++ { +++ size_t i; +++ uint8_t v = (chrom[0] - '0'); +++ for (i = 1; i < size; i++) +++ { +++ if ((chrom[i] > '9') || (chrom[i] < '0')) +++ { +++ return 0; // NA +++ } +++ v = ((v * 10) + (chrom[i] - '0')); +++ } +++ return v; +++ } +++ if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't')))) +++ { +++ return onecharmap[((uint8_t)chrom[0])]; +++ } +++ return 0; // NA +++} +++ +++/** @brief Decode the chromosome numerical code. +++ * +++ * @param code CHROM code. +++ * @param chrom CHROM string buffer to be returned. 
Its size should be enough to contain the results (max 4 bytes). +++ * +++ * @return If successful, the total number of characters written is returned, +++ * excluding the null-character appended at the end of the string, +++ * otherwise a negative number is returned in case of failure. +++ */ +++static inline size_t decode_chrom(uint8_t code, char *chrom) +++{ +++ if ((code < 1) || (code > 25)) +++ { +++ return sprintf(chrom, "NA"); +++ } +++ if (code < 23) +++ { +++ return sprintf(chrom, "%" PRIu8, code); +++ } +++ static const char *map[] = {"X", "Y", "MT"}; +++ return sprintf(chrom, "%s", map[(code - 23)]); +++} +++ +++static inline uint32_t encode_base(const uint8_t c) +++{ +++ /* +++ Encode base: +++ A > 0 +++ C > 1 +++ G > 2 +++ T > 3 +++ */ +++ static const uint32_t map[] = +++ { +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ /*A C G T*/ +++ 4,0,4,1,4,4,4,2,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,4,4,4,4,4,4,4,4,4, +++ /*a c g t*/ +++ 4,0,4,1,4,4,4,2,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,4,4,4,4,4,4,4,4,4, +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, +++ }; +++ return map[c]; +++} +++ +++static inline int encode_allele(uint32_t *h, uint8_t *bitpos, const char *str, size_t size) +++{ +++ uint32_t v; +++ while (size--) +++ { +++ v = encode_base(*str++); +++ if (v > 3) +++ { +++ return -1; +++ } +++ *bitpos -= 2; +++ *h |= (v << *bitpos); +++ } +++ return 0; +++} +++ +++static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const char *alt, size_t sizealt) +++{ +++ //[******** ******** ******** ******** *RRRRAAA A1122334 45566778 8990011*] +++ uint32_t h = 0; +++ h |= ((uint32_t)(sizeref) << 27); // RRRR: length of (REF - 1) +++ h |= 
((uint32_t)(sizealt) << 23); // AAAA: length of (ALT - 1) +++ uint8_t bitpos = 23; +++ if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) || (encode_allele(&h, &bitpos, alt, sizealt) < 0)) +++ { +++ return 0; // error code +++ } +++ return h; +++} +++ +++// Mix two 32 bit hash numbers using a MurmurHash3-like algorithm +++static inline uint32_t muxhash(uint32_t k, uint32_t h) +++{ +++ k *= 0xcc9e2d51; +++ k = (k >> 17) | (k << 15); +++ k *= 0x1b873593; +++ h ^= k; +++ h = (h >> 19) | (h << 13); +++ return ((h * 5) + 0xe6546b64); +++} +++ +++static inline uint32_t encode_packchar(int c) +++{ +++ if (c < 'A') +++ { +++ return 27; +++ } +++ if (c >= 'a') +++ { +++ return (uint32_t)(c - 'a' + 1); +++ } +++ return (uint32_t)(c - 'A' + 1); +++} +++ +++// pack blocks of 6 characters in 32 bit (6 x 5 bit + 2 spare bit) [ 01111122 22233333 44444555 55666660 ] +++static inline uint32_t pack_chars_tail(const char *str, size_t size) +++{ +++ uint32_t h = 0; +++ const char *pos = (str + size - 1); +++ switch (size) +++ { +++ case 5: +++ h ^= encode_packchar(*pos--) << (1 + (5 * 1)); +++ // fall through +++ case 4: +++ h ^= encode_packchar(*pos--) << (1 + (5 * 2)); +++ // fall through +++ case 3: +++ h ^= encode_packchar(*pos--) << (1 + (5 * 3)); +++ // fall through +++ case 2: +++ h ^= encode_packchar(*pos--) << (1 + (5 * 4)); +++ // fall through +++ case 1: +++ h ^= encode_packchar(*pos) << (1 + (5 * 5)); +++ } +++ return h; +++} +++ +++static inline uint32_t pack_chars(const char *str) +++{ +++ const char *pos = (str + 5); +++ return ((encode_packchar(*pos) << 1) +++ ^ (encode_packchar(*(pos-1)) << (1 + (5 * 1))) +++ ^ (encode_packchar(*(pos-2)) << (1 + (5 * 2))) +++ ^ (encode_packchar(*(pos-3)) << (1 + (5 * 3))) +++ ^ (encode_packchar(*(pos-4)) << (1 + (5 * 4))) +++ ^ (encode_packchar(*(pos-5)) << (1 + (5 * 5)))); +++} +++ +++// Return a 32 bit hash of a nucleotide string +++static inline uint32_t hash32(const char *str, size_t size) +++{ +++ uint32_t h = 0; +++ size_t len = 
6; +++ while (size >= len) +++ { +++ h = muxhash(pack_chars(str), h); +++ str += len; +++ size -= len; +++ } +++ if (size > 0) +++ { +++ h = muxhash(pack_chars_tail(str, size), h); +++ } +++ return h; +++} +++ +++static inline uint32_t encode_refalt_hash(const char *ref, size_t sizeref, const char *alt, size_t sizealt) +++{ +++ // 0x3 is the separator character between REF and ALT [00000000 00000000 00000000 00000011] +++ uint32_t h = muxhash(hash32(alt, sizealt), muxhash(0x3, hash32(ref, sizeref))); +++ // MurmurHash3 finalization mix - force all bits of a hash block to avalanche +++ h ^= h >> 16; +++ h *= 0x85ebca6b; +++ h ^= h >> 13; +++ h *= 0xc2b2ae35; +++ h ^= h >> 16; +++ return ((h >> 1) | 0x1); // 0x1 is the set bit to indicate HASH mode [00000000 00000000 00000000 00000001] +++} +++ +++/** @brief Returns reference+alternate numerical encoding. +++ * +++ * @param ref Reference allele. String containing a sequence of nucleotide letters. +++ * The value in the pos field refers to the position of the first nucleotide in the String. +++ * Characters must be A-Z, a-z or * +++ * @param sizeref Length of the ref string, excluding the terminating null byte. +++ * @param alt Alternate non-reference allele string. +++ * Characters must be A-Z, a-z or * +++ * @param sizealt Length of the alt string, excluding the terminating null byte. 
+++ * +++ * @return REF+ALT code +++ */ +++static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char *alt, size_t sizealt) +++{ +++ if ((sizeref + sizealt) <= 11) +++ { +++ uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt); +++ if (h != 0) +++ { +++ return h; +++ } +++ } +++ return encode_refalt_hash(ref, sizeref, alt, sizealt); +++} +++ +++static inline char decode_base(uint32_t code, int bitpos) +++{ +++ static const char base[4] = {'A', 'C', 'G', 'T'}; +++ return base[((code >> bitpos) & 0x3)]; // 0x3 is the 2 bit mask [00000011] +++} +++ +++static inline size_t decode_refalt_rev(uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt) +++{ +++ *sizeref = (size_t)((code & 0x78000000) >> 27); // [01111000 00000000 00000000 00000000] +++ *sizealt = (size_t)((code & 0x07800000) >> 23); // [00000111 10000000 00000000 00000000] +++ switch (*sizeref) +++ { +++ case 10: +++ ref[9] = decode_base(code, (3 + (2 * 0))); +++ // fall through +++ case 9: +++ ref[8] = decode_base(code, (3 + (2 * 1))); +++ // fall through +++ case 8: +++ ref[7] = decode_base(code, (3 + (2 * 2))); +++ // fall through +++ case 7: +++ ref[6] = decode_base(code, (3 + (2 * 3))); +++ // fall through +++ case 6: +++ ref[5] = decode_base(code, (3 + (2 * 4))); +++ // fall through +++ case 5: +++ ref[4] = decode_base(code, (3 + (2 * 5))); +++ // fall through +++ case 4: +++ ref[3] = decode_base(code, (3 + (2 * 6))); +++ // fall through +++ case 3: +++ ref[2] = decode_base(code, (3 + (2 * 7))); +++ // fall through +++ case 2: +++ ref[1] = decode_base(code, (3 + (2 * 8))); +++ // fall through +++ case 1: +++ ref[0] = decode_base(code, (3 + (2 * 9))); +++ } +++ ref[*sizeref] = 0; +++ uint8_t bitpos = (23 - ((*sizeref) << 1)); +++ switch (*sizealt) +++ { +++ case 10: +++ alt[9] = decode_base(code, bitpos - (2 * 10)); +++ // fall through +++ case 9: +++ alt[8] = decode_base(code, bitpos - (2 * 9)); +++ // fall through +++ case 8: +++ alt[7] = decode_base(code, 
bitpos - (2 * 8)); +++ // fall through +++ case 7: +++ alt[6] = decode_base(code, bitpos - (2 * 7)); +++ // fall through +++ case 6: +++ alt[5] = decode_base(code, bitpos - (2 * 6)); +++ // fall through +++ case 5: +++ alt[4] = decode_base(code, bitpos - (2 * 5)); +++ // fall through +++ case 4: +++ alt[3] = decode_base(code, bitpos - (2 * 4)); +++ // fall through +++ case 3: +++ alt[2] = decode_base(code, bitpos - (2 * 3)); +++ // fall through +++ case 2: +++ alt[1] = decode_base(code, bitpos - (2 * 2)); +++ // fall through +++ case 1: +++ alt[0] = decode_base(code, bitpos - (2 * 1)); +++ } +++ alt[*sizealt] = 0; +++ return (*sizeref + *sizealt); +++} +++ +++/** @brief Decode the 32 bit REF+ALT code if reversible (if it has 11 or less bases in total and only contains ACGT letters). +++ * +++ * @param code REF+ALT code +++ * @param ref REF string buffer to be returned. +++ * @param sizeref Pointer to the size of the ref buffer, excluding the terminating null byte. +++ * This will contain the final ref size. +++ * @param alt ALT string buffer to be returned. +++ * @param sizealt Pointer to the size of the alt buffer, excluding the terminating null byte. +++ * This will contain the final alt size. +++ * +++ * @return If the code is reversible, then the total number of characters of REF+ALT is returned. +++ * Otherwise 0 is returned. +++ */ +++static inline size_t decode_refalt(uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt) +++{ +++ if (code & 0x1) // check last bit +++ { +++ return 0; // non-reversible encoding +++ } +++ return decode_refalt_rev(code, ref, sizeref, alt, sizealt); +++} +++ +++/** @brief Returns a 64 bit variant key based on the pre-encoded CHROM, POS (0-based) and REF+ALT. +++ * +++ * @param chrom Encoded Chromosome (see encode_chrom). +++ * @param pos Position. The reference position, with the first base having position 0. +++ * @param refalt Encoded Reference + Alternate (see encode_refalt). 
+++ * +++ * @return VariantKey 64 bit code. +++ */ +++static inline uint64_t encode_variantkey(uint8_t chrom, uint32_t pos, uint32_t refalt) +++{ +++ return (((uint64_t)chrom << VKSHIFT_CHROM) | ((uint64_t)pos << VKSHIFT_POS) | (uint64_t)refalt); +++} +++ +++/** @brief Extract the CHROM code from VariantKey. +++ * +++ * @param vk VariantKey code. +++ * +++ * @return CHROM code. +++ */ +++static inline uint8_t extract_variantkey_chrom(uint64_t vk) +++{ +++ return (uint8_t)((vk & VKMASK_CHROM) >> VKSHIFT_CHROM); +++} +++ +++/** @brief Extract the POS code from VariantKey. +++ * +++ * @param vk VariantKey code. +++ * +++ * @return POS. +++ */ +++static inline uint32_t extract_variantkey_pos(uint64_t vk) +++{ +++ return (uint32_t)((vk & VKMASK_POS) >> VKSHIFT_POS); +++} +++ +++/** @brief Extract the REF+ALT code from VariantKey. +++ * +++ * @param vk VariantKey code. +++ * +++ * @return REF+ALT code. +++ */ +++static inline uint32_t extract_variantkey_refalt(uint64_t vk) +++{ +++ return (uint32_t)(vk & VKMASK_REFALT); +++} +++ +++/** @brief Decode a VariantKey code and returns the components as variantkey_t structure. +++ * +++ * @param code VariantKey code. +++ * @param vk Decoded variantkey structure. +++ */ +++static inline void decode_variantkey(uint64_t code, variantkey_t *vk) +++{ +++ vk->chrom = extract_variantkey_chrom(code); +++ vk->pos = extract_variantkey_pos(code); +++ vk->refalt = extract_variantkey_refalt(code); +++} +++ +++/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. +++ * +++ * @param chrom Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted. +++ * @param sizechrom Length of the chrom string, excluding the terminating null byte. +++ * @param pos Position. The reference position, with the first base having position 0. +++ * @param ref Reference allele. String containing a sequence of nucleotide letters. 
+++ * The value in the pos field refers to the position of the first nucleotide in the String. +++ * Characters must be A-Z, a-z or * +++ * @param sizeref Length of the ref string, excluding the terminating null byte. +++ * @param alt Alternate non-reference allele string. +++ * Characters must be A-Z, a-z or * +++ * @param sizealt Length of the alt string, excluding the terminating null byte. +++ * +++ * @return VariantKey 64 bit code. +++ */ +++static inline uint64_t variantkey(const char *chrom, size_t sizechrom, uint32_t pos, const char *ref, size_t sizeref, const char *alt, size_t sizealt) +++{ +++ return encode_variantkey(encode_chrom(chrom, sizechrom), pos, encode_refalt(ref, sizeref, alt, sizealt)); +++} +++ +++/** @brief Returns minimum and maximum VariantKeys for range searches. +++ * +++ * @param chrom Chromosome encoded number. +++ * @param pos_min Start reference position, with the first base having position 0. +++ * @param pos_max End reference position, with the first base having position 0. +++ * @param range VariantKey range values. +++ */ +++static inline void variantkey_range(uint8_t chrom, uint32_t pos_min, uint32_t pos_max, vkrange_t *range) +++{ +++ uint64_t c = ((uint64_t)chrom << VKSHIFT_CHROM); +++ range->min = (c | ((uint64_t)pos_min << VKSHIFT_POS)); +++ range->max = (c | ((uint64_t)pos_max << VKSHIFT_POS) | VKMASK_REFALT); +++} +++ +++static inline int8_t compare_uint64_t(uint64_t a, uint64_t b) +++{ +++ return (a < b) ? -1 : (a > b); +++} +++ +++/** @brief Compares two VariantKeys by chromosome only. +++ * +++ * @param vka The first VariantKey to be compared. +++ * @param vkb The second VariantKey to be compared. +++ * +++ * @return -1 if the first chromosome is smaller than the second, 0 if they are equal and 1 if the first is greater than the second. 
+++ */ +++static inline int8_t compare_variantkey_chrom(uint64_t vka, uint64_t vkb) +++{ +++ return compare_uint64_t((vka >> VKSHIFT_CHROM), (vkb >> VKSHIFT_CHROM)); +++} +++ +++/** @brief Compares two VariantKeys by chromosome and position. +++ * +++ * @param vka The first VariantKey to be compared. +++ * @param vkb The second VariantKey to be compared. +++ * +++ * @return -1 if the first CHROM+POS is smaller than the second, 0 if they are equal and 1 if the first is greater than the second. +++ */ +++static inline int8_t compare_variantkey_chrom_pos(uint64_t vka, uint64_t vkb) +++{ +++ return compare_uint64_t((vka >> VKSHIFT_POS), (vkb >> VKSHIFT_POS)); +++} +++ +++/** @brief Returns VariantKey hexadecimal string (16 characters). +++ * +++ * The string represent a 64 bit number or: +++ * - 5 bit for CHROM +++ * - 28 bit for POS +++ * - 31 bit for REF+ALT +++ * +++ * @param vk VariantKey code. +++ * @param str String buffer to be returned (it must be sized 17 bytes at least). +++ * +++ * @return Upon successful return, these function returns the number of characters processed +++ * (excluding the null byte used to end output to strings). +++ * If the buffer size is not sufficient, then the return value is the number of characters required for +++ * buffer string, including the terminating null byte. +++ */ +++static inline size_t variantkey_hex(uint64_t vk, char *str) +++{ +++ return hex_uint64_t(vk, str); +++} +++ +++/** @brief Parses a VariantKey hexadecimal string and returns the code. +++ * +++ * @param vs VariantKey hexadecimal string (it must contain 16 hexadecimal characters). +++ * +++ * @return A VariantKey code. +++ */ +++static inline uint64_t parse_variantkey_hex(const char *vs) +++{ +++ return parse_hex_uint64_t(vs); +++} +++ +++#endif // VARIANTKEY_H ++--- python-pysam.orig/bcftools/vcfannotate.c +++++ python-pysam/bcftools/vcfannotate.c ++@@ -1,6 +1,6 @@ ++ /* vcfannotate.c -- Annotate and edit VCF/BCF files. 
++ ++- Copyright (C) 2013-2018 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -33,16 +33,17 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++-#include ++ #include "bcftools.h" ++ #include "vcmp.h" ++ #include "filter.h" ++ #include "convert.h" ++ #include "smpl_ilist.h" +++#include "regidx.h" ++ ++ struct _args_t; ++ ++@@ -65,15 +66,30 @@ ++ } ++ annot_line_t; ++ ++-#define REPLACE_MISSING 0 // replace only missing values ++-#define REPLACE_ALL 1 // replace both missing and existing values ++-#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing ++-#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +++#define REPLACE_MISSING 0 // replace only missing values +++#define REPLACE_ALL 1 // replace both missing and existing values +++#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing +++#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +++#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest +++#define MM_APPEND 1 // append, possibly multiple times +++#define MM_UNIQUE 2 // append, only unique values +++#define MM_SUM 3 +++#define MM_AVG 4 +++#define MM_MIN 5 +++#define MM_MAX 6 ++ typedef struct _annot_col_t ++ { ++ int icol, replace, number; // number: one of BCF_VL_* types ++ char *hdr_key_src, *hdr_key_dst; ++ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); +++ int merge_method; // one of the MM_* defines +++ khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values +++ kstring_t mm_kstr; +++ double +++ mm_dbl_nalloc, // the allocated size --merge-logic values array +++ mm_dbl_nused, // the number of used elements in the mm_dbl array +++ mm_dbl_ndat, // the number of merged rows (for calculating the average) +++ *mm_dbl; ++ } ++ annot_col_t; ++ ++@@ 
-92,6 +108,10 @@ ++ int output_type, n_threads; ++ bcf_sr_regions_t *tgts; ++ +++ regidx_t *tgt_idx; +++ regitr_t *tgt_itr; +++ int tgt_is_bed; +++ ++ filter_t *filter; ++ char *filter_str; ++ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE ++@@ -104,7 +124,7 @@ ++ vcmp_t *vcmp; // for matching annotation and VCF lines by allele ++ annot_line_t *alines; // buffered annotation lines ++ int nalines, malines; ++- int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present +++ int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present ++ annot_col_t *cols; // column indexes and setters ++ int ncols; ++ ++@@ -125,18 +145,40 @@ ++ ++ char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; ++ char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; ++- int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; +++ char *merge_method_str; +++ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; ++ } ++ args_t; ++ ++ char *msprintf(const char *fmt, ...); ++ +++int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +++{ +++ args_t *args = (args_t*) usr; +++ int ret = args->tgt_is_bed ? 
regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL); +++ if ( ret<0 ) return ret; +++ *((char **)payload) = strdup(line); +++ return 0; +++} +++void free_payload(void *payload) +++{ +++ char *str = *((char**)payload); +++ free(str); +++} +++ ++ void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag) ++ { ++ bcf_update_id(args->hdr,line,NULL); ++ } ++ void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag) ++ { +++ if ( tag->key && tag->hdr_id<0 ) +++ { +++ error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n" +++ " Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n" +++ " Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n" +++ ); +++ } ++ if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass); ++ else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass); ++ } ++@@ -223,7 +265,10 @@ ++ memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); ++ bcf_hrec_destroy(hrec); ++ } ++- if ( nrm ) bcf_hdr_sync(hdr); +++ if ( nrm ) { +++ if (bcf_hdr_sync(hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); +++ } ++ } ++ ++ static void init_remove_annots(args_t *args) ++@@ -264,8 +309,14 @@ ++ tag->handler = remove_filter; ++ tag->key = strdup(str.s); ++ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); ++- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); ++- if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); +++ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) +++ { +++ if ( args->keep_sites ) +++ error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s); +++ else +++ fprintf(stderr,"Warning: The filter \"%s\" is not defined in the header\n", str.s); +++ } 
+++ else if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); ++ } ++ else ++ { ++@@ -280,8 +331,14 @@ ++ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); ++ if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) ++ { ++- fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); ++- args->nrm--; +++ if ( args->keep_sites ) +++ error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s); +++ else +++ fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); +++ +++ tag->key = strdup(str.s); +++ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; +++ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; ++ } ++ else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) ++ { ++@@ -364,7 +421,8 @@ ++ } ++ khash_str2int_destroy_free(keep); ++ if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ static void init_header_lines(args_t *args) ++ { ++@@ -376,13 +434,17 @@ ++ if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); ++ bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) ++ } ++- hts_close(file); +++ if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); ++ free(str.s); ++- bcf_hdr_sync(args->hdr_out); ++- bcf_hdr_sync(args->hdr); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update output header", __func__); +++ if (bcf_hdr_sync(args->hdr) < 0) +++ error_errno("[%s] Failed to update input header", __func__); ++ } ++ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); +++ ++ // note: so far this works only with one filter, not a list of filters ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." ++@@ -432,6 +494,8 @@ ++ } ++ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); +++ ++ // possible cases: ++ // IN ANNOT OUT ACHIEVED_BY ++ // x y x -c +ID ++@@ -493,6 +557,8 @@ ++ } ++ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ char *str = tab->cols[col->icol]; ++ if ( str[0]=='.' && str[1]==0 ) return 0; // empty ++@@ -501,7 +567,7 @@ ++ ++ line->qual = strtod(str, &str); ++ if ( str == tab->cols[col->icol] ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ return 0; ++ } ++ static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++@@ -514,13 +580,15 @@ ++ } ++ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ char *str = tab->cols[col->icol]; ++ if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); ++ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ return -1; ++ } ++ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++@@ -533,13 +601,13 @@ ++ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) ++ { ++ if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", 
ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); ++@@ -565,19 +633,75 @@ ++ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ annot_line_t *tab = (annot_line_t*) data; ++- char *str = tab->cols[col->icol], *end = str; ++- if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++- int ntmpi = 0; ++- while ( *end ) +++ if ( !tab ) +++ { +++ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) +++ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); +++ } +++ +++ int i,ntmpi = 0; +++ if ( tab ) +++ { +++ char *str = tab->cols[col->icol], *end = str; +++ if ( str[0]=='.' && str[1]==0 ) return 0; +++ +++ while ( *end ) +++ { +++ int val = strtol(str, &end, 10); +++ if ( end==str ) +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); +++ ntmpi++; +++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); +++ args->tmpi[ntmpi-1] = val; +++ str = end+1; +++ } +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( !col->mm_dbl_nused ) +++ { +++ col->mm_dbl_nused = ntmpi; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i] = args->tmpi[i]; +++ } +++ else +++ { +++ if ( col->merge_method==MM_APPEND ) +++ { +++ int nori = col->mm_dbl_nused; +++ col->mm_dbl_nused += ntmpi; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i+nori] = args->tmpi[i]; +++ } +++ else +++ { +++ if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); +++ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) +++ for (i=0; imm_dbl[i] += args->tmpi[i]; +++ else if ( col->merge_method==MM_MIN ) +++ for (i=0; imm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } +++ else if ( col->merge_method==MM_MAX ) +++ for (i=0; imm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } +++ } +++ } +++ col->mm_dbl_ndat++; +++ } +++ } +++ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) +++ { +++ ntmpi = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); +++ for (i=0; itmpi[i] = col->mm_dbl[i]; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; +++ } +++ else if ( col->merge_method==MM_AVG ) ++ { ++- int val = strtol(str, &end, 10); ++- if ( end==str ) ++- error("Could not parse %s at %s:%d .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); ++- ntmpi++; +++ ntmpi = col->mm_dbl_nused; ++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); ++- args->tmpi[ntmpi-1] = val; ++- str = end+1; +++ for (i=0; itmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; ++ } ++ ++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++@@ -613,13 +737,13 @@ ++ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) ++ { ++ if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); ++@@ -645,19 +769,75 @@ ++ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ annot_line_t *tab = (annot_line_t*) data; ++- char *str = tab->cols[col->icol], *end = str; ++- if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++- int ntmpf = 0; ++- while ( *end ) +++ if ( !tab ) ++ { ++- double val = strtod(str, &end); ++- if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); ++- ntmpf++; ++- hts_expand(float,ntmpf,args->mtmpf,args->tmpf); ++- args->tmpf[ntmpf-1] = val; ++- str = end+1; +++ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) +++ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); +++ } +++ +++ int i,ntmpf = 0; +++ if ( tab ) +++ { +++ char *str = tab->cols[col->icol], *end = str; +++ if ( str[0]=='.' && str[1]==0 ) return 0; +++ +++ while ( *end ) +++ { +++ double val = strtod(str, &end); +++ if ( end==str ) +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); +++ ntmpf++; +++ hts_expand(float,ntmpf,args->mtmpf,args->tmpf); +++ args->tmpf[ntmpf-1] = val; +++ str = end+1; +++ } +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( !col->mm_dbl_nused ) +++ { +++ col->mm_dbl_nused = ntmpf; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i] = args->tmpf[i]; +++ } +++ else +++ { +++ if ( col->merge_method==MM_APPEND ) +++ { +++ int nori = col->mm_dbl_nused; +++ col->mm_dbl_nused += ntmpf; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i+nori] = args->tmpf[i]; +++ } +++ else +++ { +++ if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); +++ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) +++ for (i=0; imm_dbl[i] += args->tmpf[i]; +++ else if ( col->merge_method==MM_MIN ) +++ for (i=0; imm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } +++ else if ( col->merge_method==MM_MAX ) +++ for (i=0; imm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } +++ } +++ } +++ col->mm_dbl_ndat++; +++ } +++ } +++ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) +++ { +++ ntmpf = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); +++ for (i=0; itmpf[i] = col->mm_dbl[i]; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; +++ } +++ else if ( col->merge_method==MM_AVG ) +++ { +++ ntmpf = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); +++ for (i=0; itmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; ++ } ++ ++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++@@ -693,6 +873,8 @@ ++ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c ++ static int setter_ARinfo_string(args_t *args, 
bcf1_t *line, annot_col_t *col, int nals, char **als) ++ { +++ assert( col->merge_method==MM_FIRST ); +++ ++ int nsrc = 1, lsrc = 0; ++ while ( args->tmps[lsrc] ) ++ { ++@@ -700,13 +882,13 @@ ++ lsrc++; ++ } ++ if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int i, empty = 0, nstr, mstr = args->tmpks.m; ++@@ -746,22 +928,76 @@ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); ++ return 0; ++ } +++void khash_str2int_clear_free(void *_hash) +++{ +++ khash_t(str2int) *hash = (khash_t(str2int)*)_hash; +++ khint_t k; +++ if (hash == 0) return; +++ for (k = 0; k < kh_end(hash); ++k) +++ if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); +++ kh_clear(str2int, hash); +++} ++ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) +++ { +++ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); +++ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; +++ } +++ ++ annot_line_t *tab = (annot_line_t*) data; ++- int len = strlen(tab->cols[col->icol]); ++- if ( !len ) return 0; ++- hts_expand(char,len+1,args->mtmps,args->tmps); ++- memcpy(args->tmps,tab->cols[col->icol],len+1); ++- if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0; +++ +++ int len = 0; +++ if ( tab ) +++ { +++ len = strlen(tab->cols[col->icol]); +++ if ( !len ) return 0; +++ if ( len==1 && tab->cols[col->icol][0]=='.' 
) return 0; +++ } ++ ++- if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++- return setter_ARinfo_string(args,line,col,tab->nals,tab->als); +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) +++ error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n"); ++ ++- if ( col->replace==REPLACE_MISSING ) +++ if ( data ) +++ { +++ assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); +++ if ( col->merge_method==MM_UNIQUE ) +++ { +++ if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); +++ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; +++ khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); +++ } +++ +++ if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); +++ kputs(tab->cols[col->icol], &col->mm_kstr); +++ return 0; +++ } +++ +++ if ( col->mm_kstr.l ) +++ { +++ hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps); +++ memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1); +++ } +++ else +++ return 0; +++ +++ if ( !data ) // flush the line +++ { +++ if ( col->merge_method==MM_UNIQUE ) +++ khash_str2int_clear_free(col->mm_str_hash); +++ col->mm_kstr.l = 0; +++ } +++ } +++ else ++ { ++- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); ++- if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; +++ assert(tab); +++ hts_expand(char,len+1,args->mtmps,args->tmps); +++ memcpy(args->tmps,tab->cols[col->icol],len+1); +++ +++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) +++ return setter_ARinfo_string(args,line,col,tab->nals,tab->als); ++ } ++ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); ++@@ -785,6 +1021,48 @@ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); ++ return 0; ++ } +++static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) +++{ +++ int i, isrc, idst; +++ int blen = nsrc1 > 1 ? nsrc1 + 1 : 1; // typically the genotypes take three bytes 0/1, no 0-termination is needed +++ +++gt_length_too_big: +++ str->l = 0; +++ for (idst=0; idstsample_map ? args->sample_map[idst] : idst; +++ if ( isrc==-1 ) +++ { +++ kputc_('.', str); +++ for (i=1; i < blen; i++) kputc_(0, str); +++ continue; +++ } +++ +++ size_t plen = str->l; +++ int32_t *ptr = src + isrc*nsrc1; +++ for (i=0; il - plen > blen ) +++ { +++ // too many alternate alleles or ploidy is too large, the genotype does not fit +++ // three characters ("0/0" vs "10/10"). +++ blen *= 2; +++ goto gt_length_too_big; +++ } +++ plen = str->l - plen; +++ while ( plen < blen ) +++ { +++ kputc_(0, str); +++ plen++; +++ } +++ } +++ return 0; +++} ++ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ bcf1_t *rec = (bcf1_t*) data; ++@@ -792,6 +1070,16 @@ ++ if ( nsrc==-3 ) return 0; // the tag is not present ++ if ( nsrc<=0 ) return 1; // error ++ +++ // Genotypes are internally represented as integers. 
This is a complication when +++ // adding as a different Type=String field, such as FMT/newGT:=GT +++ if ( strcmp(col->hdr_key_src,col->hdr_key_dst) ) +++ { +++ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); +++ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); +++ genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks); +++ return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l); +++ } +++ ++ if ( !args->sample_map ) ++ return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc); ++ ++@@ -1057,9 +1345,11 @@ ++ } ++ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); ++ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); ++ ++@@ -1082,7 +1372,7 @@ ++ char *end = str; ++ ptr[ival] = strtol(str, &end, 10); ++ if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ ++ ival++; ++ str = *end ? 
end+1 : end; ++@@ -1094,9 +1384,11 @@ ++ } ++ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); ++ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); ++ ++@@ -1120,7 +1412,7 @@ ++ char *end = str; ++ ptr[ival] = strtod(str, &end); ++ if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ ++ ival++; ++ str = *end ? end+1 : end; ++@@ -1132,9 +1424,11 @@ ++ } ++ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ismpl; ++ for (ismpl=0; ismplnsmpl_annot; ismpl++) ++@@ -1186,7 +1480,7 @@ ++ // create mapping from src to dst genotypes, haploid and diploid version ++ int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? 
rec->n_allele : rec->n_allele - 1; ++ int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); ++- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int i, j; ++ if ( rec->n_allele==line->n_allele ) ++@@ -1226,15 +1520,15 @@ ++ } ++ int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); ++ if ( pld_src<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); ++ int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); ++ if ( pld_dst<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; ++ if ( ndst1_new != ndst1 ) ++ { ++- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); +++ if ( ndst1 ) error("todo: %s ndst1!=ndst .. 
%d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ndst1 = ndst1_new; ++ hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); ++ } ++@@ -1254,7 +1548,7 @@ ++ if ( col->number==BCF_VL_G ) ++ { ++ if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) ++- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ if ( !args->dst_smpl_pld[i] ) ++ for (j=0; jfiles->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); ++ if ( nsrc==-3 ) return 0; // the tag is not present ++@@ -1294,7 +1587,7 @@ ++ // create mapping from src to dst genotypes, haploid and diploid version ++ int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; ++ int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); ++- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int i, j; ++ if ( rec->n_allele==line->n_allele ) ++@@ -1334,15 +1627,15 @@ ++ } ++ int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); ++ if ( pld_src<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); ++ int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); ++ if ( 
pld_dst<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; ++ if ( ndst1_new != ndst1 ) ++ { ++- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); +++ if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ndst1 = ndst1_new; ++ hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); ++ } ++@@ -1362,7 +1655,7 @@ ++ if ( col->number==BCF_VL_G ) ++ { ++ if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) ++- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ if ( !args->dst_smpl_pld[i] ) ++ for (j=0; jtmps = args->tmpp[0]; // tmps might be realloced ++ if ( ret==-3 ) return 0; // the tag is not present ++ if ( ret<=0 ) return 1; // error ++- return core_setter_format_str(args,line,col,args->tmpp); +++ if ( strcmp("GT",col->hdr_key_dst) ) +++ return core_setter_format_str(args,line,col,args->tmpp); +++ +++ // Genotypes are internally represented as integers. 
This is a complication for FMT/GT:=oldGT +++ // First determine the maximum number of alleles per-sample ndst1 +++ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); +++ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); +++ int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src; +++ char *ptr = args->tmps, *ptr_end = ptr + ret; +++ while ( ptr < ptr_end ) +++ { +++ char *smpl_end = ptr + nsrc1; +++ int n = 1; +++ while ( ptr < smpl_end ) +++ { +++ if ( *ptr=='/' || *ptr=='|' ) n++; +++ ptr++; +++ } +++ if ( ndst1 < n ) ndst1 = n; +++ } +++ assert( ndst1 ); +++ +++ int ndst = ndst1*nsmpl_dst; +++ hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); +++ hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated +++ for (idst=0; idsttmpi + idst*ndst1; +++ isrc = args->sample_map ? args->sample_map[idst] : idst; +++ if ( isrc==-1 ) +++ { +++ dst[0] = bcf_gt_missing; +++ for (i=1; itmps + isrc*nsrc1, *tmp; +++ char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0; +++ while ( *beg ) +++ { +++ char *end = beg; +++ while ( *end && *end!='/' && *end!='|' ) end++; +++ if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing; +++ else +++ { +++ if ( *end=='|' ) is_phased = 1; +++ dst[i] = strtol(beg, &tmp, 10); +++ if ( tmp!=end ) +++ error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname); +++ if ( dst[i] >= line->n_allele ) +++ error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1); +++ dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]); +++ } +++ beg = *end ? 
end+1 : end; +++ i++; +++ } +++ *keep_ptr = keep; +++ for (; ihdr_out,line,args->tmpi,ndst); ++ } ++ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) ++ { ++@@ -1446,62 +1798,25 @@ ++ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); ++ for (i=0; insample_map; i++) args->sample_map[i] = -1; ++ ++- // possible todo: could do with smpl_ilist only ++- smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT); ++- if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); ++- char **samples = (char**) malloc(sizeof(char*)*ilist->n); ++- for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); +++ int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file +++ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src +++ if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); ++ args->nsmpl_annot = ilist->n; ++- smpl_ilist_destroy(ilist); ++ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; ++- if ( !src ) +++ for (i=0; insmpl_annot; i++) ++ { ++- // tab annotation file ++- for (i=0; insmpl_annot; i++) +++ int idst = ilist->idx[i]; +++ const char *src_name = ilist->pair && ilist->pair[i] ? 
ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst); +++ int isrc = i; +++ if ( src ) // the annotation file is a VCF, not a tab-delimited file ++ { ++- int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); ++- args->sample_map[idst] = i; ++- if ( idst!=i ) need_sample_map = 1; ++- } ++- } ++- else ++- { ++- // vcf annotation file ++- for (i=0; insmpl_annot; i++) ++- { ++- int isrc, idst; ++- char *ss = samples[i], *se = samples[i]; ++- while ( *se && !isspace(*se) ) se++; ++- if ( !*se ) ++- { ++- // only one sample name ++- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); ++- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); ++- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); ++- args->sample_map[idst] = isrc; ++- if ( idst!=isrc ) need_sample_map = 1; ++- continue; ++- } ++- *se = 0; ++- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); ++- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); ++- ++- ss = se+1; ++- while ( isspace(*ss) ) ss++; ++- se = ss; ++- while ( *se && !isspace(*se) ) se++; ++- ++- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); ++- ++- args->sample_map[idst] = isrc; ++- if ( idst!=isrc ) need_sample_map = 1; +++ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name); +++ if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name); ++ } +++ if ( isrc!=idst ) need_sample_map = 1; +++ args->sample_map[idst] = isrc; ++ } ++- for (i=0; insmpl_annot; i++) free(samples[i]); ++- free(samples); +++ smpl_ilist_destroy(ilist); ++ return need_sample_map; ++ } ++ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) ++@@ -1605,9 +1920,9 @@ ++ kputsn(ss, se-ss, &str); ++ if ( !str.s[0] || 
!strcasecmp("-",str.s) ) ; ++ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol; ++- else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; ++- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; ++- else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; +++ else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol; +++ else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol; +++ else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol; ++ else if ( !strcasecmp("REF",str.s) ) ++ { ++ if ( args->tgts_is_vcf ) ++@@ -1667,7 +1982,8 @@ ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++ } ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ } ++ else if ( !strcasecmp("QUAL",str.s) ) ++@@ -1698,7 +2014,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); ++ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); ++ annot_col_t *col = &args->cols[args->ncols-1]; ++@@ -1732,7 +2049,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); ++ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); ++ annot_col_t *col = &args->cols[args->ncols-1]; ++@@ -1774,7 +2092,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format_rename(hrec, key_dst, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if 
(bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); ++ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) ++@@ -1811,13 +2130,30 @@ ++ { ++ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); ++ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); ++- char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; +++ int explicit_info = 0; +++ char *key_dst; +++ if ( !strncasecmp("INFO/",str.s,5) ) +++ { +++ key_dst = str.s + 5; +++ explicit_info = 1; +++ } +++ else +++ key_dst = str.s; ++ char *key_src = strstr(key_dst,":="); ++ if ( key_src ) ++ { ++ *key_src = 0; ++ key_src += 2; ++- if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; +++ if ( !strncasecmp("INFO/",key_src,5) ) +++ { +++ key_src += 5; +++ explicit_info = 1; +++ } +++ else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) +++ { +++ key_src[-2] = ':'; +++ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); +++ } ++ } ++ else ++ key_src = key_dst; ++@@ -1827,11 +2163,18 @@ ++ if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line ++ { ++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); ++- if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); +++ if ( !hrec ) +++ { +++ if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) +++ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); +++ fprintf(stderr,"[%s] %d\n",key_src,explicit_info); +++ error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); +++ } ++ tmp.l = 0; ++ bcf_hrec_format_rename(hrec, key_dst, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- 
bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); ++ } ++ else ++@@ -1860,7 +2203,6 @@ ++ } ++ free(str.s); ++ free(tmp.s); ++- if ( args->to_idx==-1 ) args->to_idx = args->from_idx; ++ free(args->columns); ++ if ( skip_info ) khash_str2int_destroy_free(skip_info); ++ if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); ++@@ -1879,6 +2221,54 @@ ++ else if ( sample_map_ok<0 ) ++ error("No matching samples in source and destination file?\n"); ++ } +++static void init_merge_method(args_t *args) +++{ +++ int i; +++ for (i=0; incols; i++) +++ { +++ args->cols[i].merge_method = MM_FIRST; +++ args->cols[i].mm_str_hash = NULL; +++ args->cols[i].mm_dbl = NULL; +++ args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; +++ memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); +++ } +++ if ( !args->merge_method_str ) return; +++ if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); +++ if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); +++ char *sb = args->merge_method_str; +++ while ( *sb ) +++ { +++ char *se = sb; +++ while ( *se && *se!=',' ) se++; +++ args->tmpks.l = 0; +++ kputsn(sb, se-sb, &args->tmpks); +++ kputc(0, &args->tmpks); +++ char *mm_type_str = args->tmpks.s + args->tmpks.l; +++ while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; +++ if ( *mm_type_str!=':' ) +++ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); +++ *mm_type_str = 0; +++ mm_type_str++; +++ int mm_type = MM_FIRST; +++ if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; +++ else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND; +++ else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; +++ 
else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; +++ else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; +++ else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; +++ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); +++ for (i=0; incols; i++) +++ { +++ if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; +++ if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) +++ error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); +++ args->cols[i].merge_method = mm_type; +++ break; +++ } +++ if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); +++ sb = *se ? se + 1 : se; +++ } +++} ++ ++ static void rename_chrs(args_t *args, char *fname) ++ { ++@@ -1927,13 +2317,30 @@ ++ { ++ if ( !args->columns ) error("The -c option not given\n"); ++ if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); ++- if ( args->from_idx==-1 ) error("The -c POS option not given\n"); ++- if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1; ++- ++- args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx); ++- if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); ++- if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); +++ if ( args->beg_idx==-1 ) error("The -c POS option not given\n"); +++ if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); +++ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) +++ { +++ args->end_idx = -args->beg_idx - 1; +++ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); +++ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); 
+++ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); +++ } +++ else +++ { +++ if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is present. Replace END (or TO) with \"-\".\n"); +++ int len = strlen(args->targets_fname); +++ if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1; +++ else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1; +++ else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1; +++ args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args); +++ if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname); +++ args->tgt_itr = regitr_init(args->tgt_idx); +++ args->nalines++; +++ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); +++ } ++ } +++ init_merge_method(args); ++ args->vcmp = vcmp_init(); ++ ++ if ( args->filter_str ) ++@@ -1958,10 +2365,10 @@ ++ if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); ++ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); ++ if ( args->n_threads ) ++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); ++- bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -1976,6 +2383,9 @@ ++ { ++ free(args->cols[i].hdr_key_src); ++ free(args->cols[i].hdr_key_dst); +++ free(args->cols[i].mm_kstr.s); +++ if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); +++ free(args->cols[i].mm_dbl); ++ } 
++ free(args->cols); ++ for (i=0; imalines; i++) ++@@ -1985,6 +2395,11 @@ ++ free(args->alines[i].line.s); ++ } ++ free(args->alines); +++ if ( args->tgt_idx ) +++ { +++ regidx_destroy(args->tgt_idx); +++ regitr_destroy(args->tgt_itr); +++ } ++ if ( args->tgts ) bcf_sr_regions_destroy(args->tgts); ++ free(args->tmpks.s); ++ free(args->tmpi); ++@@ -2007,6 +2422,48 @@ ++ free(args->sample_map); ++ } ++ +++static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) +++{ +++ tmp->line.l = 0; +++ kputs(str, &tmp->line); +++ char *s = tmp->line.s; +++ tmp->ncols = 1; +++ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); +++ tmp->cols[0] = s; +++ while ( *s ) +++ { +++ if ( *s=='\t' ) +++ { +++ tmp->ncols++; +++ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); +++ tmp->cols[tmp->ncols-1] = s+1; +++ *s = 0; +++ } +++ s++; +++ } +++ if ( args->ref_idx != -1 ) +++ { +++ if ( args->ref_idx >= tmp->ncols ) +++ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); +++ if ( args->alt_idx >= tmp->ncols ) +++ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); +++ tmp->nals = 2; +++ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); +++ tmp->als[0] = tmp->cols[args->ref_idx]; +++ tmp->als[1] = s = tmp->cols[args->alt_idx]; +++ while ( *s ) +++ { +++ if ( *s==',' ) +++ { +++ tmp->nals++; +++ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); +++ tmp->als[tmp->nals-1] = s+1; +++ *s = 0; +++ } +++ s++; +++ } +++ } +++} ++ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos) ++ { ++ if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0; ++@@ -2037,44 +2494,9 @@ ++ tmp->rid = line->rid; ++ tmp->start = args->tgts->start; ++ tmp->end = args->tgts->end; ++- tmp->line.l = 0; ++- kputs(args->tgts->line.s, &tmp->line); ++- char *s = tmp->line.s; ++- tmp->ncols = 1; ++- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); 
++- tmp->cols[0] = s; ++- while ( *s ) ++- { ++- if ( *s=='\t' ) ++- { ++- tmp->ncols++; ++- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); ++- tmp->cols[tmp->ncols-1] = s+1; ++- *s = 0; ++- } ++- s++; ++- } +++ parse_annot_line(args, args->tgts->line.s, tmp); ++ if ( args->ref_idx != -1 ) ++ { ++- if ( args->ref_idx >= tmp->ncols ) ++- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); ++- if ( args->alt_idx >= tmp->ncols ) ++- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); ++- tmp->nals = 2; ++- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); ++- tmp->als[0] = tmp->cols[args->ref_idx]; ++- tmp->als[1] = s = tmp->cols[args->alt_idx]; ++- while ( *s ) ++- { ++- if ( *s==',' ) ++- { ++- tmp->nals++; ++- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); ++- tmp->als[tmp->nals-1] = s+1; ++- *s = 0; ++- } ++- s++; ++- } ++ int iseq = args->tgts->iseq; ++ if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break; ++ } ++@@ -2088,7 +2510,30 @@ ++ for (i=0; inrm; i++) ++ args->rm[i].handler(args, line, &args->rm[i]); ++ ++- if ( args->tgts ) +++ int has_overlap = 0; +++ +++ if ( args->tgt_idx ) +++ { +++ if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) +++ { +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ annot_line_t *tmp = &args->alines[0]; +++ tmp->rid = line->rid; +++ tmp->start = args->tgt_itr->beg; +++ tmp->end = args->tgt_itr->end; +++ parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); +++ for (j=0; jncols; j++) +++ if ( args->cols[j].setter(args,line,&args->cols[j],tmp) ) +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); +++ } +++ has_overlap = 1; +++ } +++ for (j=0; jncols; j++) +++ if ( args->cols[j].merge_method != MM_FIRST ) +++ 
args->cols[j].setter(args,line,&args->cols[j],NULL); +++ } +++ else if ( args->tgts ) ++ { ++ // Buffer annotation lines. When multiple ALT alleles are present in the ++ // annotation file, at least one must match one of the VCF alleles. ++@@ -2119,18 +2564,9 @@ ++ // there is a matching line ++ for (j=0; jncols; j++) ++ if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) ++- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); ++- ++- } ++- ++- if ( args->mark_sites ) ++- { ++- // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 ++- if ( args->mark_sites_logic==MARK_LISTED ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?1:0); ++- else ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ } +++ has_overlap = inalines ? 1 : 0; ++ } ++ else if ( args->files->nreaders == 2 ) ++ { ++@@ -2139,13 +2575,10 @@ ++ bcf1_t *aline = bcf_sr_get_line(args->files,1); ++ for (j=0; jncols; j++) ++ if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) ++- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++- if ( args->mark_sites ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); +++ has_overlap = 1; ++ } ++- else if ( args->mark_sites ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 
1 : 0); ++ } ++ if ( args->set_ids ) ++ { ++@@ -2160,6 +2593,15 @@ ++ bcf_update_id(args->hdr_out,line,args->tmpks.s); ++ } ++ } +++ +++ if ( args->mark_sites ) +++ { +++ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 +++ if ( args->mark_sites_logic==MARK_LISTED ) +++ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); +++ else +++ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1); +++ } ++ } ++ ++ static void usage(args_t *args) ++@@ -2173,10 +2615,12 @@ ++ fprintf(stderr, " --collapse matching records by , see man page for details [some]\n"); ++ fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); ++ fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); +++ fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); ++ fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); ++ fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); ++ fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); ++ fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); +++ fprintf(stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); ++ fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); ++ fprintf(stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(stderr, " -o, --output write output to a file [standard output]\n"); ++@@ -2186,6 +2630,7 @@ ++ fprintf(stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); ++ fprintf(stderr, " -s, --samples [^] 
comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); ++ fprintf(stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); +++ fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); ++ fprintf(stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); ++ fprintf(stderr, " --threads number of extra output compression threads [0]\n"); ++ fprintf(stderr, "\n"); ++@@ -2202,19 +2647,20 @@ ++ args->output_type = FT_VCF; ++ args->n_threads = 0; ++ args->record_cmd_line = 1; ++- args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; +++ args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; ++ args->set_ids_replace = 1; ++ int regions_is_file = 0, collapse = 0; ++ ++ static struct option loptions[] = ++ { ++- {"keep-sites",required_argument,NULL,'k'}, +++ {"keep-sites",no_argument,NULL,'k'}, ++ {"mark-sites",required_argument,NULL,'m'}, ++ {"set-id",required_argument,NULL,'I'}, ++ {"output",required_argument,NULL,'o'}, ++ {"output-type",required_argument,NULL,'O'}, ++ {"threads",required_argument,NULL,9}, ++ {"annotations",required_argument,NULL,'a'}, +++ {"merge-logic",required_argument,NULL,'l'}, ++ {"collapse",required_argument,NULL,2}, ++ {"include",required_argument,NULL,'i'}, ++ {"exclude",required_argument,NULL,'e'}, ++@@ -2226,12 +2672,15 @@ ++ {"header-lines",required_argument,NULL,'h'}, ++ {"samples",required_argument,NULL,'s'}, ++ {"samples-file",required_argument,NULL,'S'}, +++ {"single-overlaps",no_argument,NULL,10}, ++ {"no-version",no_argument,NULL,8}, +++ {"force",no_argument,NULL,'f'}, ++ {NULL,0,NULL,0} ++ }; ++- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, 
"h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) ++ { ++ switch (c) { +++ case 'f': args->force = 1; break; ++ case 'k': args->keep_sites = 1; break; ++ case 'm': ++ args->mark_sites_logic = MARK_LISTED; ++@@ -2239,6 +2688,7 @@ ++ else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } ++ else args->mark_sites = optarg; ++ break; +++ case 'l': args->merge_method_str = optarg; break; ++ case 'I': args->set_ids_fmt = optarg; break; ++ case 's': args->sample_names = optarg; break; ++ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; ++@@ -2273,6 +2723,7 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 10 : args->single_overlaps = 1; break; ++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++@@ -2294,6 +2745,7 @@ ++ if ( args->targets_fname ) ++ { ++ htsFile *fp = hts_open(args->targets_fname,"r"); +++ if ( !fp ) error("Failed to open %s\n", args->targets_fname); ++ htsFormat type = *hts_get_format(fp); ++ hts_close(fp); ++ ++@@ -2305,26 +2757,40 @@ ++ } ++ } ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ +++ static int line_errcode_warned = 0; ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++ { ++ if ( !bcf_sr_has_line(args->files,0) ) continue; ++ bcf1_t *line = bcf_sr_get_line(args->files,0); ++- if ( line->errcode ) error("Encountered error, cannot proceed. Please check the error output above.\n"); +++ if ( line->errcode ) +++ { +++ if ( !args->force ) +++ error("Encountered an error, cannot proceed. 
Please check the error output above.\n" +++ "If feeling adventurous, use the --force option. (At your own risk!)\n"); +++ else if ( !line_errcode_warned ) +++ { +++ fprintf(stderr, +++ "Warning: Encountered an error, proceeding only because --force was given.\n" +++ " Note that this can result in a segfault or a silent corruption of the output file!\n"); +++ line_errcode_warned = 1; +++ line->errcode = 0; +++ } +++ } ++ if ( args->filter ) ++ { ++ int pass = filter_test(args->filter, line, NULL); ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) ++ { ++- if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); ++ continue; ++ } ++ } ++ annotate(args, line); ++- bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); ++ } ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++--- python-pysam.orig/bcftools/vcfannotate.c.pysam.c +++++ python-pysam/bcftools/vcfannotate.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* vcfannotate.c -- Annotate and edit VCF/BCF files. ++ ++- Copyright (C) 2013-2018 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -35,16 +35,17 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++-#include ++ #include "bcftools.h" ++ #include "vcmp.h" ++ #include "filter.h" ++ #include "convert.h" ++ #include "smpl_ilist.h" +++#include "regidx.h" ++ ++ struct _args_t; ++ ++@@ -67,15 +68,30 @@ ++ } ++ annot_line_t; ++ ++-#define REPLACE_MISSING 0 // replace only missing values ++-#define REPLACE_ALL 1 // replace both missing and existing values ++-#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing ++-#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +++#define REPLACE_MISSING 0 // replace only missing values +++#define REPLACE_ALL 1 // replace both missing and existing values +++#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing +++#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise +++#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest +++#define MM_APPEND 1 // append, possibly multiple times +++#define MM_UNIQUE 2 // append, only unique values +++#define MM_SUM 3 +++#define MM_AVG 4 +++#define MM_MIN 5 +++#define MM_MAX 6 ++ typedef struct _annot_col_t ++ { ++ int icol, replace, number; // number: one of BCF_VL_* types ++ char *hdr_key_src, *hdr_key_dst; ++ int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); +++ int merge_method; // one of the MM_* defines +++ khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values +++ kstring_t mm_kstr; +++ double +++ mm_dbl_nalloc, // the allocated size --merge-logic values array +++ mm_dbl_nused, // the number of used elements in the mm_dbl array +++ mm_dbl_ndat, // the number of merged rows (for calculating the average) +++ *mm_dbl; ++ } ++ annot_col_t; ++ ++@@ -94,6 +110,10 @@ ++ int output_type, n_threads; ++ bcf_sr_regions_t *tgts; ++ +++ regidx_t *tgt_idx; 
+++ regitr_t *tgt_itr; +++ int tgt_is_bed; +++ ++ filter_t *filter; ++ char *filter_str; ++ int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE ++@@ -106,7 +126,7 @@ ++ vcmp_t *vcmp; // for matching annotation and VCF lines by allele ++ annot_line_t *alines; // buffered annotation lines ++ int nalines, malines; ++- int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present +++ int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present ++ annot_col_t *cols; // column indexes and setters ++ int ncols; ++ ++@@ -127,18 +147,40 @@ ++ ++ char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; ++ char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; ++- int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; +++ char *merge_method_str; +++ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; ++ } ++ args_t; ++ ++ char *msprintf(const char *fmt, ...); ++ +++int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +++{ +++ args_t *args = (args_t*) usr; +++ int ret = args->tgt_is_bed ? 
regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL); +++ if ( ret<0 ) return ret; +++ *((char **)payload) = strdup(line); +++ return 0; +++} +++void free_payload(void *payload) +++{ +++ char *str = *((char**)payload); +++ free(str); +++} +++ ++ void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag) ++ { ++ bcf_update_id(args->hdr,line,NULL); ++ } ++ void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag) ++ { +++ if ( tag->key && tag->hdr_id<0 ) +++ { +++ error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n" +++ " Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n" +++ " Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n" +++ ); +++ } ++ if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass); ++ else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass); ++ } ++@@ -225,7 +267,10 @@ ++ memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); ++ bcf_hrec_destroy(hrec); ++ } ++- if ( nrm ) bcf_hdr_sync(hdr); +++ if ( nrm ) { +++ if (bcf_hdr_sync(hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); +++ } ++ } ++ ++ static void init_remove_annots(args_t *args) ++@@ -266,8 +311,14 @@ ++ tag->handler = remove_filter; ++ tag->key = strdup(str.s); ++ tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); ++- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); ++- if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); +++ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) +++ { +++ if ( args->keep_sites ) +++ error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s); +++ else +++ fprintf(bcftools_stderr,"Warning: The filter \"%s\" is not defined in the header\n", 
str.s); +++ } +++ else if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); ++ } ++ else ++ { ++@@ -282,8 +333,14 @@ ++ int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); ++ if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) ++ { ++- fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); ++- args->nrm--; +++ if ( args->keep_sites ) +++ error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s); +++ else +++ fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); +++ +++ tag->key = strdup(str.s); +++ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; +++ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; ++ } ++ else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) ++ { ++@@ -366,7 +423,8 @@ ++ } ++ khash_str2int_destroy_free(keep); ++ if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ static void init_header_lines(args_t *args) ++ { ++@@ -378,13 +436,17 @@ ++ if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); ++ bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) ++ } ++- hts_close(file); +++ if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); ++ free(str.s); ++- bcf_hdr_sync(args->hdr_out); ++- bcf_hdr_sync(args->hdr); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update output header", __func__); +++ if (bcf_hdr_sync(args->hdr) < 0) +++ error_errno("[%s] Failed to update input header", __func__); ++ } ++ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); +++ ++ // note: so far this works only with one filter, not a list of filters ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." ++@@ -434,6 +496,8 @@ ++ } ++ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); +++ ++ // possible cases: ++ // IN ANNOT OUT ACHIEVED_BY ++ // x y x -c +ID ++@@ -495,6 +559,8 @@ ++ } ++ static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ char *str = tab->cols[col->icol]; ++ if ( str[0]=='.' && str[1]==0 ) return 0; // empty ++@@ -503,7 +569,7 @@ ++ ++ line->qual = strtod(str, &str); ++ if ( str == tab->cols[col->icol] ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ return 0; ++ } ++ static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++@@ -516,13 +582,15 @@ ++ } ++ static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ char *str = tab->cols[col->icol]; ++ if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); ++ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ return -1; ++ } ++ static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++@@ -535,13 +603,13 @@ ++ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) ++ { ++ if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", 
ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); ++@@ -567,19 +635,75 @@ ++ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ annot_line_t *tab = (annot_line_t*) data; ++- char *str = tab->cols[col->icol], *end = str; ++- if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++- int ntmpi = 0; ++- while ( *end ) +++ if ( !tab ) +++ { +++ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) +++ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); +++ } +++ +++ int i,ntmpi = 0; +++ if ( tab ) +++ { +++ char *str = tab->cols[col->icol], *end = str; +++ if ( str[0]=='.' && str[1]==0 ) return 0; +++ +++ while ( *end ) +++ { +++ int val = strtol(str, &end, 10); +++ if ( end==str ) +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); +++ ntmpi++; +++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); +++ args->tmpi[ntmpi-1] = val; +++ str = end+1; +++ } +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( !col->mm_dbl_nused ) +++ { +++ col->mm_dbl_nused = ntmpi; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i] = args->tmpi[i]; +++ } +++ else +++ { +++ if ( col->merge_method==MM_APPEND ) +++ { +++ int nori = col->mm_dbl_nused; +++ col->mm_dbl_nused += ntmpi; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i+nori] = args->tmpi[i]; +++ } +++ else +++ { +++ if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); +++ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) +++ for (i=0; imm_dbl[i] += args->tmpi[i]; +++ else if ( col->merge_method==MM_MIN ) +++ for (i=0; imm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } +++ else if ( col->merge_method==MM_MAX ) +++ for (i=0; imm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } +++ } +++ } +++ col->mm_dbl_ndat++; +++ } +++ } +++ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) +++ { +++ ntmpi = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); +++ for (i=0; itmpi[i] = col->mm_dbl[i]; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; +++ } +++ else if ( col->merge_method==MM_AVG ) ++ { ++- int val = strtol(str, &end, 10); ++- if ( end==str ) ++- error("Could not parse %s at %s:%d .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); ++- ntmpi++; +++ ntmpi = col->mm_dbl_nused; ++ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); ++- args->tmpi[ntmpi-1] = val; ++- str = end+1; +++ for (i=0; itmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; ++ } ++ ++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++@@ -615,13 +739,13 @@ ++ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) ++ { ++ if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); ++@@ -647,19 +771,75 @@ ++ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ annot_line_t *tab = (annot_line_t*) data; ++- char *str = tab->cols[col->icol], *end = str; ++- if ( str[0]=='.' && str[1]==0 ) return 0; ++ ++- int ntmpf = 0; ++- while ( *end ) +++ if ( !tab ) ++ { ++- double val = strtod(str, &end); ++- if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); ++- ntmpf++; ++- hts_expand(float,ntmpf,args->mtmpf,args->tmpf); ++- args->tmpf[ntmpf-1] = val; ++- str = end+1; +++ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) +++ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); +++ } +++ +++ int i,ntmpf = 0; +++ if ( tab ) +++ { +++ char *str = tab->cols[col->icol], *end = str; +++ if ( str[0]=='.' && str[1]==0 ) return 0; +++ +++ while ( *end ) +++ { +++ double val = strtod(str, &end); +++ if ( end==str ) +++ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); +++ ntmpf++; +++ hts_expand(float,ntmpf,args->mtmpf,args->tmpf); +++ args->tmpf[ntmpf-1] = val; +++ str = end+1; +++ } +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( !col->mm_dbl_nused ) +++ { +++ col->mm_dbl_nused = ntmpf; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i] = args->tmpf[i]; +++ } +++ else +++ { +++ if ( col->merge_method==MM_APPEND ) +++ { +++ int nori = col->mm_dbl_nused; +++ col->mm_dbl_nused += ntmpf; +++ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); +++ for (i=0; imm_dbl[i+nori] = args->tmpf[i]; +++ } +++ else +++ { +++ if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); +++ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) +++ for (i=0; imm_dbl[i] += args->tmpf[i]; +++ else if ( col->merge_method==MM_MIN ) +++ for (i=0; imm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } +++ else if ( col->merge_method==MM_MAX ) +++ for (i=0; imm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } +++ } +++ } +++ col->mm_dbl_ndat++; +++ } +++ } +++ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) +++ { +++ ntmpf = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); +++ for (i=0; itmpf[i] = col->mm_dbl[i]; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; +++ } +++ else if ( col->merge_method==MM_AVG ) +++ { +++ ntmpf = col->mm_dbl_nused; +++ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); +++ for (i=0; itmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat; +++ col->mm_dbl_nused = col->mm_dbl_ndat = 0; ++ } ++ ++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++@@ -695,6 +875,8 @@ ++ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c ++ static int setter_ARinfo_string(args_t *args, 
bcf1_t *line, annot_col_t *col, int nals, char **als) ++ { +++ assert( col->merge_method==MM_FIRST ); +++ ++ int nsrc = 1, lsrc = 0; ++ while ( args->tmps[lsrc] ) ++ { ++@@ -702,13 +884,13 @@ ++ lsrc++; ++ } ++ if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) ++- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; ++ int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); ++- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ // fill in any missing values in the target VCF (or all, if not present) ++ int i, empty = 0, nstr, mstr = args->tmpks.m; ++@@ -748,22 +930,76 @@ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); ++ return 0; ++ } +++void khash_str2int_clear_free(void *_hash) +++{ +++ khash_t(str2int) *hash = (khash_t(str2int)*)_hash; +++ khint_t k; +++ if (hash == 0) return; +++ for (k = 0; k < kh_end(hash); ++k) +++ if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); +++ kh_clear(str2int, hash); +++} ++ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R ) +++ { +++ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); +++ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; +++ } +++ ++ annot_line_t *tab = (annot_line_t*) data; ++- int len = strlen(tab->cols[col->icol]); ++- if ( !len ) return 0; ++- hts_expand(char,len+1,args->mtmps,args->tmps); ++- memcpy(args->tmps,tab->cols[col->icol],len+1); ++- if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0; +++ +++ int len = 0; +++ if ( tab ) +++ { +++ len = strlen(tab->cols[col->icol]); +++ if ( !len ) return 0; +++ if ( len==1 && tab->cols[col->icol][0]=='.' 
) return 0; +++ } ++ ++- if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) ++- return setter_ARinfo_string(args,line,col,tab->nals,tab->als); +++ if ( col->merge_method!=MM_FIRST ) +++ { +++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) +++ error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n"); ++ ++- if ( col->replace==REPLACE_MISSING ) +++ if ( data ) +++ { +++ assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); +++ if ( col->merge_method==MM_UNIQUE ) +++ { +++ if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); +++ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; +++ khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); +++ } +++ +++ if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); +++ kputs(tab->cols[col->icol], &col->mm_kstr); +++ return 0; +++ } +++ +++ if ( col->mm_kstr.l ) +++ { +++ hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps); +++ memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1); +++ } +++ else +++ return 0; +++ +++ if ( !data ) // flush the line +++ { +++ if ( col->merge_method==MM_UNIQUE ) +++ khash_str2int_clear_free(col->mm_str_hash); +++ col->mm_kstr.l = 0; +++ } +++ } +++ else ++ { ++- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); ++- if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; +++ assert(tab); +++ hts_expand(char,len+1,args->mtmps,args->tmps); +++ memcpy(args->tmps,tab->cols[col->icol],len+1); +++ +++ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) +++ return setter_ARinfo_string(args,line,col,tab->nals,tab->als); ++ } ++ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); ++@@ -787,6 +1023,48 @@ ++ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); ++ return 0; ++ } +++static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) +++{ +++ int i, isrc, idst; +++ int blen = nsrc1 > 1 ? nsrc1 + 1 : 1; // typically the genotypes take three bytes 0/1, no 0-termination is needed +++ +++gt_length_too_big: +++ str->l = 0; +++ for (idst=0; idstsample_map ? args->sample_map[idst] : idst; +++ if ( isrc==-1 ) +++ { +++ kputc_('.', str); +++ for (i=1; i < blen; i++) kputc_(0, str); +++ continue; +++ } +++ +++ size_t plen = str->l; +++ int32_t *ptr = src + isrc*nsrc1; +++ for (i=0; il - plen > blen ) +++ { +++ // too many alternate alleles or ploidy is too large, the genotype does not fit +++ // three characters ("0/0" vs "10/10"). +++ blen *= 2; +++ goto gt_length_too_big; +++ } +++ plen = str->l - plen; +++ while ( plen < blen ) +++ { +++ kputc_(0, str); +++ plen++; +++ } +++ } +++ return 0; +++} ++ static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { ++ bcf1_t *rec = (bcf1_t*) data; ++@@ -794,6 +1072,16 @@ ++ if ( nsrc==-3 ) return 0; // the tag is not present ++ if ( nsrc<=0 ) return 1; // error ++ +++ // Genotypes are internally represented as integers. 
This is a complication when +++ // adding as a different Type=String field, such as FMT/newGT:=GT +++ if ( strcmp(col->hdr_key_src,col->hdr_key_dst) ) +++ { +++ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); +++ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); +++ genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks); +++ return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l); +++ } +++ ++ if ( !args->sample_map ) ++ return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc); ++ ++@@ -1059,9 +1347,11 @@ ++ } ++ static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); ++ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); ++ ++@@ -1084,7 +1374,7 @@ ++ char *end = str; ++ ptr[ival] = strtol(str, &end, 10); ++ if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ ++ ival++; ++ str = *end ? 
end+1 : end; ++@@ -1096,9 +1386,11 @@ ++ } ++ static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); ++ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); ++ ++@@ -1122,7 +1414,7 @@ ++ char *end = str; ++ ptr[ival] = strtod(str, &end); ++ if ( end==str ) ++- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); +++ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ++ ++ ival++; ++ str = *end ? end+1 : end; ++@@ -1134,9 +1426,11 @@ ++ } ++ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) ++ { +++ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); +++ ++ annot_line_t *tab = (annot_line_t*) data; ++ if ( col->icol+args->nsmpl_annot > tab->ncols ) ++- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ismpl; ++ for (ismpl=0; ismplnsmpl_annot; ismpl++) ++@@ -1188,7 +1482,7 @@ ++ // create mapping from src to dst genotypes, haploid and diploid version ++ int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? 
rec->n_allele : rec->n_allele - 1; ++ int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); ++- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int i, j; ++ if ( rec->n_allele==line->n_allele ) ++@@ -1228,15 +1522,15 @@ ++ } ++ int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); ++ if ( pld_src<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); ++ int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); ++ if ( pld_dst<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; ++ if ( ndst1_new != ndst1 ) ++ { ++- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); +++ if ( ndst1 ) error("todo: %s ndst1!=ndst .. 
%d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ndst1 = ndst1_new; ++ hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); ++ } ++@@ -1256,7 +1550,7 @@ ++ if ( col->number==BCF_VL_G ) ++ { ++ if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) ++- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ if ( !args->dst_smpl_pld[i] ) ++ for (j=0; jfiles->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); ++ if ( nsrc==-3 ) return 0; // the tag is not present ++@@ -1296,7 +1589,7 @@ ++ // create mapping from src to dst genotypes, haploid and diploid version ++ int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; ++ int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); ++- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int i, j; ++ if ( rec->n_allele==line->n_allele ) ++@@ -1336,15 +1629,15 @@ ++ } ++ int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); ++ if ( pld_src<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); ++ int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); ++ if ( 
pld_dst<0 ) ++- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); +++ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++ int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; ++ if ( ndst1_new != ndst1 ) ++ { ++- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); +++ if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ndst1 = ndst1_new; ++ hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); ++ } ++@@ -1364,7 +1657,7 @@ ++ if ( col->number==BCF_VL_G ) ++ { ++ if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) ++- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ if ( !args->dst_smpl_pld[i] ) ++ for (j=0; jtmps = args->tmpp[0]; // tmps might be realloced ++ if ( ret==-3 ) return 0; // the tag is not present ++ if ( ret<=0 ) return 1; // error ++- return core_setter_format_str(args,line,col,args->tmpp); +++ if ( strcmp("GT",col->hdr_key_dst) ) +++ return core_setter_format_str(args,line,col,args->tmpp); +++ +++ // Genotypes are internally represented as integers. 
This is a complication for FMT/GT:=oldGT +++ // First determine the maximum number of alleles per-sample ndst1 +++ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); +++ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); +++ int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src; +++ char *ptr = args->tmps, *ptr_end = ptr + ret; +++ while ( ptr < ptr_end ) +++ { +++ char *smpl_end = ptr + nsrc1; +++ int n = 1; +++ while ( ptr < smpl_end ) +++ { +++ if ( *ptr=='/' || *ptr=='|' ) n++; +++ ptr++; +++ } +++ if ( ndst1 < n ) ndst1 = n; +++ } +++ assert( ndst1 ); +++ +++ int ndst = ndst1*nsmpl_dst; +++ hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); +++ hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated +++ for (idst=0; idsttmpi + idst*ndst1; +++ isrc = args->sample_map ? args->sample_map[idst] : idst; +++ if ( isrc==-1 ) +++ { +++ dst[0] = bcf_gt_missing; +++ for (i=1; itmps + isrc*nsrc1, *tmp; +++ char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0; +++ while ( *beg ) +++ { +++ char *end = beg; +++ while ( *end && *end!='/' && *end!='|' ) end++; +++ if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing; +++ else +++ { +++ if ( *end=='|' ) is_phased = 1; +++ dst[i] = strtol(beg, &tmp, 10); +++ if ( tmp!=end ) +++ error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname); +++ if ( dst[i] >= line->n_allele ) +++ error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1); +++ dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]); +++ } +++ beg = *end ? 
end+1 : end; +++ i++; +++ } +++ *keep_ptr = keep; +++ for (; ihdr_out,line,args->tmpi,ndst); ++ } ++ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) ++ { ++@@ -1448,62 +1800,25 @@ ++ args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); ++ for (i=0; insample_map; i++) args->sample_map[i] = -1; ++ ++- // possible todo: could do with smpl_ilist only ++- smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT); ++- if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); ++- char **samples = (char**) malloc(sizeof(char*)*ilist->n); ++- for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); +++ int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file +++ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src +++ if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); ++ args->nsmpl_annot = ilist->n; ++- smpl_ilist_destroy(ilist); ++ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; ++- if ( !src ) +++ for (i=0; insmpl_annot; i++) ++ { ++- // tab annotation file ++- for (i=0; insmpl_annot; i++) +++ int idst = ilist->idx[i]; +++ const char *src_name = ilist->pair && ilist->pair[i] ? 
ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst); +++ int isrc = i; +++ if ( src ) // the annotation file is a VCF, not a tab-delimited file ++ { ++- int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); ++- args->sample_map[idst] = i; ++- if ( idst!=i ) need_sample_map = 1; ++- } ++- } ++- else ++- { ++- // vcf annotation file ++- for (i=0; insmpl_annot; i++) ++- { ++- int isrc, idst; ++- char *ss = samples[i], *se = samples[i]; ++- while ( *se && !isspace(*se) ) se++; ++- if ( !*se ) ++- { ++- // only one sample name ++- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); ++- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); ++- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); ++- args->sample_map[idst] = isrc; ++- if ( idst!=isrc ) need_sample_map = 1; ++- continue; ++- } ++- *se = 0; ++- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); ++- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); ++- ++- ss = se+1; ++- while ( isspace(*ss) ) ss++; ++- se = ss; ++- while ( *se && !isspace(*se) ) se++; ++- ++- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); ++- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); ++- ++- args->sample_map[idst] = isrc; ++- if ( idst!=isrc ) need_sample_map = 1; +++ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name); +++ if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name); ++ } +++ if ( isrc!=idst ) need_sample_map = 1; +++ args->sample_map[idst] = isrc; ++ } ++- for (i=0; insmpl_annot; i++) free(samples[i]); ++- free(samples); +++ smpl_ilist_destroy(ilist); ++ return need_sample_map; ++ } ++ static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) ++@@ -1607,9 +1922,9 @@ ++ kputsn(ss, se-ss, &str); ++ if ( !str.s[0] || 
!strcasecmp("-",str.s) ) ; ++ else if ( !strcasecmp("CHROM",str.s) ) args->chr_idx = icol; ++- else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; ++- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; ++- else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; +++ else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol; +++ else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol; +++ else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol; ++ else if ( !strcasecmp("REF",str.s) ) ++ { ++ if ( args->tgts_is_vcf ) ++@@ -1669,7 +1984,8 @@ ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++ } ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ } ++ else if ( !strcasecmp("QUAL",str.s) ) ++@@ -1700,7 +2016,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); ++ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); ++ annot_col_t *col = &args->cols[args->ncols-1]; ++@@ -1734,7 +2051,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format(hrec, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); ++ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); ++ annot_col_t *col = &args->cols[args->ncols-1]; ++@@ -1776,7 +2094,8 @@ ++ tmp.l = 0; ++ bcf_hrec_format_rename(hrec, key_dst, &tmp); ++ bcf_hdr_append(args->hdr_out, tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if 
(bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); ++ if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) ++@@ -1813,13 +2132,30 @@ ++ { ++ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); ++ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); ++- char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; +++ int explicit_info = 0; +++ char *key_dst; +++ if ( !strncasecmp("INFO/",str.s,5) ) +++ { +++ key_dst = str.s + 5; +++ explicit_info = 1; +++ } +++ else +++ key_dst = str.s; ++ char *key_src = strstr(key_dst,":="); ++ if ( key_src ) ++ { ++ *key_src = 0; ++ key_src += 2; ++- if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; +++ if ( !strncasecmp("INFO/",key_src,5) ) +++ { +++ key_src += 5; +++ explicit_info = 1; +++ } +++ else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) +++ { +++ key_src[-2] = ':'; +++ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); +++ } ++ } ++ else ++ key_src = key_dst; ++@@ -1829,11 +2165,18 @@ ++ if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line ++ { ++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); ++- if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); +++ if ( !hrec ) +++ { +++ if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) +++ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); +++ fprintf(bcftools_stderr,"[%s] %d\n",key_src,explicit_info); +++ error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); +++ } ++ tmp.l = 0; ++ bcf_hrec_format_rename(hrec, key_dst, &tmp); ++ bcf_hdr_append(args->hdr_out, 
tmp.s); ++- bcf_hdr_sync(args->hdr_out); +++ if (bcf_hdr_sync(args->hdr_out) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); ++ } ++ else ++@@ -1862,7 +2205,6 @@ ++ } ++ free(str.s); ++ free(tmp.s); ++- if ( args->to_idx==-1 ) args->to_idx = args->from_idx; ++ free(args->columns); ++ if ( skip_info ) khash_str2int_destroy_free(skip_info); ++ if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); ++@@ -1881,6 +2223,54 @@ ++ else if ( sample_map_ok<0 ) ++ error("No matching samples in source and destination file?\n"); ++ } +++static void init_merge_method(args_t *args) +++{ +++ int i; +++ for (i=0; incols; i++) +++ { +++ args->cols[i].merge_method = MM_FIRST; +++ args->cols[i].mm_str_hash = NULL; +++ args->cols[i].mm_dbl = NULL; +++ args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; +++ memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); +++ } +++ if ( !args->merge_method_str ) return; +++ if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); +++ if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); +++ char *sb = args->merge_method_str; +++ while ( *sb ) +++ { +++ char *se = sb; +++ while ( *se && *se!=',' ) se++; +++ args->tmpks.l = 0; +++ kputsn(sb, se-sb, &args->tmpks); +++ kputc(0, &args->tmpks); +++ char *mm_type_str = args->tmpks.s + args->tmpks.l; +++ while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; +++ if ( *mm_type_str!=':' ) +++ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); +++ *mm_type_str = 0; +++ mm_type_str++; +++ int mm_type = MM_FIRST; +++ if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; +++ else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND; +++ else if ( !strcasecmp("sum",mm_type_str) ) mm_type = 
MM_SUM; +++ else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; +++ else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; +++ else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; +++ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); +++ for (i=0; incols; i++) +++ { +++ if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; +++ if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) +++ error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); +++ args->cols[i].merge_method = mm_type; +++ break; +++ } +++ if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); +++ sb = *se ? se + 1 : se; +++ } +++} ++ ++ static void rename_chrs(args_t *args, char *fname) ++ { ++@@ -1929,13 +2319,30 @@ ++ { ++ if ( !args->columns ) error("The -c option not given\n"); ++ if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); ++- if ( args->from_idx==-1 ) error("The -c POS option not given\n"); ++- if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1; ++- ++- args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx); ++- if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); ++- if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); +++ if ( args->beg_idx==-1 ) error("The -c POS option not given\n"); +++ if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); +++ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) +++ { +++ args->end_idx = -args->beg_idx - 1; +++ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); +++ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", 
args->targets_fname); +++ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); +++ } +++ else +++ { +++ if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is present. Replace END (or TO) with \"-\".\n"); +++ int len = strlen(args->targets_fname); +++ if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1; +++ else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1; +++ else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1; +++ args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args); +++ if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname); +++ args->tgt_itr = regitr_init(args->tgt_idx); +++ args->nalines++; +++ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); +++ } ++ } +++ init_merge_method(args); ++ args->vcmp = vcmp_init(); ++ ++ if ( args->filter_str ) ++@@ -1960,10 +2367,10 @@ ++ if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); ++ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); ++ if ( args->n_threads ) ++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); ++- bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -1978,6 +2385,9 @@ ++ { ++ free(args->cols[i].hdr_key_src); ++ free(args->cols[i].hdr_key_dst); +++ free(args->cols[i].mm_kstr.s); +++ if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); +++ 
free(args->cols[i].mm_dbl); ++ } ++ free(args->cols); ++ for (i=0; imalines; i++) ++@@ -1987,6 +2397,11 @@ ++ free(args->alines[i].line.s); ++ } ++ free(args->alines); +++ if ( args->tgt_idx ) +++ { +++ regidx_destroy(args->tgt_idx); +++ regitr_destroy(args->tgt_itr); +++ } ++ if ( args->tgts ) bcf_sr_regions_destroy(args->tgts); ++ free(args->tmpks.s); ++ free(args->tmpi); ++@@ -2009,6 +2424,48 @@ ++ free(args->sample_map); ++ } ++ +++static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) +++{ +++ tmp->line.l = 0; +++ kputs(str, &tmp->line); +++ char *s = tmp->line.s; +++ tmp->ncols = 1; +++ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); +++ tmp->cols[0] = s; +++ while ( *s ) +++ { +++ if ( *s=='\t' ) +++ { +++ tmp->ncols++; +++ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); +++ tmp->cols[tmp->ncols-1] = s+1; +++ *s = 0; +++ } +++ s++; +++ } +++ if ( args->ref_idx != -1 ) +++ { +++ if ( args->ref_idx >= tmp->ncols ) +++ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); +++ if ( args->alt_idx >= tmp->ncols ) +++ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); +++ tmp->nals = 2; +++ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); +++ tmp->als[0] = tmp->cols[args->ref_idx]; +++ tmp->als[1] = s = tmp->cols[args->alt_idx]; +++ while ( *s ) +++ { +++ if ( *s==',' ) +++ { +++ tmp->nals++; +++ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); +++ tmp->als[tmp->nals-1] = s+1; +++ *s = 0; +++ } +++ s++; +++ } +++ } +++} ++ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos) ++ { ++ if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0; ++@@ -2039,44 +2496,9 @@ ++ tmp->rid = line->rid; ++ tmp->start = args->tgts->start; ++ tmp->end = args->tgts->end; ++- tmp->line.l = 0; ++- kputs(args->tgts->line.s, &tmp->line); ++- char *s = tmp->line.s; ++- tmp->ncols = 1; ++- 
hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); ++- tmp->cols[0] = s; ++- while ( *s ) ++- { ++- if ( *s=='\t' ) ++- { ++- tmp->ncols++; ++- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); ++- tmp->cols[tmp->ncols-1] = s+1; ++- *s = 0; ++- } ++- s++; ++- } +++ parse_annot_line(args, args->tgts->line.s, tmp); ++ if ( args->ref_idx != -1 ) ++ { ++- if ( args->ref_idx >= tmp->ncols ) ++- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); ++- if ( args->alt_idx >= tmp->ncols ) ++- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); ++- tmp->nals = 2; ++- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); ++- tmp->als[0] = tmp->cols[args->ref_idx]; ++- tmp->als[1] = s = tmp->cols[args->alt_idx]; ++- while ( *s ) ++- { ++- if ( *s==',' ) ++- { ++- tmp->nals++; ++- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); ++- tmp->als[tmp->nals-1] = s+1; ++- *s = 0; ++- } ++- s++; ++- } ++ int iseq = args->tgts->iseq; ++ if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break; ++ } ++@@ -2090,7 +2512,30 @@ ++ for (i=0; inrm; i++) ++ args->rm[i].handler(args, line, &args->rm[i]); ++ ++- if ( args->tgts ) +++ int has_overlap = 0; +++ +++ if ( args->tgt_idx ) +++ { +++ if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) +++ { +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ annot_line_t *tmp = &args->alines[0]; +++ tmp->rid = line->rid; +++ tmp->start = args->tgt_itr->beg; +++ tmp->end = args->tgt_itr->end; +++ parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); +++ for (j=0; jncols; j++) +++ if ( args->cols[j].setter(args,line,&args->cols[j],tmp) ) +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); +++ } +++ has_overlap = 1; +++ } +++ for (j=0; jncols; j++) +++ if ( 
args->cols[j].merge_method != MM_FIRST ) +++ args->cols[j].setter(args,line,&args->cols[j],NULL); +++ } +++ else if ( args->tgts ) ++ { ++ // Buffer annotation lines. When multiple ALT alleles are present in the ++ // annotation file, at least one must match one of the VCF alleles. ++@@ -2121,18 +2566,9 @@ ++ // there is a matching line ++ for (j=0; jncols; j++) ++ if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) ++- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); ++- ++- } ++- ++- if ( args->mark_sites ) ++- { ++- // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 ++- if ( args->mark_sites_logic==MARK_LISTED ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?1:0); ++- else ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ } +++ has_overlap = inalines ? 1 : 0; ++ } ++ else if ( args->files->nreaders == 2 ) ++ { ++@@ -2141,13 +2577,10 @@ ++ bcf1_t *aline = bcf_sr_get_line(args->files,1); ++ for (j=0; jncols; j++) ++ if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) ++- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); +++ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ ++- if ( args->mark_sites ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); +++ has_overlap = 1; ++ } ++- else if ( args->mark_sites ) ++- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 
1 : 0); ++ } ++ if ( args->set_ids ) ++ { ++@@ -2162,6 +2595,15 @@ ++ bcf_update_id(args->hdr_out,line,args->tmpks.s); ++ } ++ } +++ +++ if ( args->mark_sites ) +++ { +++ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 +++ if ( args->mark_sites_logic==MARK_LISTED ) +++ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); +++ else +++ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1); +++ } ++ } ++ ++ static void usage(args_t *args) ++@@ -2175,10 +2617,12 @@ ++ fprintf(bcftools_stderr, " --collapse matching records by , see man page for details [some]\n"); ++ fprintf(bcftools_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); ++ fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); +++ fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n"); ++ fprintf(bcftools_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); ++ fprintf(bcftools_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); ++ fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); ++ fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); +++ fprintf(bcftools_stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); ++ fprintf(bcftools_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); ++ fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); ++@@ -2188,6 +2632,7 @@ ++ 
fprintf(bcftools_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); ++ fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); ++ fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); +++ fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); ++ fprintf(bcftools_stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); ++ fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++@@ -2204,19 +2649,20 @@ ++ args->output_type = FT_VCF; ++ args->n_threads = 0; ++ args->record_cmd_line = 1; ++- args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; +++ args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; ++ args->set_ids_replace = 1; ++ int regions_is_file = 0, collapse = 0; ++ ++ static struct option loptions[] = ++ { ++- {"keep-sites",required_argument,NULL,'k'}, +++ {"keep-sites",no_argument,NULL,'k'}, ++ {"mark-sites",required_argument,NULL,'m'}, ++ {"set-id",required_argument,NULL,'I'}, ++ {"output",required_argument,NULL,'o'}, ++ {"output-type",required_argument,NULL,'O'}, ++ {"threads",required_argument,NULL,9}, ++ {"annotations",required_argument,NULL,'a'}, +++ {"merge-logic",required_argument,NULL,'l'}, ++ {"collapse",required_argument,NULL,2}, ++ {"include",required_argument,NULL,'i'}, ++ {"exclude",required_argument,NULL,'e'}, ++@@ -2228,12 +2674,15 @@ ++ {"header-lines",required_argument,NULL,'h'}, ++ {"samples",required_argument,NULL,'s'}, ++ {"samples-file",required_argument,NULL,'S'}, +++ {"single-overlaps",no_argument,NULL,10}, ++ {"no-version",no_argument,NULL,8}, +++ {"force",no_argument,NULL,'f'}, 
++ {NULL,0,NULL,0} ++ }; ++- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) ++ { ++ switch (c) { +++ case 'f': args->force = 1; break; ++ case 'k': args->keep_sites = 1; break; ++ case 'm': ++ args->mark_sites_logic = MARK_LISTED; ++@@ -2241,6 +2690,7 @@ ++ else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } ++ else args->mark_sites = optarg; ++ break; +++ case 'l': args->merge_method_str = optarg; break; ++ case 'I': args->set_ids_fmt = optarg; break; ++ case 's': args->sample_names = optarg; break; ++ case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; ++@@ -2275,6 +2725,7 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 10 : args->single_overlaps = 1; break; ++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++@@ -2296,6 +2747,7 @@ ++ if ( args->targets_fname ) ++ { ++ htsFile *fp = hts_open(args->targets_fname,"r"); +++ if ( !fp ) error("Failed to open %s\n", args->targets_fname); ++ htsFormat type = *hts_get_format(fp); ++ hts_close(fp); ++ ++@@ -2307,26 +2759,40 @@ ++ } ++ } ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ +++ static int line_errcode_warned = 0; ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++ { ++ if ( !bcf_sr_has_line(args->files,0) ) continue; ++ bcf1_t *line = bcf_sr_get_line(args->files,0); ++- if ( line->errcode ) error("Encountered error, cannot proceed. 
Please check the error output above.\n"); +++ if ( line->errcode ) +++ { +++ if ( !args->force ) +++ error("Encountered an error, cannot proceed. Please check the error output above.\n" +++ "If feeling adventurous, use the --force option. (At your own risk!)\n"); +++ else if ( !line_errcode_warned ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: Encountered an error, proceeding only because --force was given.\n" +++ " Note that this can result in a segfault or a silent corruption of the output file!\n"); +++ line_errcode_warned = 1; +++ line->errcode = 0; +++ } +++ } ++ if ( args->filter ) ++ { ++ int pass = filter_test(args->filter, line, NULL); ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) ++ { ++- if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); ++ continue; ++ } ++ } ++ annotate(args, line); ++- bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); ++ } ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++--- python-pysam.orig/bcftools/vcfbuf.c +++++ python-pysam/bcftools/vcfbuf.c ++@@ -1,6 +1,6 @@ ++ /* The MIT License ++ ++- Copyright (c) 2016 Genome Research Ltd. +++ Copyright (c) 2016-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -57,6 +57,12 @@ ++ ++ typedef struct ++ { +++ int active; +++} +++rmdup_t; +++ +++typedef struct +++{ ++ int active, rid, end; ++ } ++ overlap_t; ++@@ -70,6 +76,7 @@ ++ ld_t ld; ++ prune_t prune; ++ overlap_t overlap; +++ rmdup_t rmdup; ++ }; ++ ++ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) ++@@ -103,6 +110,7 @@ ++ if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } ++ if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } ++ if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } +++ if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } ++ } ++ ++ int vcfbuf_nsites(vcfbuf_t *buf) ++@@ -126,6 +134,21 @@ ++ return ret; ++ } ++ +++bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx) +++{ +++ int i = rbuf_kth(&buf->rbuf, idx); +++ return i<0 ? NULL : buf->vcf[i].rec; +++} +++ +++bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx) +++{ +++ int i = rbuf_kth(&buf->rbuf, idx); +++ if ( i<0 ) return NULL; +++ bcf1_t *rec = buf->vcf[i].rec; +++ rbuf_remove_kth(&buf->rbuf, vcfrec_t, idx, buf->vcf); +++ return rec; +++} +++ ++ static int cmpvrec(const void *_a, const void *_b) ++ { ++ vcfrec_t *a = *((vcfrec_t**) _a); ++@@ -198,6 +221,24 @@ ++ rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf); ++ } ++ +++static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all) +++{ +++ if ( flush_all ) return 1; +++ +++ if ( buf->rbuf.n==1 ) return 0; +++ +++ int k1 = rbuf_kth(&buf->rbuf, -1); +++ int k2 = rbuf_kth(&buf->rbuf, -2); +++ +++ vcfrec_t *rec1 = &buf->vcf[k1]; +++ vcfrec_t *rec2 = &buf->vcf[k2]; +++ +++ if ( rec1->rec->rid!=rec2->rec->rid ) return 1; +++ if ( rec1->rec->pos!=rec2->rec->pos ) return 1; +++ +++ return 0; +++} +++ ++ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) ++ { ++ if ( flush_all ) { buf->overlap.rid = -1; return 1; } ++@@ -252,13 +293,8 @@ ++ j = rbuf_last(&buf->rbuf); // last ++ ++ if ( buf->vcf[i].rec->rid != 
buf->vcf[j].rec->rid ) goto ret; ++- if ( buf->overlap.active ) ++- { ++- int ret = _overlap_can_flush(buf, flush_all); ++- //printf("can_flush: %d %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1); ++- if ( ret ) goto ret; ++- } ++- //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; +++ if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; +++ if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret; ++ ++ if ( buf->win > 0 ) ++ { ++--- python-pysam.orig/bcftools/vcfbuf.c.pysam.c +++++ python-pysam/bcftools/vcfbuf.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* The MIT License ++ ++- Copyright (c) 2016 Genome Research Ltd. +++ Copyright (c) 2016-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -59,6 +59,12 @@ ++ ++ typedef struct ++ { +++ int active; +++} +++rmdup_t; +++ +++typedef struct +++{ ++ int active, rid, end; ++ } ++ overlap_t; ++@@ -72,6 +78,7 @@ ++ ld_t ld; ++ prune_t prune; ++ overlap_t overlap; +++ rmdup_t rmdup; ++ }; ++ ++ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) ++@@ -105,6 +112,7 @@ ++ if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } ++ if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } ++ if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } +++ if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } ++ } ++ ++ int vcfbuf_nsites(vcfbuf_t *buf) ++@@ -128,6 +136,21 @@ ++ return ret; ++ } ++ +++bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx) +++{ +++ int i = rbuf_kth(&buf->rbuf, idx); +++ return i<0 ? 
NULL : buf->vcf[i].rec; +++} +++ +++bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx) +++{ +++ int i = rbuf_kth(&buf->rbuf, idx); +++ if ( i<0 ) return NULL; +++ bcf1_t *rec = buf->vcf[i].rec; +++ rbuf_remove_kth(&buf->rbuf, vcfrec_t, idx, buf->vcf); +++ return rec; +++} +++ ++ static int cmpvrec(const void *_a, const void *_b) ++ { ++ vcfrec_t *a = *((vcfrec_t**) _a); ++@@ -200,6 +223,24 @@ ++ rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf); ++ } ++ +++static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all) +++{ +++ if ( flush_all ) return 1; +++ +++ if ( buf->rbuf.n==1 ) return 0; +++ +++ int k1 = rbuf_kth(&buf->rbuf, -1); +++ int k2 = rbuf_kth(&buf->rbuf, -2); +++ +++ vcfrec_t *rec1 = &buf->vcf[k1]; +++ vcfrec_t *rec2 = &buf->vcf[k2]; +++ +++ if ( rec1->rec->rid!=rec2->rec->rid ) return 1; +++ if ( rec1->rec->pos!=rec2->rec->pos ) return 1; +++ +++ return 0; +++} +++ ++ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) ++ { ++ if ( flush_all ) { buf->overlap.rid = -1; return 1; } ++@@ -254,13 +295,8 @@ ++ j = rbuf_last(&buf->rbuf); // last ++ ++ if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret; ++- if ( buf->overlap.active ) ++- { ++- int ret = _overlap_can_flush(buf, flush_all); ++- //printf("can_flush: %d %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1); ++- if ( ret ) goto ret; ++- } ++- //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; +++ if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; +++ if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret; ++ ++ if ( buf->win > 0 ) ++ { ++--- python-pysam.orig/bcftools/vcfbuf.h +++++ python-pysam/bcftools/vcfbuf.h ++@@ -1,6 +1,6 @@ ++ /* The MIT License ++ ++- Copyright (c) 2017 Genome Research Ltd. +++ Copyright (c) 2017-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -44,6 +44,7 @@ ++ VCFBUF_NSITES, // leave at max this many sites in the window ++ VCFBUF_AF_TAG, // use this INFO tag with LD_NSITES ++ VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window +++ VCFBUF_RMDUP, // remove duplicate sites (completely) ++ } ++ vcfbuf_opt_t; ++ ++@@ -64,6 +65,18 @@ ++ */ ++ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap); ++ +++/* +++ * vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer +++ * @idx: 0-based index to buffered lines +++ */ +++bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx); +++ +++/* +++ * vcfbuf_remove() - return pointer to i-th record in the buffer and remove it from the buffer +++ * @idx: 0-based index to buffered lines +++ */ +++bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx); +++ ++ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all); ++ ++ /* ++--- python-pysam.orig/bcftools/vcfcall.c +++++ python-pysam/bcftools/vcfcall.c ++@@ -42,14 +42,11 @@ ++ #include "prob1.h" ++ #include "ploidy.h" ++ #include "gvcf.h" +++#include "regidx.h" +++#include "vcfbuf.h" ++ ++ void error(const char *format, ...); ++ ++-#ifdef _WIN32 ++-#define srand48(x) srand(x) ++-#define lrand48() rand() ++-#endif ++- ++ #define CF_NO_GENO 1 ++ #define CF_INS_MISSED (1<<1) ++ #define CF_CCALL (1<<2) ++@@ -68,6 +65,13 @@ ++ ++ typedef struct ++ { +++ tgt_als_t *als; +++ int nmatch_als, ibuf; +++} +++rec_tgt_t; +++ +++typedef struct +++{ ++ int flag; // combination of CF_* flags above ++ int output_type, n_threads, record_cmd_line; ++ htsFile *bcf_in, *out_fh; ++@@ -76,6 +80,9 @@ ++ int nsamples, *samples_map; // mapping from output sample names to original VCF ++ char *regions, *targets; // regions to process ++ int regions_is_file, targets_is_file; +++ regidx_t *tgt_idx; +++ regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; +++ vcfbuf_t *vcfbuf; ++ ++ char *samples_fname; ++ int samples_is_file; ++@@ -86,6 +93,7 @@ ++ ++ bcf1_t *missed_line; ++ call_t aux; 
// parameters and temporary data +++ kstring_t str; ++ ++ int argc; ++ char **argv; ++@@ -297,7 +305,7 @@ ++ if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } ++ if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } ++ ++- ss = se+1; +++ ss = se+(x != '\0'); ++ while ( *ss && isspace(*ss) ) ss++; ++ if ( !*ss ) ss = "2"; // default ploidy ++ se = ss; ++@@ -347,26 +355,253 @@ ++ bcf_float_set_missing(args->missed_line->qual); ++ } ++ ++-static void print_missed_line(bcf_sr_regions_t *regs, void *data) +++static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +++{ +++ char *ss = (char*) line; +++ while ( *ss && isspace(*ss) ) ss++; +++ if ( !*ss ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; } +++ if ( *ss=='#' ) return -1; // skip comments +++ +++ char *se = ss; +++ while ( *se && !isspace(*se) ) se++; +++ +++ *chr_beg = ss; +++ *chr_end = se-1; +++ +++ if ( !*se ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; } +++ +++ ss = se+1; +++ *beg = strtod(ss, &se); +++ if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; } +++ if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } +++ (*beg)--; +++ *end = *beg; +++ +++ if ( !usr ) return 0; // allele information not required +++ +++ ss = se+1; +++ tgt_als_t *als = (tgt_als_t*)payload; +++ als->used = 0; +++ als->n = 0; +++ als->allele = NULL; +++ while ( *ss ) +++ { +++ se = ss; +++ while ( *se && *se!=',' ) se++; +++ als->n++; +++ als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele)); +++ als->allele[als->n-1] = (char*)malloc(se-ss+1); +++ memcpy(als->allele[als->n-1],ss,se-ss); +++ als->allele[als->n-1][se-ss] = 0; +++ ss = se+1; +++ if ( !*se ) break; +++ } +++ return 0; +++} +++static void 
tgt_free(void *payload) +++{ +++ tgt_als_t *als = (tgt_als_t*)payload; +++ int i; +++ for (i=0; in; i++) free(als->allele[i]); +++ free(als->allele); +++} +++static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end) +++{ +++ if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return; +++ while ( regitr_overlap(args->tgt_itr_tmp) ) +++ { +++ if ( args->tgt_itr_tmp->beg < beg ) continue; +++ +++ tgt_als_t *tgt_als = ®itr_payload(args->tgt_itr_tmp,tgt_als_t); +++ if ( tgt_als->used ) continue; +++ +++ args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr); +++ args->missed_line->pos = args->tgt_itr_tmp->beg; +++ bcf_unpack(args->missed_line,BCF_UN_ALL); +++ bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n); +++ tgt_als->used = 1; +++ if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); +++ } +++} +++static void tgt_flush(args_t *args, bcf1_t *rec) +++{ +++ if ( rec ) +++ { +++ char *chr = (char*)bcf_seqname(args->aux.hdr,rec); +++ +++ if ( !args->tgt_itr_prev ) // first record +++ tgt_flush_region(args,chr,0,rec->pos-1); +++ +++ else if ( strcmp(chr,args->tgt_itr_prev->seq) ) // first record on a new chromosome +++ { +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX); +++ tgt_flush_region(args,chr,0,rec->pos-1); +++ } +++ else // another record on the same chromosome +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1); +++ } +++ else +++ { +++ // flush everything +++ if ( args->tgt_itr_prev ) +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX); +++ +++ int i, nchr = 0; +++ char **chr = regidx_seq_names(args->tgt_idx, &nchr); +++ for (i=0; i" is not present at indels sites and there are no other symbolic alleles than <*> +++ if ( als[1][0]=='<' ) return 0; +++ +++ int i; +++ for (i=0; 
iaux; ++- bcf1_t *missed = args->missed_line; +++ bcf1_t *rec = NULL; +++ if ( !args->vcfbuf ) +++ { +++ while ( bcf_sr_next_line(args->aux.srs) ) +++ { +++ rec = args->aux.srs->readers[0].buffer[0]; +++ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); +++ if ( args->tgt_idx ) +++ { +++ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; +++ +++ // For backward compatibility: require the exact position, not an interval overlap +++ int pos_match = 0; +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ if ( args->tgt_itr->beg != rec->pos ) continue; +++ pos_match = 1; +++ break; +++ } +++ if ( !pos_match ) continue; +++ } +++ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); +++ bcf_unpack(rec, BCF_UN_STR); +++ return rec; +++ } +++ return NULL; +++ } +++ +++ // If we are here,-C alleles was given and vcfbuf and tgt_idx are set +++ +++ // Fill the buffer with duplicate lines +++ int vcfbuf_full = 1; +++ int nbuf = vcfbuf_nsites(args->vcfbuf); +++ bcf1_t *rec0 = NULL, *recN = NULL; +++ if ( nbuf==0 ) vcfbuf_full = 0; +++ else if ( nbuf==1 ) +++ { +++ vcfbuf_full = 0; +++ rec0 = vcfbuf_peek(args->vcfbuf, 0); +++ } +++ else +++ { +++ rec0 = vcfbuf_peek(args->vcfbuf, 0); +++ recN = vcfbuf_peek(args->vcfbuf, nbuf-1); +++ if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0; +++ } +++ if ( !vcfbuf_full ) +++ { +++ while ( bcf_sr_next_line(args->aux.srs) ) +++ { +++ rec = args->aux.srs->readers[0].buffer[0]; +++ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); +++ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; +++ // as above: require the exact position, not an interval overlap +++ int exact_match = 0; +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ if ( args->tgt_itr->beg != rec->pos ) 
continue; +++ exact_match = 1; +++ break; +++ } +++ if ( !exact_match ) continue; +++ +++ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); +++ bcf_unpack(rec, BCF_UN_STR); +++ if ( !rec0 ) rec0 = rec; +++ recN = rec; +++ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); +++ if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; +++ } +++ } ++ ++- char *ss = regs->line.s; ++- int i = 0; ++- while ( iaux.srs->targets_als-1 && *ss ) +++ nbuf = vcfbuf_nsites(args->vcfbuf); +++ int n, i,j; +++ for (n=nbuf; n>1; n--) ++ { ++- if ( *ss=='\t' ) i++; ++- ss++; +++ recN = vcfbuf_peek(args->vcfbuf, n-1); +++ if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break; ++ } ++- if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als); +++ if ( n==0 ) +++ { +++ assert( !nbuf ); +++ return NULL; +++ } +++ +++ // Find the VCF and tab record with the best matching combination of alleles, prioritize +++ // records of the same type (snp vs indel) +++ rec_tgt_t rec_tgt; +++ memset(&rec_tgt,0,sizeof(rec_tgt)); +++ regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); +++ regitr_t *tmp_itr = regitr_init(args->tgt_idx); +++ regitr_copy(tmp_itr, args->tgt_itr); +++ for (i=0; ivcfbuf, i); +++ int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1; +++ while ( regitr_overlap(tmp_itr) ) +++ { +++ if ( tmp_itr->beg != rec->pos ) continue; +++ tgt_als_t *als = ®itr_payload(tmp_itr,tgt_als_t); +++ if ( als->used ) continue; +++ int nmatch_als = 0; +++ vcmp_t *vcmp = vcmp_init(); +++ int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]); +++ if ( ret==0 ) +++ { +++ nmatch_als++; +++ if ( rec->n_allele > 1 && als->n > 1 ) +++ { +++ for (j=1; jn; j++) +++ { +++ if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++; +++ } +++ } +++ } +++ int als_indel = is_indel(als->n, als->allele) ? 
1 : -1; +++ nmatch_als *= rec_indel*als_indel; +++ if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als ) +++ { +++ rec_tgt.nmatch_als = nmatch_als; +++ rec_tgt.als = als; +++ rec_tgt.ibuf = i; +++ } +++ vcmp_destroy(vcmp); +++ } +++ } +++ regitr_destroy(tmp_itr); ++ ++- missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]); ++- missed->pos = regs->start; ++- bcf_update_alleles_str(call->hdr, missed,ss); +++ args->aux.tgt_als = rec_tgt.als; +++ if ( rec_tgt.als ) rec_tgt.als->used = 1; ++ ++- bcf_write1(args->out_fh, call->hdr, missed); +++ rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf); +++ return rec; ++ } ++ ++ static void init_data(args_t *args) ++@@ -376,22 +611,19 @@ ++ // Open files for input and output, initialize structures ++ if ( args->targets ) ++ { ++- if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) ++- error("Failed to read the targets: %s\n", args->targets); ++- ++- if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) ++- { ++- args->aux.srs->targets->missed_reg_handler = print_missed_line; ++- args->aux.srs->targets->missed_reg_data = args; ++- } +++ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? 
args : NULL); +++ args->tgt_itr = regitr_init(args->tgt_idx); +++ args->tgt_itr_tmp = regitr_init(args->tgt_idx); ++ } +++ ++ if ( args->regions ) ++ { ++ if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) ++ error("Failed to read the regions: %s\n", args->regions); ++ } ++ ++- if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); +++ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); ++ args->aux.hdr = bcf_sr_get_header(args->aux.srs,0); ++ ++ int i; ++@@ -451,8 +683,11 @@ ++ } ++ } ++ +++ if ( args->aux.flag & CALL_CONSTR_ALLELES ) +++ args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); +++ ++ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++ ++ if ( args->flag & CF_QCALL ) ++@@ -468,13 +703,21 @@ ++ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); ++ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); ++- bcf_hdr_write(args->out_fh, args->aux.hdr); +++ if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); ++ ++ if ( args->flag&CF_INS_MISSED ) init_missed_line(args); ++ } ++ ++ static void destroy_data(args_t *args) ++ { +++ if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf); +++ if ( args->tgt_idx ) +++ { +++ regidx_destroy(args->tgt_idx); +++ regitr_destroy(args->tgt_itr); +++ regitr_destroy(args->tgt_itr_tmp); +++ if ( 
args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev); +++ } ++ if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux); ++ else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); ++ else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); ++@@ -496,9 +739,10 @@ ++ free(args->samples_map); ++ free(args->sample2sex); ++ free(args->aux.ploidy); +++ free(args->str.s); ++ if ( args->gvcf ) gvcf_destroy(args->gvcf); ++ bcf_hdr_destroy(args->aux.hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ bcf_sr_destroy(args->aux.srs); ++ } ++ ++@@ -604,7 +848,7 @@ ++ static void usage(args_t *args) ++ { ++ fprintf(stderr, "\n"); ++- fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); +++ fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n"); ++ fprintf(stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); ++ fprintf(stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); ++ fprintf(stderr, " but will be added back on popular demand. 
The original calling model can be\n"); ++@@ -623,12 +867,13 @@ ++ fprintf(stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Input/output options:\n"); ++ fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); ++ fprintf(stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); ++ fprintf(stderr, " -F, --prior-freqs use prior allele frequencies\n"); +++ fprintf(stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); ++ fprintf(stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); ++ fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); ++ fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); ++@@ -642,6 +887,10 @@ ++ fprintf(stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); ++ fprintf(stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); +++ fprintf(stderr, "\n"); +++ fprintf(stderr, "Example:\n"); +++ fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"); +++ fprintf(stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"); ++ ++ // todo (and more) ++ // fprintf(stderr, "\nContrast calling and association test options:\n"); ++@@ -680,6 +929,7 @@ ++ {"format-fields",required_argument,NULL,'f'}, ++ {"prior-freqs",required_argument,NULL,'F'}, ++ {"gvcf",required_argument,NULL,'g'}, +++ {"group-samples",required_argument,NULL,'G'}, ++ {"output",required_argument,NULL,'o'}, ++ {"output-type",required_argument,NULL,'O'}, ++ {"regions",required_argument,NULL,'r'}, ++@@ -710,7 +960,7 @@ ++ }; ++ ++ char *tmp = NULL; ++- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0) ++ { ++ switch (c) ++ { ++@@ -718,6 +968,7 @@ ++ case 1 : ploidy = optarg; break; ++ case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; ++ case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; +++ case 'G': args.aux.sample_groups = optarg; break; ++ case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; ++ case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N ++ case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) ++@@ -805,13 +1056,14 @@ ++ } ++ if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C 
alleles\n"); ++ if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n"); +++ if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n"); ++ init_data(&args); ++ ++- while ( bcf_sr_next_line(args.aux.srs) ) +++ bcf1_t *bcf_rec; +++ while ( (bcf_rec = next_line(&args)) ) ++ { ++- bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; ++- if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); ++- bcf_unpack(bcf_rec, BCF_UN_STR); +++ // Skip duplicate positions with all matching `-C alleles -T` used up +++ if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue; ++ ++ // Skip unwanted sites ++ int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1; ++@@ -845,6 +1097,13 @@ ++ continue; ++ } ++ +++ if ( args.flag & CF_INS_MISSED ) +++ { +++ tgt_flush(&args,bcf_rec); +++ if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx); +++ regitr_copy(args.tgt_itr_prev, args.tgt_itr); +++ } +++ ++ // Calling modes which output VCFs ++ int ret; ++ if ( args.flag & CF_MCALL ) ++@@ -858,11 +1117,10 @@ ++ if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant ++ if ( args.gvcf ) ++ bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0); ++- if ( bcf_rec ) ++- bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); +++ if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname); ++ } ++ if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0); ++- if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); +++ if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL); ++ destroy_data(&args); ++ return 0; ++ } ++--- python-pysam.orig/bcftools/vcfcall.c.pysam.c +++++ python-pysam/bcftools/vcfcall.c.pysam.c ++@@ -44,14 +44,11 @@ ++ #include 
"prob1.h" ++ #include "ploidy.h" ++ #include "gvcf.h" +++#include "regidx.h" +++#include "vcfbuf.h" ++ ++ void error(const char *format, ...); ++ ++-#ifdef _WIN32 ++-#define srand48(x) srand(x) ++-#define lrand48() rand() ++-#endif ++- ++ #define CF_NO_GENO 1 ++ #define CF_INS_MISSED (1<<1) ++ #define CF_CCALL (1<<2) ++@@ -70,6 +67,13 @@ ++ ++ typedef struct ++ { +++ tgt_als_t *als; +++ int nmatch_als, ibuf; +++} +++rec_tgt_t; +++ +++typedef struct +++{ ++ int flag; // combination of CF_* flags above ++ int output_type, n_threads, record_cmd_line; ++ htsFile *bcf_in, *out_fh; ++@@ -78,6 +82,9 @@ ++ int nsamples, *samples_map; // mapping from output sample names to original VCF ++ char *regions, *targets; // regions to process ++ int regions_is_file, targets_is_file; +++ regidx_t *tgt_idx; +++ regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; +++ vcfbuf_t *vcfbuf; ++ ++ char *samples_fname; ++ int samples_is_file; ++@@ -88,6 +95,7 @@ ++ ++ bcf1_t *missed_line; ++ call_t aux; // parameters and temporary data +++ kstring_t str; ++ ++ int argc; ++ char **argv; ++@@ -299,7 +307,7 @@ ++ if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } ++ if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } ++ ++- ss = se+1; +++ ss = se+(x != '\0'); ++ while ( *ss && isspace(*ss) ) ss++; ++ if ( !*ss ) ss = "2"; // default ploidy ++ se = ss; ++@@ -349,26 +357,253 @@ ++ bcf_float_set_missing(args->missed_line->qual); ++ } ++ ++-static void print_missed_line(bcf_sr_regions_t *regs, void *data) +++static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) +++{ +++ char *ss = (char*) line; +++ while ( *ss && isspace(*ss) ) ss++; +++ if ( !*ss ) { fprintf(bcftools_stderr,"Could not parse the line: %s\n", line); return -2; } +++ if ( *ss=='#' ) return -1; // skip comments +++ +++ char *se = ss; +++ while ( 
*se && !isspace(*se) ) se++; +++ +++ *chr_beg = ss; +++ *chr_end = se-1; +++ +++ if ( !*se ) { fprintf(bcftools_stderr,"Could not parse the line: %s\n", line); return -2; } +++ +++ ss = se+1; +++ *beg = strtod(ss, &se); +++ if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse tab line: %s\n", line); return -2; } +++ if ( *beg==0 ) { fprintf(bcftools_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } +++ (*beg)--; +++ *end = *beg; +++ +++ if ( !usr ) return 0; // allele information not required +++ +++ ss = se+1; +++ tgt_als_t *als = (tgt_als_t*)payload; +++ als->used = 0; +++ als->n = 0; +++ als->allele = NULL; +++ while ( *ss ) +++ { +++ se = ss; +++ while ( *se && *se!=',' ) se++; +++ als->n++; +++ als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele)); +++ als->allele[als->n-1] = (char*)malloc(se-ss+1); +++ memcpy(als->allele[als->n-1],ss,se-ss); +++ als->allele[als->n-1][se-ss] = 0; +++ ss = se+1; +++ if ( !*se ) break; +++ } +++ return 0; +++} +++static void tgt_free(void *payload) +++{ +++ tgt_als_t *als = (tgt_als_t*)payload; +++ int i; +++ for (i=0; in; i++) free(als->allele[i]); +++ free(als->allele); +++} +++static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end) +++{ +++ if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return; +++ while ( regitr_overlap(args->tgt_itr_tmp) ) +++ { +++ if ( args->tgt_itr_tmp->beg < beg ) continue; +++ +++ tgt_als_t *tgt_als = ®itr_payload(args->tgt_itr_tmp,tgt_als_t); +++ if ( tgt_als->used ) continue; +++ +++ args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr); +++ args->missed_line->pos = args->tgt_itr_tmp->beg; +++ bcf_unpack(args->missed_line,BCF_UN_ALL); +++ bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n); +++ tgt_als->used = 1; +++ if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", 
__func__,args->output_fname); +++ } +++} +++static void tgt_flush(args_t *args, bcf1_t *rec) +++{ +++ if ( rec ) +++ { +++ char *chr = (char*)bcf_seqname(args->aux.hdr,rec); +++ +++ if ( !args->tgt_itr_prev ) // first record +++ tgt_flush_region(args,chr,0,rec->pos-1); +++ +++ else if ( strcmp(chr,args->tgt_itr_prev->seq) ) // first record on a new chromosome +++ { +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX); +++ tgt_flush_region(args,chr,0,rec->pos-1); +++ } +++ else // another record on the same chromosome +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1); +++ } +++ else +++ { +++ // flush everything +++ if ( args->tgt_itr_prev ) +++ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX); +++ +++ int i, nchr = 0; +++ char **chr = regidx_seq_names(args->tgt_idx, &nchr); +++ for (i=0; i" is not present at indels sites and there are no other symbolic alleles than <*> +++ if ( als[1][0]=='<' ) return 0; +++ +++ int i; +++ for (i=0; iaux; ++- bcf1_t *missed = args->missed_line; +++ bcf1_t *rec = NULL; +++ if ( !args->vcfbuf ) +++ { +++ while ( bcf_sr_next_line(args->aux.srs) ) +++ { +++ rec = args->aux.srs->readers[0].buffer[0]; +++ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); +++ if ( args->tgt_idx ) +++ { +++ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; +++ +++ // For backward compatibility: require the exact position, not an interval overlap +++ int pos_match = 0; +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ if ( args->tgt_itr->beg != rec->pos ) continue; +++ pos_match = 1; +++ break; +++ } +++ if ( !pos_match ) continue; +++ } +++ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); +++ bcf_unpack(rec, BCF_UN_STR); +++ return rec; +++ } +++ return NULL; +++ } +++ +++ // If we are here,-C alleles 
was given and vcfbuf and tgt_idx are set +++ +++ // Fill the buffer with duplicate lines +++ int vcfbuf_full = 1; +++ int nbuf = vcfbuf_nsites(args->vcfbuf); +++ bcf1_t *rec0 = NULL, *recN = NULL; +++ if ( nbuf==0 ) vcfbuf_full = 0; +++ else if ( nbuf==1 ) +++ { +++ vcfbuf_full = 0; +++ rec0 = vcfbuf_peek(args->vcfbuf, 0); +++ } +++ else +++ { +++ rec0 = vcfbuf_peek(args->vcfbuf, 0); +++ recN = vcfbuf_peek(args->vcfbuf, nbuf-1); +++ if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0; +++ } +++ if ( !vcfbuf_full ) +++ { +++ while ( bcf_sr_next_line(args->aux.srs) ) +++ { +++ rec = args->aux.srs->readers[0].buffer[0]; +++ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); +++ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; +++ // as above: require the exact position, not an interval overlap +++ int exact_match = 0; +++ while ( regitr_overlap(args->tgt_itr) ) +++ { +++ if ( args->tgt_itr->beg != rec->pos ) continue; +++ exact_match = 1; +++ break; +++ } +++ if ( !exact_match ) continue; +++ +++ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); +++ bcf_unpack(rec, BCF_UN_STR); +++ if ( !rec0 ) rec0 = rec; +++ recN = rec; +++ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); +++ if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; +++ } +++ } ++ ++- char *ss = regs->line.s; ++- int i = 0; ++- while ( iaux.srs->targets_als-1 && *ss ) +++ nbuf = vcfbuf_nsites(args->vcfbuf); +++ int n, i,j; +++ for (n=nbuf; n>1; n--) ++ { ++- if ( *ss=='\t' ) i++; ++- ss++; +++ recN = vcfbuf_peek(args->vcfbuf, n-1); +++ if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break; ++ } ++- if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als); +++ if ( n==0 ) +++ { +++ assert( !nbuf ); +++ return NULL; +++ } +++ +++ // Find the VCF and tab record with the best 
matching combination of alleles, prioritize +++ // records of the same type (snp vs indel) +++ rec_tgt_t rec_tgt; +++ memset(&rec_tgt,0,sizeof(rec_tgt)); +++ regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); +++ regitr_t *tmp_itr = regitr_init(args->tgt_idx); +++ regitr_copy(tmp_itr, args->tgt_itr); +++ for (i=0; ivcfbuf, i); +++ int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1; +++ while ( regitr_overlap(tmp_itr) ) +++ { +++ if ( tmp_itr->beg != rec->pos ) continue; +++ tgt_als_t *als = ®itr_payload(tmp_itr,tgt_als_t); +++ if ( als->used ) continue; +++ int nmatch_als = 0; +++ vcmp_t *vcmp = vcmp_init(); +++ int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]); +++ if ( ret==0 ) +++ { +++ nmatch_als++; +++ if ( rec->n_allele > 1 && als->n > 1 ) +++ { +++ for (j=1; jn; j++) +++ { +++ if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++; +++ } +++ } +++ } +++ int als_indel = is_indel(als->n, als->allele) ? 1 : -1; +++ nmatch_als *= rec_indel*als_indel; +++ if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als ) +++ { +++ rec_tgt.nmatch_als = nmatch_als; +++ rec_tgt.als = als; +++ rec_tgt.ibuf = i; +++ } +++ vcmp_destroy(vcmp); +++ } +++ } +++ regitr_destroy(tmp_itr); ++ ++- missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]); ++- missed->pos = regs->start; ++- bcf_update_alleles_str(call->hdr, missed,ss); +++ args->aux.tgt_als = rec_tgt.als; +++ if ( rec_tgt.als ) rec_tgt.als->used = 1; ++ ++- bcf_write1(args->out_fh, call->hdr, missed); +++ rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf); +++ return rec; ++ } ++ ++ static void init_data(args_t *args) ++@@ -378,22 +613,19 @@ ++ // Open files for input and output, initialize structures ++ if ( args->targets ) ++ { ++- if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 
3 : 0)<0 ) ++- error("Failed to read the targets: %s\n", args->targets); ++- ++- if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) ++- { ++- args->aux.srs->targets->missed_reg_handler = print_missed_line; ++- args->aux.srs->targets->missed_reg_data = args; ++- } +++ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL); +++ args->tgt_itr = regitr_init(args->tgt_idx); +++ args->tgt_itr_tmp = regitr_init(args->tgt_idx); ++ } +++ ++ if ( args->regions ) ++ { ++ if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) ++ error("Failed to read the regions: %s\n", args->regions); ++ } ++ ++- if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); +++ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); ++ args->aux.hdr = bcf_sr_get_header(args->aux.srs,0); ++ ++ int i; ++@@ -453,8 +685,11 @@ ++ } ++ } ++ +++ if ( args->aux.flag & CALL_CONSTR_ALLELES ) +++ args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); +++ ++ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); +++ if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++ ++ if ( args->flag & CF_QCALL ) ++@@ -470,13 +705,21 @@ ++ bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); ++ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); ++- bcf_hdr_write(args->out_fh, args->aux.hdr); +++ if ( 
bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); ++ ++ if ( args->flag&CF_INS_MISSED ) init_missed_line(args); ++ } ++ ++ static void destroy_data(args_t *args) ++ { +++ if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf); +++ if ( args->tgt_idx ) +++ { +++ regidx_destroy(args->tgt_idx); +++ regitr_destroy(args->tgt_itr); +++ regitr_destroy(args->tgt_itr_tmp); +++ if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev); +++ } ++ if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux); ++ else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); ++ else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); ++@@ -498,9 +741,10 @@ ++ free(args->samples_map); ++ free(args->sample2sex); ++ free(args->aux.ploidy); +++ free(args->str.s); ++ if ( args->gvcf ) gvcf_destroy(args->gvcf); ++ bcf_hdr_destroy(args->aux.hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ bcf_sr_destroy(args->aux.srs); ++ } ++ ++@@ -606,7 +850,7 @@ ++ static void usage(args_t *args) ++ { ++ fprintf(bcftools_stderr, "\n"); ++- fprintf(bcftools_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); +++ fprintf(bcftools_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n"); ++ fprintf(bcftools_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); ++ fprintf(bcftools_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); ++ fprintf(bcftools_stderr, " but will be added back on popular demand. 
The original calling model can be\n"); ++@@ -625,12 +869,13 @@ ++ fprintf(bcftools_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Input/output options:\n"); ++ fprintf(bcftools_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); ++ fprintf(bcftools_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); ++ fprintf(bcftools_stderr, " -F, --prior-freqs use prior allele frequencies\n"); +++ fprintf(bcftools_stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); ++ fprintf(bcftools_stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); ++ fprintf(bcftools_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); ++ fprintf(bcftools_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); ++@@ -644,6 +889,10 @@ ++ fprintf(bcftools_stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); ++ fprintf(bcftools_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); +++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, "Example:\n"); +++ fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"); +++ fprintf(bcftools_stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"); ++ ++ // todo (and more) ++ // fprintf(bcftools_stderr, "\nContrast calling and association test options:\n"); ++@@ -682,6 +931,7 @@ ++ {"format-fields",required_argument,NULL,'f'}, ++ {"prior-freqs",required_argument,NULL,'F'}, ++ {"gvcf",required_argument,NULL,'g'}, +++ {"group-samples",required_argument,NULL,'G'}, ++ {"output",required_argument,NULL,'o'}, ++ {"output-type",required_argument,NULL,'O'}, ++ {"regions",required_argument,NULL,'r'}, ++@@ -712,7 +962,7 @@ ++ }; ++ ++ char *tmp = NULL; ++- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0) ++ { ++ switch (c) ++ { ++@@ -720,6 +970,7 @@ ++ case 1 : ploidy = optarg; break; ++ case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; ++ case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; +++ case 'G': args.aux.sample_groups = optarg; break; ++ case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; ++ case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N ++ case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) ++@@ -807,13 +1058,14 @@ ++ } ++ if ( args.flag & CF_INS_MISSED && 
!(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); ++ if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n"); +++ if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n"); ++ init_data(&args); ++ ++- while ( bcf_sr_next_line(args.aux.srs) ) +++ bcf1_t *bcf_rec; +++ while ( (bcf_rec = next_line(&args)) ) ++ { ++- bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; ++- if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); ++- bcf_unpack(bcf_rec, BCF_UN_STR); +++ // Skip duplicate positions with all matching `-C alleles -T` used up +++ if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue; ++ ++ // Skip unwanted sites ++ int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1; ++@@ -847,6 +1099,13 @@ ++ continue; ++ } ++ +++ if ( args.flag & CF_INS_MISSED ) +++ { +++ tgt_flush(&args,bcf_rec); +++ if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx); +++ regitr_copy(args.tgt_itr_prev, args.tgt_itr); +++ } +++ ++ // Calling modes which output VCFs ++ int ret; ++ if ( args.flag & CF_MCALL ) ++@@ -860,11 +1119,10 @@ ++ if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant ++ if ( args.gvcf ) ++ bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0); ++- if ( bcf_rec ) ++- bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); +++ if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname); ++ } ++ if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0); ++- if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); +++ if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL); ++ destroy_data(&args); ++ return 0; ++ } ++--- python-pysam.orig/bcftools/vcfcnv.c +++++ 
python-pysam/bcftools/vcfcnv.c ++@@ -34,6 +34,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -226,9 +227,9 @@ ++ } ++ static void close_sample_files(sample_t *smpl) ++ { ++- fclose(smpl->dat_fh); ++- fclose(smpl->cn_fh); ++- fclose(smpl->summary_fh); +++ if ( fclose(smpl->dat_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->dat_fname); +++ if ( fclose(smpl->cn_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->cn_fname); +++ if ( fclose(smpl->summary_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->summary_fname); ++ } ++ ++ static double norm_cdf(double mean, double dev); ++@@ -1190,10 +1191,10 @@ ++ args->control_sample.lrr[args->nsites-1] = lrr2; ++ args->control_sample.baf[args->nsites-1] = baf2; ++ if ( baf2>=0 ) // skip missing values ++- fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2); +++ fprintf(args->control_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf2,lrr2); ++ } ++ if ( baf1>=0 ) // skip missing values ++- fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1); +++ fprintf(args->query_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf1,lrr1); ++ ++ if ( baf1>=0 ) ++ { ++@@ -1277,13 +1278,13 @@ ++ {"LRR-weight",1,0,'l'}, ++ {"same-prob",1,0,'P'}, ++ {"xy-prob",1,0,'x'}, ++- {"sample",1,0,'s'}, ++- {"control",1,0,'c'}, +++ {"query-sample",1,0,'s'}, +++ {"control-sample",1,0,'c'}, ++ {"targets",1,0,'t'}, ++ {"targets-file",1,0,'T'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++- {"plot",1,0,'p'}, +++ {"plot-threshold",1,0,'p'}, ++ {"output-dir",1,0,'o'}, ++ {0,0,0,0} ++ }; ++@@ -1399,7 +1400,8 @@ ++ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) ++ error("Failed to 
read the targets: %s\n", args->af_fname); ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++--- python-pysam.orig/bcftools/vcfcnv.c.pysam.c +++++ python-pysam/bcftools/vcfcnv.c.pysam.c ++@@ -36,6 +36,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -228,9 +229,9 @@ ++ } ++ static void close_sample_files(sample_t *smpl) ++ { ++- fclose(smpl->dat_fh); ++- fclose(smpl->cn_fh); ++- fclose(smpl->summary_fh); +++ if ( fclose(smpl->dat_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->dat_fname); +++ if ( fclose(smpl->cn_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->cn_fname); +++ if ( fclose(smpl->summary_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,smpl->summary_fname); ++ } ++ ++ static double norm_cdf(double mean, double dev); ++@@ -1192,10 +1193,10 @@ ++ args->control_sample.lrr[args->nsites-1] = lrr2; ++ args->control_sample.baf[args->nsites-1] = baf2; ++ if ( baf2>=0 ) // skip missing values ++- fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2); +++ fprintf(args->control_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf2,lrr2); ++ } ++ if ( baf1>=0 ) // skip missing values ++- fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1); +++ fprintf(args->query_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf1,lrr1); ++ ++ if ( baf1>=0 ) ++ { ++@@ -1279,13 +1280,13 @@ ++ {"LRR-weight",1,0,'l'}, ++ {"same-prob",1,0,'P'}, ++ {"xy-prob",1,0,'x'}, ++- {"sample",1,0,'s'}, ++- {"control",1,0,'c'}, +++ {"query-sample",1,0,'s'}, +++ {"control-sample",1,0,'c'}, ++ {"targets",1,0,'t'}, ++ {"targets-file",1,0,'T'}, ++ {"regions",1,0,'r'}, ++ {"regions-file",1,0,'R'}, ++- {"plot",1,0,'p'}, +++ {"plot-threshold",1,0,'p'}, ++ {"output-dir",1,0,'o'}, ++ {0,0,0,0} ++ }; ++@@ -1401,7 +1402,8 @@ ++ if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) ++ error("Failed to read the targets: %s\n", args->af_fname); ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++--- python-pysam.orig/bcftools/vcfconcat.c +++++ python-pysam/bcftools/vcfconcat.c ++@@ -1,6 +1,6 @@ ++ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. 
++ ++- Copyright (C) 2013-2015 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -34,6 +34,8 @@ ++ #include ++ #include ++ #include // for hts_get_bgzfp() +++#include +++#include ++ #include "bcftools.h" ++ ++ typedef struct _args_t ++@@ -53,7 +55,9 @@ ++ ++ char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; ++ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; ++- int compact_PS, phase_set_changed, naive_concat; +++ int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; +++ int verbose; +++ htsThreadPool *tpool; ++ } ++ args_t; ++ ++@@ -70,6 +74,7 @@ ++ line = bcf_init(); ++ } ++ +++ if ( args->verbose ) fprintf(stderr,"Checking the headers and starting positions of %d files\n", args->nfnames); ++ kstring_t str = {0,0,0}; ++ int i, prev_chrid = -1; ++ for (i=0; infnames; i++) ++@@ -97,7 +102,7 @@ ++ } ++ } ++ bcf_hdr_destroy(hdr); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); ++ } ++ free(str.s); ++ if ( line ) bcf_destroy(line); ++@@ -112,14 +117,30 @@ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++- ++- bcf_hdr_write(args->out_fh, args->out_hdr); ++- ++- if ( args->allow_overlaps ) +++ if ( args->allow_overlaps || args->phased_concat ) ++ { ++ args->files = bcf_sr_init(); ++ args->files->require_index = 1; +++ } +++ if ( args->n_threads ) +++ { +++ if ( args->files ) +++ { +++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); +++ args->tpool = args->files->p; +++ } +++ else +++ { +++ args->tpool = (htsThreadPool*) calloc(1, sizeof(htsThreadPool)); +++ if ( !args->tpool ) error("Failed to allocate memory\n"); +++ if ( !(args->tpool->pool = hts_tpool_init(args->n_threads)) ) error("Failed to initialize %d threads\n",args->n_threads); +++ } +++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); +++ } +++ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); +++ +++ if ( args->allow_overlaps ) +++ { ++ if ( args->regions_list ) ++ { ++ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) ++@@ -167,8 +188,6 @@ ++ args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); ++ args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); ++ args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); ++- args->files = bcf_sr_init(); ++- args->files->require_index = 1; ++ args->ifname = 0; ++ } ++ } ++@@ -176,13 +195,16 @@ ++ static void destroy_data(args_t 
*args) ++ { ++ int i; ++- for (i=0; infnames; i++) free(args->fnames[i]); ++- free(args->fnames); ++- if ( args->files ) bcf_sr_destroy(args->files); ++ if ( args->out_fh ) ++ { ++ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); ++ } +++ if ( args->tpool && !args->files ) +++ { +++ hts_tpool_destroy(args->tpool->pool); +++ free(args->tpool); +++ } +++ if ( args->files ) bcf_sr_destroy(args->files); ++ if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); ++ free(args->seen_seq); ++ free(args->start_pos); ++@@ -195,6 +217,8 @@ ++ free(args->nmism); ++ free(args->phase_qual); ++ free(args->phase_set); +++ for (i=0; infnames; i++) free(args->fnames[i]); +++ free(args->fnames); ++ } ++ ++ int vcf_write_line(htsFile *fp, kstring_t *line); ++@@ -235,7 +259,7 @@ ++ { ++ if ( !gt_absent_warned ) ++ { ++- fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); +++ fprintf(stderr,"GT is not present at %s:%"PRId64". (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1); ++ gt_absent_warned = 1; ++ } ++ continue; ++@@ -246,7 +270,7 @@ ++ { ++ if ( !gt_absent_warned ) ++ { ++- fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); +++ fprintf(stderr,"GT is not present at %s:%"PRId64". 
(This warning is printed only once.)\n", bcf_seqname(bhdr,brec), (int64_t) brec->pos+1); ++ gt_absent_warned = 1; ++ } ++ continue; ++@@ -282,9 +306,9 @@ ++ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, arec); +++ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1); +++ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); ++ args->prev_pos_check = arec->pos; ++ } ++ args->nswap = 0; ++@@ -332,9 +356,9 @@ ++ bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, brec); +++ if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1); +++ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); ++ args->prev_pos_check = brec->pos; ++ } ++ args->nbuf = 0; ++@@ -343,9 +367,9 @@ ++ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) ++ { ++ if ( arec && arec->errcode ) ++- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname); +++ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", 
bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); ++ if ( brec && brec->errcode ) ++- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname); +++ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); ++ ++ int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); ++ int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); ++@@ -373,10 +397,10 @@ ++ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, arec); +++ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++ if ( arec->pos < args->prev_pos_check ) ++- error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); +++ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); ++ args->prev_pos_check = arec->pos; ++ return; ++ } ++@@ -393,6 +417,7 @@ ++ ++ static void concat(args_t *args) ++ { +++ static int site_drop_warned = 0; ++ int i; ++ if ( args->phased_concat ) // phased concat ++ { ++@@ -429,8 +454,20 @@ ++ if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader ++ { ++ // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped ++- if ( ! bcf_sr_region_done(args->files,0) ) continue; ++- +++ if ( ! bcf_sr_region_done(args->files,0) ) +++ { +++ if ( !site_drop_warned ) +++ { +++ fprintf(stderr, +++ "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" +++ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" +++ " This warning is printed only once.\n", +++ bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 +++ ); +++ site_drop_warned = 1; +++ } +++ continue; +++ } ++ phased_flush(args); ++ bcf_sr_remove_reader(args->files, 0); ++ } ++@@ -483,20 +520,27 @@ ++ bcf1_t *line = bcf_sr_get_line(args->files,i); ++ if ( !line ) continue; ++ bcf_translate(args->out_hdr, args->files->readers[i].header, line); ++- bcf_write1(args->out_fh, args->out_hdr, line); +++ if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->remove_dups ) break; ++ } ++ } ++ } ++ else // concatenating ++ { +++ struct timeval t0, t1; ++ kstring_t tmp = {0,0,0}; ++ int prev_chr_id = -1, prev_pos; ++ bcf1_t *line = bcf_init(); ++ for (i=0; infnames; i++) ++ { ++- htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); ++- bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); +++ if ( args->verbose ) +++ { +++ fprintf(stderr,"Concatenating %s", args->fnames[i]); +++ gettimeofday(&t0, NULL); +++ } +++ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); +++ if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); ++ if ( !fp->is_bin && args->output_type&FT_VCF ) ++ { ++ line->max_unpack = BCF_UN_STR; ++@@ -508,7 +552,7 @@ ++ tmp.l = 0; ++ kputsn(fp->line.s,str-fp->line.s,&tmp); ++ int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); ++- if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: 
index the file.)\n", tmp.s, args->fnames[i]); +++ if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); ++ if ( prev_chr_id!=chr_id ) ++ { ++ prev_pos = -1; ++@@ -519,11 +563,11 @@ ++ int pos = strtol(str+1,&end,10) - 1; ++ if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); ++ if ( prev_pos > pos ) ++- error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); +++ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", tmp.s); ++ args->seen_seq[chr_id] = 1; ++ prev_chr_id = chr_id; ++ ++- if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); +++ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("\nFailed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); ++ } ++ } ++ else ++@@ -541,15 +585,21 @@ ++ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); ++ } ++ if ( prev_pos > line->pos ) ++- error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); +++ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); ++ args->seen_seq[line->rid] = 1; ++ prev_chr_id = line->rid; ++ ++- if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); +++ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("\nFailed to write\n"); ++ } ++ } ++ bcf_hdr_destroy(hdr); ++ hts_close(fp); +++ if ( args->verbose ) +++ { +++ gettimeofday(&t1, NULL); +++ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); +++ fprintf(stderr,"\t%f seconds\n",delta/1e6); +++ } ++ } ++ bcf_destroy(line); ++ free(tmp.s); ++@@ -612,63 +662,141 @@ ++ && header[12] == 'B' && header[13] == 'C' ++ && unpackInt16((uint8_t*)&header[14]) == 2) ? 
0 : -1; ++ } +++static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fname0, char *fname) +++{ +++ int j; +++ for (j=0; jnhrec; j++) +++ { +++ bcf_hrec_t *hrec0 = hdr0->hrec[j]; +++ if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fiels w/o IDX +++ int itag = bcf_hrec_find_key(hrec0, "ID"); +++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL); +++ +++ char *type = NULL; +++ if ( hrec0->type==BCF_HL_FLT ) type = "FILTER"; +++ if ( hrec0->type==BCF_HL_INFO ) type = "INFO"; +++ if ( hrec0->type==BCF_HL_FMT ) type = "FORMAT"; +++ if ( hrec0->type==BCF_HL_CTG ) type = "contig"; +++ +++ if ( !hrec ) +++ error("Cannot use --naive, incompatible headers, the tag %s/%s not present in %s\n",type,hrec0->vals[itag],fname); +++ +++ int idx0 = bcf_hrec_find_key(hrec0, "IDX"); +++ int idx = bcf_hrec_find_key(hrec, "IDX"); +++ if ( idx0<0 || idx<0 ) +++ error("fixme: unexpected IDX<0 for %s/%s in %s or %s\n",type,hrec0->vals[itag],fname0,fname); +++ if ( strcmp(hrec0->vals[idx0],hrec->vals[idx]) ) +++ error("Cannot use --naive, use --naive-force instead: different order the tag %s/%s in %s vs %s\n",type,hrec0->vals[itag],fname0,fname); +++ } +++} +++static void naive_concat_check_headers(args_t *args) +++{ +++ fprintf(stderr,"Checking the headers of %d files.\n",args->nfnames); +++ bcf_hdr_t *hdr0 = NULL; +++ int i,j; +++ for (i=0; infnames; i++) +++ { +++ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); +++ htsFormat type = *hts_get_format(fp); +++ hts_close(fp); +++ +++ if ( i==0 ) +++ { +++ hdr0 = hdr; +++ continue; +++ } +++ +++ // check the samples +++ if ( bcf_hdr_nsamples(hdr0)!=bcf_hdr_nsamples(hdr) ) +++ error("Cannot concatenate, different number of samples: %d vs %d in %s 
vs %s\n",bcf_hdr_nsamples(hdr0),bcf_hdr_nsamples(hdr),args->fnames[0],args->fnames[i]); +++ for (j=0; jsamples[j],hdr->samples[j]) ) +++ error("Cannot concatenate, different samples in %s vs %s\n",args->fnames[0],args->fnames[i]); +++ +++ // if BCF, check if tag IDs are consistent in the dictionary of strings +++ if ( type.compression!=bgzf ) +++ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); +++ if ( type.format==vcf ) +++ { +++ bcf_hdr_destroy(hdr); +++ continue; +++ } +++ +++ _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); +++ _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); +++ +++ bcf_hdr_destroy(hdr); +++ } +++ if ( hdr0 ) bcf_hdr_destroy(hdr0); +++ fprintf(stderr,"Done, the headers are compatible.\n"); +++} ++ static void naive_concat(args_t *args) ++ { +++ if ( !args->naive_concat_trust_headers ) +++ naive_concat_check_headers(args); +++ ++ // only compressed BCF atm ++ BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; ++ +++ struct timeval t0, t1; ++ const size_t page_size = BGZF_MAX_BLOCK_SIZE; ++ uint8_t *buf = (uint8_t*) malloc(page_size); ++ kstring_t tmp = {0,0,0}; ++ int i, file_types = 0; ++ for (i=0; infnames; i++) ++ { +++ if ( args->verbose ) +++ { +++ fprintf(stderr,"Concatenating %s", args->fnames[i]); +++ gettimeofday(&t0, NULL); +++ } ++ htsFile *hts_fp = hts_open(args->fnames[i],"r"); ++- if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); +++ if ( !hts_fp ) error("\nFailed to open: %s\n", args->fnames[i]); ++ htsFormat type = *hts_get_format(hts_fp); ++ ++ if ( type.compression!=bgzf ) ++- error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); +++ error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); ++ file_types |= type.format==vcf ? 
1 : 2; ++ if ( file_types==3 ) ++- error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); +++ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); ++ ++ BGZF *fp = hts_get_bgzfp(hts_fp); ++ if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) ++- error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); +++ error("\nFailed to read %s: %s\n", args->fnames[i], strerror(errno)); ++ ++ int nskip; ++ if ( type.format==bcf ) ++ { ++ uint8_t magic[5]; ++- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); ++- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); +++ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]); ++ ++- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); ++ hts_expand(char,tmp.l,tmp.m,tmp.s); ++- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); ++ ++ // write only the first header ++ if ( i==0 ) ++ { ++- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); ++- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); ++- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); +++ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes 
to %s\n", 5,args->output_fname); +++ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname); +++ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); ++ } ++ nskip = fp->block_offset; ++ } ++ else ++ { ++ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); ++- if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); +++ if ( nskip==-1 ) error("\nError reading %s\n", args->fnames[i]); ++ } ++ ++ // Output all non-header data that were read together with the header block ++ if ( fp->block_length - nskip > 0 ) ++ { ++- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); +++ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); ++ } ++- if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); +++ if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); ++ ++ ++ // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks ++@@ -680,16 +808,22 @@ ++ { ++ nread = bgzf_raw_read(fp, buf, nheader); ++ if ( !nread ) break; ++- if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); +++ if ( nread != nheader || check_header(buf)!=0 ) error("\nCould not parse the header of a bgzf block: %s\n",args->fnames[i]); ++ nblock = unpackInt16(buf+16) + 1; ++ assert( nblock <= page_size && nblock >= nheader ); ++ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); ++- if ( nread!=nblock ) error("Could not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); +++ if ( nread!=nblock ) error("\nCould not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); ++ if ( nread==neof && !memcmp(buf,eof,neof) ) continue; ++ nwr = bgzf_raw_write(bgzf_out, 
buf, nread); ++- if ( nwr != nread ) error("Write failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); +++ if ( nwr != nread ) error("\nWrite failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); +++ } +++ if (hts_close(hts_fp)) error("\nClose failed: %s\n",args->fnames[i]); +++ if ( args->verbose ) +++ { +++ gettimeofday(&t1, NULL); +++ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); +++ fprintf(stderr,"\t%f seconds\n",delta/1e6); ++ } ++- if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); ++ } ++ free(buf); ++ free(tmp.s); ++@@ -705,8 +839,7 @@ ++ fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); ++ fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); ++ fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); ++- fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); ++- fprintf(stderr, " if the BCF headers differ.\n"); +++ fprintf(stderr, " are concatenated without being recompressed, which is very fast.\n"); ++ fprintf(stderr, "Usage: bcftools concat [options] [ [...]]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Options:\n"); ++@@ -717,13 +850,15 @@ ++ fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); ++ fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); ++ fprintf(stderr, " --no-version Do not append version and command line to the header\n"); ++- fprintf(stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); +++ fprintf(stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); +++ fprintf(stderr, " --naive-force Same as --naive, but header compatibility is not checked. 
Dangerous, use with caution.\n"); ++ fprintf(stderr, " -o, --output Write output to a file [standard output]\n"); ++ fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++ fprintf(stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); ++ fprintf(stderr, " -r, --regions Restrict to comma-separated list of regions\n"); ++ fprintf(stderr, " -R, --regions-file Restrict to regions listed in a file\n"); ++- fprintf(stderr, " --threads Number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads Use multithreading with worker threads [0]\n"); +++ fprintf(stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++ } ++@@ -738,10 +873,13 @@ ++ args->n_threads = 0; ++ args->record_cmd_line = 1; ++ args->min_PQ = 30; +++ args->verbose = 1; ++ ++ static struct option loptions[] = ++ { +++ {"verbose",required_argument,NULL,'v'}, ++ {"naive",no_argument,NULL,'n'}, +++ {"naive-force",no_argument,NULL,7}, ++ {"compact-PS",no_argument,NULL,'c'}, ++ {"regions",required_argument,NULL,'r'}, ++ {"regions-file",required_argument,NULL,'R'}, ++@@ -758,7 +896,7 @@ ++ {NULL,0,NULL,0} ++ }; ++ char *tmp; ++- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) ++ { ++ switch (c) { ++ case 'c': args->compact_PS = 1; break; ++@@ -786,6 +924,11 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; +++ case 'v': +++ args->verbose = strtol(optarg, 0, 0); +++ error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); +++ break; ++ case 'h': ++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++@@ -798,7 +941,7 @@ ++ 
args->fnames[args->nfnames-1] = strdup(argv[optind]); ++ optind++; ++ } ++- if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0; +++ if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); ++ if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); ++ if ( args->file_list ) ++ { ++--- python-pysam.orig/bcftools/vcfconcat.c.pysam.c +++++ python-pysam/bcftools/vcfconcat.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. ++ ++- Copyright (C) 2013-2015 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -36,6 +36,8 @@ ++ #include ++ #include ++ #include // for hts_get_bgzfp() +++#include +++#include ++ #include "bcftools.h" ++ ++ typedef struct _args_t ++@@ -55,7 +57,9 @@ ++ ++ char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; ++ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; ++- int compact_PS, phase_set_changed, naive_concat; +++ int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; +++ int verbose; +++ htsThreadPool *tpool; ++ } ++ args_t; ++ ++@@ -72,6 +76,7 @@ ++ line = bcf_init(); ++ } ++ +++ if ( args->verbose ) fprintf(bcftools_stderr,"Checking the headers and starting positions of %d files\n", args->nfnames); ++ kstring_t str = {0,0,0}; ++ int i, prev_chrid = -1; ++ for (i=0; infnames; i++) ++@@ -99,7 +104,7 @@ ++ } ++ } ++ bcf_hdr_destroy(hdr); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); ++ } ++ free(str.s); ++ if ( line ) bcf_destroy(line); ++@@ -114,14 +119,30 @@ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++- ++- bcf_hdr_write(args->out_fh, args->out_hdr); ++- ++- if ( args->allow_overlaps ) +++ if ( args->allow_overlaps || args->phased_concat ) ++ { ++ args->files = bcf_sr_init(); ++ args->files->require_index = 1; +++ } +++ if ( args->n_threads ) +++ { +++ if ( args->files ) +++ { +++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); +++ args->tpool = args->files->p; +++ } +++ else +++ { +++ args->tpool = (htsThreadPool*) calloc(1, sizeof(htsThreadPool)); +++ if ( !args->tpool ) error("Failed to allocate memory\n"); +++ if ( !(args->tpool->pool = hts_tpool_init(args->n_threads)) ) error("Failed to initialize %d threads\n",args->n_threads); +++ } +++ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); +++ } +++ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); +++ +++ if ( args->allow_overlaps ) +++ { ++ if ( args->regions_list ) ++ { ++ if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) ++@@ -169,8 +190,6 @@ ++ args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); ++ args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); ++ args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); ++- args->files = bcf_sr_init(); ++- args->files->require_index = 1; ++ args->ifname = 0; ++ } ++ } ++@@ -178,13 +197,16 @@ ++ static void destroy_data(args_t 
*args) ++ { ++ int i; ++- for (i=0; infnames; i++) free(args->fnames[i]); ++- free(args->fnames); ++- if ( args->files ) bcf_sr_destroy(args->files); ++ if ( args->out_fh ) ++ { ++ if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); ++ } +++ if ( args->tpool && !args->files ) +++ { +++ hts_tpool_destroy(args->tpool->pool); +++ free(args->tpool); +++ } +++ if ( args->files ) bcf_sr_destroy(args->files); ++ if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); ++ free(args->seen_seq); ++ free(args->start_pos); ++@@ -197,6 +219,8 @@ ++ free(args->nmism); ++ free(args->phase_qual); ++ free(args->phase_set); +++ for (i=0; infnames; i++) free(args->fnames[i]); +++ free(args->fnames); ++ } ++ ++ int vcf_write_line(htsFile *fp, kstring_t *line); ++@@ -237,7 +261,7 @@ ++ { ++ if ( !gt_absent_warned ) ++ { ++- fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); +++ fprintf(bcftools_stderr,"GT is not present at %s:%"PRId64". (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1); ++ gt_absent_warned = 1; ++ } ++ continue; ++@@ -248,7 +272,7 @@ ++ { ++ if ( !gt_absent_warned ) ++ { ++- fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); +++ fprintf(bcftools_stderr,"GT is not present at %s:%"PRId64". 
(This warning is printed only once.)\n", bcf_seqname(bhdr,brec), (int64_t) brec->pos+1); ++ gt_absent_warned = 1; ++ } ++ continue; ++@@ -284,9 +308,9 @@ ++ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, arec); +++ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1); +++ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); ++ args->prev_pos_check = arec->pos; ++ } ++ args->nswap = 0; ++@@ -334,9 +358,9 @@ ++ bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, brec); +++ if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1); +++ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); ++ args->prev_pos_check = brec->pos; ++ } ++ args->nbuf = 0; ++@@ -345,9 +369,9 @@ ++ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) ++ { ++ if ( arec && arec->errcode ) ++- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname); +++ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", 
bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname); ++ if ( brec && brec->errcode ) ++- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname); +++ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); ++ ++ int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); ++ int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); ++@@ -375,10 +399,10 @@ ++ bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); ++ args->phase_set_changed = 0; ++ } ++- bcf_write(args->out_fh, args->out_hdr, arec); +++ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++ if ( arec->pos < args->prev_pos_check ) ++- error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); +++ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); ++ args->prev_pos_check = arec->pos; ++ return; ++ } ++@@ -395,6 +419,7 @@ ++ ++ static void concat(args_t *args) ++ { +++ static int site_drop_warned = 0; ++ int i; ++ if ( args->phased_concat ) // phased concat ++ { ++@@ -431,8 +456,20 @@ ++ if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader ++ { ++ // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped ++- if ( ! bcf_sr_region_done(args->files,0) ) continue; ++- +++ if ( ! bcf_sr_region_done(args->files,0) ) +++ { +++ if ( !site_drop_warned ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" +++ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" +++ " This warning is printed only once.\n", +++ bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 +++ ); +++ site_drop_warned = 1; +++ } +++ continue; +++ } ++ phased_flush(args); ++ bcf_sr_remove_reader(args->files, 0); ++ } ++@@ -485,20 +522,27 @@ ++ bcf1_t *line = bcf_sr_get_line(args->files,i); ++ if ( !line ) continue; ++ bcf_translate(args->out_hdr, args->files->readers[i].header, line); ++- bcf_write1(args->out_fh, args->out_hdr, line); +++ if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->remove_dups ) break; ++ } ++ } ++ } ++ else // concatenating ++ { +++ struct timeval t0, t1; ++ kstring_t tmp = {0,0,0}; ++ int prev_chr_id = -1, prev_pos; ++ bcf1_t *line = bcf_init(); ++ for (i=0; infnames; i++) ++ { ++- htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); ++- bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); +++ if ( args->verbose ) +++ { +++ fprintf(bcftools_stderr,"Concatenating %s", args->fnames[i]); +++ gettimeofday(&t0, NULL); +++ } +++ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); +++ if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); ++ if ( !fp->is_bin && args->output_type&FT_VCF ) ++ { ++ line->max_unpack = BCF_UN_STR; ++@@ -510,7 +554,7 @@ ++ tmp.l = 0; ++ kputsn(fp->line.s,str-fp->line.s,&tmp); ++ int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); ++- if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick 
workaround: index the file.)\n", tmp.s, args->fnames[i]); +++ if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); ++ if ( prev_chr_id!=chr_id ) ++ { ++ prev_pos = -1; ++@@ -521,11 +565,11 @@ ++ int pos = strtol(str+1,&end,10) - 1; ++ if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); ++ if ( prev_pos > pos ) ++- error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); +++ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", tmp.s); ++ args->seen_seq[chr_id] = 1; ++ prev_chr_id = chr_id; ++ ++- if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); +++ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("\nFailed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); ++ } ++ } ++ else ++@@ -543,15 +587,21 @@ ++ error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); ++ } ++ if ( prev_pos > line->pos ) ++- error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); +++ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); ++ args->seen_seq[line->rid] = 1; ++ prev_chr_id = line->rid; ++ ++- if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); +++ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("\nFailed to write\n"); ++ } ++ } ++ bcf_hdr_destroy(hdr); ++ hts_close(fp); +++ if ( args->verbose ) +++ { +++ gettimeofday(&t1, NULL); +++ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); +++ fprintf(bcftools_stderr,"\t%f seconds\n",delta/1e6); +++ } ++ } ++ bcf_destroy(line); ++ free(tmp.s); ++@@ -614,63 +664,141 @@ ++ && header[12] == 'B' && header[13] == 'C' ++ && unpackInt16((uint8_t*)&header[14]) == 2) ? 
0 : -1; ++ } +++static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fname0, char *fname) +++{ +++ int j; +++ for (j=0; jnhrec; j++) +++ { +++ bcf_hrec_t *hrec0 = hdr0->hrec[j]; +++ if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fiels w/o IDX +++ int itag = bcf_hrec_find_key(hrec0, "ID"); +++ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL); +++ +++ char *type = NULL; +++ if ( hrec0->type==BCF_HL_FLT ) type = "FILTER"; +++ if ( hrec0->type==BCF_HL_INFO ) type = "INFO"; +++ if ( hrec0->type==BCF_HL_FMT ) type = "FORMAT"; +++ if ( hrec0->type==BCF_HL_CTG ) type = "contig"; +++ +++ if ( !hrec ) +++ error("Cannot use --naive, incompatible headers, the tag %s/%s not present in %s\n",type,hrec0->vals[itag],fname); +++ +++ int idx0 = bcf_hrec_find_key(hrec0, "IDX"); +++ int idx = bcf_hrec_find_key(hrec, "IDX"); +++ if ( idx0<0 || idx<0 ) +++ error("fixme: unexpected IDX<0 for %s/%s in %s or %s\n",type,hrec0->vals[itag],fname0,fname); +++ if ( strcmp(hrec0->vals[idx0],hrec->vals[idx]) ) +++ error("Cannot use --naive, use --naive-force instead: different order the tag %s/%s in %s vs %s\n",type,hrec0->vals[itag],fname0,fname); +++ } +++} +++static void naive_concat_check_headers(args_t *args) +++{ +++ fprintf(bcftools_stderr,"Checking the headers of %d files.\n",args->nfnames); +++ bcf_hdr_t *hdr0 = NULL; +++ int i,j; +++ for (i=0; infnames; i++) +++ { +++ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); +++ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); +++ htsFormat type = *hts_get_format(fp); +++ hts_close(fp); +++ +++ if ( i==0 ) +++ { +++ hdr0 = hdr; +++ continue; +++ } +++ +++ // check the samples +++ if ( bcf_hdr_nsamples(hdr0)!=bcf_hdr_nsamples(hdr) ) +++ error("Cannot concatenate, different number of samples: %d vs 
%d in %s vs %s\n",bcf_hdr_nsamples(hdr0),bcf_hdr_nsamples(hdr),args->fnames[0],args->fnames[i]); +++ for (j=0; jsamples[j],hdr->samples[j]) ) +++ error("Cannot concatenate, different samples in %s vs %s\n",args->fnames[0],args->fnames[i]); +++ +++ // if BCF, check if tag IDs are consistent in the dictionary of strings +++ if ( type.compression!=bgzf ) +++ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); +++ if ( type.format==vcf ) +++ { +++ bcf_hdr_destroy(hdr); +++ continue; +++ } +++ +++ _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); +++ _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); +++ +++ bcf_hdr_destroy(hdr); +++ } +++ if ( hdr0 ) bcf_hdr_destroy(hdr0); +++ fprintf(bcftools_stderr,"Done, the headers are compatible.\n"); +++} ++ static void naive_concat(args_t *args) ++ { +++ if ( !args->naive_concat_trust_headers ) +++ naive_concat_check_headers(args); +++ ++ // only compressed BCF atm ++ BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; ++ +++ struct timeval t0, t1; ++ const size_t page_size = BGZF_MAX_BLOCK_SIZE; ++ uint8_t *buf = (uint8_t*) malloc(page_size); ++ kstring_t tmp = {0,0,0}; ++ int i, file_types = 0; ++ for (i=0; infnames; i++) ++ { +++ if ( args->verbose ) +++ { +++ fprintf(bcftools_stderr,"Concatenating %s", args->fnames[i]); +++ gettimeofday(&t0, NULL); +++ } ++ htsFile *hts_fp = hts_open(args->fnames[i],"r"); ++- if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); +++ if ( !hts_fp ) error("\nFailed to open: %s\n", args->fnames[i]); ++ htsFormat type = *hts_get_format(hts_fp); ++ ++ if ( type.compression!=bgzf ) ++- error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); +++ error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); ++ file_types |= type.format==vcf ? 
1 : 2; ++ if ( file_types==3 ) ++- error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); +++ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); ++ ++ BGZF *fp = hts_get_bgzfp(hts_fp); ++ if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) ++- error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); +++ error("\nFailed to read %s: %s\n", args->fnames[i], strerror(errno)); ++ ++ int nskip; ++ if ( type.format==bcf ) ++ { ++ uint8_t magic[5]; ++- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); ++- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); +++ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]); ++ ++- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); ++ hts_expand(char,tmp.l,tmp.m,tmp.s); ++- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); +++ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); ++ ++ // write only the first header ++ if ( i==0 ) ++ { ++- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); ++- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); ++- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); +++ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes 
to %s\n", 5,args->output_fname); +++ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname); +++ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); ++ } ++ nskip = fp->block_offset; ++ } ++ else ++ { ++ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); ++- if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); +++ if ( nskip==-1 ) error("\nError reading %s\n", args->fnames[i]); ++ } ++ ++ // Output all non-header data that were read together with the header block ++ if ( fp->block_length - nskip > 0 ) ++ { ++- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); +++ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); ++ } ++- if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); +++ if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); ++ ++ ++ // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks ++@@ -682,16 +810,22 @@ ++ { ++ nread = bgzf_raw_read(fp, buf, nheader); ++ if ( !nread ) break; ++- if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); +++ if ( nread != nheader || check_header(buf)!=0 ) error("\nCould not parse the header of a bgzf block: %s\n",args->fnames[i]); ++ nblock = unpackInt16(buf+16) + 1; ++ assert( nblock <= page_size && nblock >= nheader ); ++ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); ++- if ( nread!=nblock ) error("Could not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); +++ if ( nread!=nblock ) error("\nCould not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); ++ if ( nread==neof && !memcmp(buf,eof,neof) ) continue; ++ nwr = bgzf_raw_write(bgzf_out, 
buf, nread); ++- if ( nwr != nread ) error("Write failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); +++ if ( nwr != nread ) error("\nWrite failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); +++ } +++ if (hts_close(hts_fp)) error("\nClose failed: %s\n",args->fnames[i]); +++ if ( args->verbose ) +++ { +++ gettimeofday(&t1, NULL); +++ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); +++ fprintf(bcftools_stderr,"\t%f seconds\n",delta/1e6); ++ } ++- if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); ++ } ++ free(buf); ++ free(tmp.s); ++@@ -707,8 +841,7 @@ ++ fprintf(bcftools_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); ++ fprintf(bcftools_stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); ++ fprintf(bcftools_stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); ++- fprintf(bcftools_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); ++- fprintf(bcftools_stderr, " if the BCF headers differ.\n"); +++ fprintf(bcftools_stderr, " are concatenated without being recompressed, which is very fast.\n"); ++ fprintf(bcftools_stderr, "Usage: bcftools concat [options] [ [...]]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Options:\n"); ++@@ -719,13 +852,15 @@ ++ fprintf(bcftools_stderr, " -f, --file-list Read the list of files from a file.\n"); ++ fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); ++ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); ++- fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); +++ fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); +++ 
fprintf(bcftools_stderr, " --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution.\n"); ++ fprintf(bcftools_stderr, " -o, --output Write output to a file [standard output]\n"); ++ fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++ fprintf(bcftools_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); ++ fprintf(bcftools_stderr, " -r, --regions Restrict to comma-separated list of regions\n"); ++ fprintf(bcftools_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); ++- fprintf(bcftools_stderr, " --threads Number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads Use multithreading with worker threads [0]\n"); +++ fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++@@ -740,10 +875,13 @@ ++ args->n_threads = 0; ++ args->record_cmd_line = 1; ++ args->min_PQ = 30; +++ args->verbose = 1; ++ ++ static struct option loptions[] = ++ { +++ {"verbose",required_argument,NULL,'v'}, ++ {"naive",no_argument,NULL,'n'}, +++ {"naive-force",no_argument,NULL,7}, ++ {"compact-PS",no_argument,NULL,'c'}, ++ {"regions",required_argument,NULL,'r'}, ++ {"regions-file",required_argument,NULL,'R'}, ++@@ -760,7 +898,7 @@ ++ {NULL,0,NULL,0} ++ }; ++ char *tmp; ++- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) +++ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) ++ { ++ switch (c) { ++ case 'c': args->compact_PS = 1; break; ++@@ -788,6 +926,11 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; +++ case 'v': +++ args->verbose = strtol(optarg, 0, 0); +++ error("Error: currently only --verbose 0 or 
--verbose 1 is supported\n"); +++ break; ++ case 'h': ++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++@@ -800,7 +943,7 @@ ++ args->fnames[args->nfnames-1] = strdup(argv[optind]); ++ optind++; ++ } ++- if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0; +++ if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); ++ if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); ++ if ( args->file_list ) ++ { ++--- python-pysam.orig/bcftools/vcfconvert.c +++++ python-pysam/bcftools/vcfconvert.c ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -387,7 +388,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ nsamples -= 2; ++@@ -399,7 +400,9 @@ ++ bcf_clear(rec); ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ error("Error occurred while parsing: %s\n", line.s); ++ } ++@@ -513,7 +516,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the 
header to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); ++@@ -531,7 +534,7 @@ ++ if ( tsv_parse(hap_tsv, rec, line.s) ) ++ error("Error occurred while parsing %s: %s\n", hap_fname,line.s); ++ ++- bcf_write(out_fh, args->header, rec); +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 ) ++ { ++@@ -627,7 +630,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ nsamples -= 2; ++@@ -638,7 +641,9 @@ ++ bcf_clear(rec); ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ error("Error occurred while parsing: %s\n", line.s); ++ } ++@@ -938,9 +943,9 @@ ++ if (legend_fname) { ++ str.l = 0; ++ if ( args->output_vcf_ids && (line->d.id[0]!='.' 
|| line->d.id[1]!=0) ) ++- ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]); +++ ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); ++ else ++- ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]); +++ ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); ++ ++ // write legend file ++ ret = bgzf_write(lout, str.s, str.l); ++@@ -1141,7 +1146,7 @@ ++ ++ int len; ++ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ ++ int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n ++ ref[0] = toupper(ref[0]); ++@@ -1156,10 +1161,10 @@ ++ if ( i>0 ) ++ { ++ ret = tsv_next(tsv); ++- if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ } ++ ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); ++- if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ if ( ret==-2 ) ++ { ++ // something else than a 
SNP ++@@ -1213,7 +1218,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); ++ if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); ++@@ -1234,7 +1239,9 @@ ++ ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ args->n.skipped++; ++ } ++@@ -1242,7 +1249,7 @@ ++ free(line.s); ++ ++ bcf_hdr_destroy(args->header); ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); ++ tsv_destroy(tsv); ++ bcf_destroy(rec); ++ free(args->str.s); ++@@ -1265,7 +1272,7 @@ ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); ++- bcf_hdr_write(out_fh,hdr); +++ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ while ( bcf_sr_next_line(args->files) ) ++ { ++@@ -1276,9 +1283,9 @@ ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) continue; ++ } ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ } ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); ++ } ++ ++ static void gvcf_to_vcf(args_t *args) ++@@ -1295,7 +1302,7 @@ ++ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); ++ if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); ++- bcf_hdr_write(out_fh,hdr); +++ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ int32_t *itmp = NULL, nitmp = 0; ++ ++@@ -1308,7 +1315,7 @@ ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) ++ { ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ } ++@@ -1332,7 +1339,7 @@ ++ // no gVCF compatible alleles ++ if (gallele<0) ++ { ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ ++@@ -1340,7 +1347,7 @@ ++ if ( nend!=1 ) ++ { ++ // No INFO/END => not gVCF record ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ bcf_update_info_int32(hdr,line,"END",NULL,0); ++@@ -1349,14 +1356,14 @@ ++ { ++ line->pos = pos; ++ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); ++ strncpy(line->d.allele[0],ref,len); ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ free(ref); ++ } ++ } ++ free(itmp); ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); ++ } ++ ++ static void usage(void) ++@@ -1381,7 +1388,7 @@ ++ fprintf(stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(stderr, " -o, --output output file name [stdout]\n"); ++ fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); ++ fprintf(stderr, " -G, --gensample2vcf <...> |,\n"); ++@@ -1505,7 +1512,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 10 : args->record_cmd_line = 0; break; ++ case 11 : args->sex_fname = optarg; break; ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfconvert.c.pysam.c +++++ python-pysam/bcftools/vcfconvert.c.pysam.c ++@@ -33,6 +33,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -389,7 +390,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ nsamples -= 2; ++@@ -401,7 +402,9 @@ ++ bcf_clear(rec); ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ error("Error occurred while 
parsing: %s\n", line.s); ++ } ++@@ -515,7 +518,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); ++@@ -533,7 +536,7 @@ ++ if ( tsv_parse(hap_tsv, rec, line.s) ) ++ error("Error occurred while parsing %s: %s\n", hap_fname,line.s); ++ ++- bcf_write(out_fh, args->header, rec); +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 ) ++ { ++@@ -629,7 +632,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ bcf1_t *rec = bcf_init(); ++ ++ nsamples -= 2; ++@@ -640,7 +643,9 @@ ++ bcf_clear(rec); ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ error("Error occurred while parsing: %s\n", line.s); ++ } ++@@ -940,9 +945,9 @@ ++ if (legend_fname) { ++ str.l = 0; ++ if ( args->output_vcf_ids && (line->d.id[0]!='.' 
|| line->d.id[1]!=0) ) ++- ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]); +++ ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); ++ else ++- ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]); +++ ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); ++ ++ // write legend file ++ ret = bgzf_write(lout, str.s, str.l); ++@@ -1143,7 +1148,7 @@ ++ ++ int len; ++ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ ++ int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n ++ ref[0] = toupper(ref[0]); ++@@ -1158,10 +1163,10 @@ ++ if ( i>0 ) ++ { ++ ret = tsv_next(tsv); ++- if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ } ++ ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); ++- if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); +++ if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); ++ if ( ret==-2 ) ++ { ++ // something else than a 
SNP ++@@ -1215,7 +1220,7 @@ ++ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); ++ if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++- bcf_hdr_write(out_fh,args->header); +++ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); ++ if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); ++@@ -1236,7 +1241,9 @@ ++ ++ args->n.total++; ++ if ( !tsv_parse(tsv, rec, line.s) ) ++- bcf_write(out_fh, args->header, rec); +++ { +++ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); +++ } ++ else ++ args->n.skipped++; ++ } ++@@ -1244,7 +1251,7 @@ ++ free(line.s); ++ ++ bcf_hdr_destroy(args->header); ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); ++ tsv_destroy(tsv); ++ bcf_destroy(rec); ++ free(args->str.s); ++@@ -1267,7 +1274,7 @@ ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); ++- bcf_hdr_write(out_fh,hdr); +++ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ while ( bcf_sr_next_line(args->files) ) ++ { ++@@ -1278,9 +1285,9 @@ ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) continue; ++ } ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ } ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); ++ } ++ ++ static void gvcf_to_vcf(args_t *args) ++@@ -1297,7 +1304,7 @@ ++ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); ++ if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); ++- bcf_hdr_write(out_fh,hdr); +++ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ ++ int32_t *itmp = NULL, nitmp = 0; ++ ++@@ -1310,7 +1317,7 @@ ++ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; ++ if ( !pass ) ++ { ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ } ++@@ -1334,7 +1341,7 @@ ++ // no gVCF compatible alleles ++ if (gallele<0) ++ { ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ ++@@ -1342,7 +1349,7 @@ ++ if ( nend!=1 ) ++ { ++ // No INFO/END => not gVCF record ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ continue; ++ } ++ bcf_update_info_int32(hdr,line,"END",NULL,0); ++@@ -1351,14 +1358,14 @@ ++ { ++ line->pos = pos; ++ char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); ++ strncpy(line->d.allele[0],ref,len); ++- bcf_write(out_fh,hdr,line); +++ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); ++ free(ref); ++ } ++ } ++ free(itmp); ++- hts_close(out_fh); +++ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); ++ } ++ ++ static void usage(void) ++@@ -1383,7 +1390,7 @@ ++ fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); ++ fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); ++ fprintf(bcftools_stderr, " -G, --gensample2vcf <...> |,\n"); ++@@ -1507,7 +1514,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 10 : args->record_cmd_line = 0; break; ++ case 11 : args->sex_fname = optarg; break; ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcffilter.c +++++ python-pysam/bcftools/vcffilter.c ++@@ -188,7 +188,7 @@ ++ if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; } ++ } ++ } ++- if ( pass ) bcf_write1(args->out_fh, args->hdr, rec); +++ if ( pass && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -278,7 +278,7 @@ ++ if ( k_flush || !line ) ++ { ++ // Select the best indel from the cluster of k_flush indels ++- int k = 0, max_ac = -1, imax_ac = -1; +++ int k = 0, max_ac = -1, imax_ac = -1, max_qual = -1, imax_qual = -1; ++ for (i=-1; rbuf_next(&args->rbuf,&i) && kn_allele, args->ntmpi, args->tmpi); ++ int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); ++ if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } +++ if ( imax_qual==-1 || max_qual < rec->qual ) { 
max_qual = rec->qual; imax_qual = i; } ++ } ++ ++- // Filter all but the best indel (with max AF or first if AF not available) +++ // Filter all but the best indel (with the best QUAL, bigger AC, or take the first if neither QUAL nor AC are available) ++ k = 0; ++ for (i=-1; rbuf_next(&args->rbuf,&i) && krbuf_lines[i]; ++ if ( !(rec->d.var_type & IndelGap_set) ) continue; ++ rec->d.var_type |= IndelGap_flush; ++- if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); +++ +++ int do_filter = 0; +++ if ( max_qual>0 ) +++ { +++ if ( i!=imax_qual ) do_filter = 1; +++ } +++ else if ( i!=imax_ac ) do_filter = 1; +++ if ( do_filter ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); ++ } ++ } ++ } ++@@ -418,7 +426,7 @@ ++ fprintf(stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++ } ++@@ -494,7 +502,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -531,10 +539,10 @@ ++ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) ++ error("Failed to read the targets: %s\n", args->targets_list); ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard 
input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++- bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); ++ while ( bcf_sr_next_line(args->files) ) ++ { ++ bcf1_t *line = bcf_sr_get_line(args->files, 0); ++@@ -558,14 +566,16 @@ ++ } ++ if ( args->set_gts ) set_genotypes(args, line, pass); ++ if ( !args->rbuf_lines ) ++- bcf_write1(args->out_fh, args->hdr, line); +++ { +++ if ( bcf_write1(args->out_fh, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ } ++ else ++ buffered_filters(args, line); ++ } ++ } ++ buffered_filters(args, NULL); ++ ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++ free(args); ++--- python-pysam.orig/bcftools/vcffilter.c.pysam.c +++++ python-pysam/bcftools/vcffilter.c.pysam.c ++@@ -190,7 +190,7 @@ ++ if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; } ++ } ++ } ++- if ( pass ) bcf_write1(args->out_fh, args->hdr, rec); +++ if ( pass && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -280,7 +280,7 @@ ++ if ( k_flush || !line ) ++ { ++ // Select the best indel from the cluster of k_flush indels ++- int k = 0, max_ac = -1, imax_ac = -1; +++ int k = 0, max_ac = -1, imax_ac = -1, max_qual = -1, imax_qual = -1; ++ for (i=-1; rbuf_next(&args->rbuf,&i) && kn_allele, args->ntmpi, args->tmpi); ++ int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); ++ if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } +++ if ( imax_qual==-1 || max_qual < rec->qual ) { max_qual = rec->qual; imax_qual = i; } ++ } ++ ++- // Filter all but the best indel (with max AF or 
first if AF not available) +++ // Filter all but the best indel (with the best QUAL, bigger AC, or take the first if neither QUAL nor AC are available) ++ k = 0; ++ for (i=-1; rbuf_next(&args->rbuf,&i) && krbuf_lines[i]; ++ if ( !(rec->d.var_type & IndelGap_set) ) continue; ++ rec->d.var_type |= IndelGap_flush; ++- if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); +++ +++ int do_filter = 0; +++ if ( max_qual>0 ) +++ { +++ if ( i!=imax_qual ) do_filter = 1; +++ } +++ else if ( i!=imax_ac ) do_filter = 1; +++ if ( do_filter ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); ++ } ++ } ++ } ++@@ -420,7 +428,7 @@ ++ fprintf(bcftools_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++@@ -496,7 +504,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -533,10 +541,10 @@ ++ if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) ++ error("Failed to read the targets: %s\n", args->targets_list); ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++- 
bcf_hdr_write(args->out_fh, args->hdr); +++ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); ++ while ( bcf_sr_next_line(args->files) ) ++ { ++ bcf1_t *line = bcf_sr_get_line(args->files, 0); ++@@ -560,14 +568,16 @@ ++ } ++ if ( args->set_gts ) set_genotypes(args, line, pass); ++ if ( !args->rbuf_lines ) ++- bcf_write1(args->out_fh, args->hdr, line); +++ { +++ if ( bcf_write1(args->out_fh, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); +++ } ++ else ++ buffered_filters(args, line); ++ } ++ } ++ buffered_filters(args, NULL); ++ ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++ free(args); ++--- python-pysam.orig/bcftools/vcfgtcheck.c +++++ python-pysam/bcftools/vcfgtcheck.c ++@@ -302,7 +302,7 @@ ++ int fake_PL = args->no_PLs ? 
args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs ++ int nsm_gt, i; ++ if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) ++- error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); +++ error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); ++ nsm_gt /= bcf_hdr_nsamples(hdr); ++ int npl = line->n_allele*(line->n_allele+1)/2; ++ hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); ++@@ -399,7 +399,7 @@ ++ // Target genotypes ++ int ngt, npl; ++ if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) ++- error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); +++ error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); ++ ngt /= bcf_hdr_nsamples(args->gt_hdr); ++ if ( ngt!=2 ) continue; // checking only diploid genotypes ++ ++@@ -415,7 +415,7 @@ ++ npl = fake_PLs(args, args->sm_hdr, sm_line); ++ } ++ else ++- error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); +++ error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); ++ } ++ else ++ npl /= bcf_hdr_nsamples(args->sm_hdr); ++@@ -460,7 +460,7 @@ ++ int a = bcf_gt_allele(gt_ptr[0]); ++ int b = bcf_gt_allele(gt_ptr[1]); ++ if ( args->hom_only && a!=b ) continue; // heterozygous genotype ++- fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); +++ fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); ++ for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); ++ fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? 
gt_line->d.allele[b] : "."); ++ fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); ++@@ -515,7 +515,7 @@ ++ ++ if ( args->plot ) ++ { ++- fclose(fp); +++ if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); ++ plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); ++ } ++ } ++@@ -788,7 +788,7 @@ ++ case 't': targets = optarg; break; ++ case 'T': targets = optarg; targets_is_file = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -805,7 +805,8 @@ ++ if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); ++ if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); ++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); ++- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); +++ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); ++ args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; ++ if ( args->plot ) args->plot = init_prefix(args->plot); ++ init_data(args); ++--- python-pysam.orig/bcftools/vcfgtcheck.c.pysam.c +++++ python-pysam/bcftools/vcfgtcheck.c.pysam.c ++@@ -304,7 +304,7 @@ ++ int fake_PL = args->no_PLs ? 
args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs ++ int nsm_gt, i; ++ if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) ++- error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); +++ error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); ++ nsm_gt /= bcf_hdr_nsamples(hdr); ++ int npl = line->n_allele*(line->n_allele+1)/2; ++ hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); ++@@ -401,7 +401,7 @@ ++ // Target genotypes ++ int ngt, npl; ++ if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) ++- error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); +++ error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); ++ ngt /= bcf_hdr_nsamples(args->gt_hdr); ++ if ( ngt!=2 ) continue; // checking only diploid genotypes ++ ++@@ -417,7 +417,7 @@ ++ npl = fake_PLs(args, args->sm_hdr, sm_line); ++ } ++ else ++- error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); +++ error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); ++ } ++ else ++ npl /= bcf_hdr_nsamples(args->sm_hdr); ++@@ -462,7 +462,7 @@ ++ int a = bcf_gt_allele(gt_ptr[0]); ++ int b = bcf_gt_allele(gt_ptr[1]); ++ if ( args->hom_only && a!=b ) continue; // heterozygous genotype ++- fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); +++ fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); ++ for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); ++ fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? 
gt_line->d.allele[b] : "."); ++ fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); ++@@ -517,7 +517,7 @@ ++ ++ if ( args->plot ) ++ { ++- fclose(fp); +++ if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); ++ plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); ++ } ++ } ++@@ -790,7 +790,7 @@ ++ case 't': targets = optarg; break; ++ case 'T': targets = optarg; targets_is_file = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -807,7 +807,8 @@ ++ if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); ++ if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); ++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); ++- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); +++ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) +++ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); ++ args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; ++ if ( args->plot ) args->plot = init_prefix(args->plot); ++ init_data(args); ++--- python-pysam.orig/bcftools/vcfindex.c +++++ python-pysam/bcftools/vcfindex.c ++@@ -49,7 +49,7 @@ ++ fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); ++ fprintf(stderr, " -o, --output-file FILE optional output index file name\n"); ++ fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); ++- fprintf(stderr, " --threads sets the number of threads [0]\n"); +++ fprintf(stderr, " --threads INT use 
multithreading with INT worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Stats options:\n"); ++ fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n"); ++@@ -112,7 +112,7 @@ ++ } ++ if (stats&2) printf("%" PRIu64 "\n", sum); ++ free(seq); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); ++ bcf_hdr_destroy(hdr); ++ if (tbx) ++ tbx_destroy(tbx); ++--- python-pysam.orig/bcftools/vcfindex.c.pysam.c +++++ python-pysam/bcftools/vcfindex.c.pysam.c ++@@ -51,7 +51,7 @@ ++ fprintf(bcftools_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); ++ fprintf(bcftools_stderr, " -o, --output-file FILE optional output index file name\n"); ++ fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); ++- fprintf(bcftools_stderr, " --threads sets the number of threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Stats options:\n"); ++ fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n"); ++@@ -114,7 +114,7 @@ ++ } ++ if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum); ++ free(seq); ++- hts_close(fp); +++ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); ++ bcf_hdr_destroy(hdr); ++ if (tbx) ++ tbx_destroy(tbx); ++--- python-pysam.orig/bcftools/vcfisec.c +++++ python-pysam/bcftools/vcfisec.c ++@@ -1,6 +1,6 @@ ++ /* vcfisec.c -- Create intersections, unions and complements of VCF files. ++ ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -33,6 +33,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ #include "filter.h" ++ ++@@ -144,7 +145,7 @@ ++ if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? 
args->output_fname : "standard output", strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++ if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); ++- bcf_hdr_write(out_fh, files->readers[args->iwrite].header); +++ if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); ++ } ++ if ( !args->nwrite && !out_std && !args->prefix ) ++ fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); ++@@ -195,8 +196,8 @@ ++ ++ if ( out_std ) ++ { ++- if ( bcf_sr_has_line(files,args->iwrite) ) ++- bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); +++ if ( bcf_sr_has_line(files,args->iwrite) && bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 ) +++ error("[%s] Error: cannot write to %s\n", __func__, args->output_fname ? args->output_fname : "standard output"); ++ continue; ++ } ++ else if ( args->fh_sites ) ++@@ -218,7 +219,8 @@ ++ for (i=0; inreaders; i++) ++ kputc(bcf_sr_has_line(files,i)?'1':'0', &str); ++ kputc('\n', &str); ++- fwrite(str.s,sizeof(char),str.l,args->fh_sites); +++ if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l ) +++ error("[%s] Error: failed to write %d bytes to %s\n", __func__,(int)str.l,args->output_fname ? 
args->output_fname : "standard output"); ++ } ++ ++ if ( args->prefix ) ++@@ -226,9 +228,15 @@ ++ if ( args->isec_op==OP_VENN && ret==3 ) ++ { ++ if ( !args->nwrite || args->write[0] ) ++- bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0)); +++ { +++ if ( bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 ) +++ error("[%s] Error: cannot write\n", __func__); +++ } ++ if ( !args->nwrite || args->write[1] ) ++- bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1)); +++ { +++ if ( bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 ) +++ error("[%s] Error: cannot write\n", __func__); +++ } ++ } ++ else ++ { ++@@ -236,13 +244,13 @@ ++ { ++ if ( !bcf_sr_has_line(files,i) ) continue; ++ if ( args->write && !args->write[i] ) continue; ++- bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); +++ if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 ) error("[%s] Error: cannot write\n", __func__); ++ } ++ } ++ } ++ } ++ if ( str.s ) free(str.s); ++- if ( out_fh ) hts_close(out_fh); +++ if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); ++ } ++ ++ static void add_filter(args_t *args, char *expr, int logic) ++@@ -352,7 +360,7 @@ ++ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ ++ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ ++- bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ +++ if ( bcf_hdr_write(args->fh_out[i], args->files->readers[j].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fnames[i]); \ ++ } ++ if ( !args->nwrite || args->write[0] ) ++ { ++@@ -425,7 +433,7 @@ ++ for (i=0; ifnames[i] ) continue; ++- hts_close(args->fh_out[i]); +++ if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]); ++ if ( args->output_type==FT_VCF_GZ ) ++ { ++ tbx_conf_t conf = tbx_conf_vcf; ++@@ -465,7 +473,7 @@ ++ fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Examples:\n"); ++@@ -478,6 +486,9 @@ ++ fprintf(stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); ++ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); ++ fprintf(stderr, "\n"); +++ fprintf(stderr, " # Extract and write records from C found in A and C but not in B\n"); +++ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz C.vcf.gz -p dir -n~101 -w 3\n"); +++ fprintf(stderr, "\n"); ++ fprintf(stderr, " # Extract records private to A or B comparing by position only\n"); ++ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); ++ fprintf(stderr, "\n"); ++@@ -540,7 +551,9 @@ ++ else error("The --collapse string \"%s\" not recognised.\n", optarg); ++ break; ++ case 'f': args->files->apply_filters = optarg; break; ++- case 'C': args->isec_op = OP_COMPLEMENT; break; +++ case 'C': +++ if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); +++ args->isec_op = OP_COMPLEMENT; break; ++ case 'r': args->regions_list = optarg; break; ++ case 'R': args->regions_list = optarg; regions_is_file = 1; break; ++ case 't': args->targets_list = optarg; break; ++@@ -551,6 +564,8 @@ ++ case 'e': add_filter(args, optarg, FLT_EXCLUDE); break; ++ case 'n': ++ { +++ if ( args->isec_op!=0 && args->isec_op==OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); +++ if ( args->isec_op!=0 ) error("Error: -n should be given only once.\n"); ++ char *p = optarg; ++ if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } ++ else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } ++@@ -565,7 +580,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- 
python-pysam.orig/bcftools/vcfisec.c.pysam.c +++++ python-pysam/bcftools/vcfisec.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* vcfisec.c -- Create intersections, unions and complements of VCF files. ++ ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -35,6 +35,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "bcftools.h" ++ #include "filter.h" ++ ++@@ -146,7 +147,7 @@ ++ if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); ++ if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); ++- bcf_hdr_write(out_fh, files->readers[args->iwrite].header); +++ if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); ++ } ++ if ( !args->nwrite && !out_std && !args->prefix ) ++ fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n"); ++@@ -197,8 +198,8 @@ ++ ++ if ( out_std ) ++ { ++- if ( bcf_sr_has_line(files,args->iwrite) ) ++- bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); +++ if ( bcf_sr_has_line(files,args->iwrite) && bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 ) +++ error("[%s] Error: cannot write to %s\n", __func__, args->output_fname ? 
args->output_fname : "standard output"); ++ continue; ++ } ++ else if ( args->fh_sites ) ++@@ -220,7 +221,8 @@ ++ for (i=0; inreaders; i++) ++ kputc(bcf_sr_has_line(files,i)?'1':'0', &str); ++ kputc('\n', &str); ++- fwrite(str.s,sizeof(char),str.l,args->fh_sites); +++ if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l ) +++ error("[%s] Error: failed to write %d bytes to %s\n", __func__,(int)str.l,args->output_fname ? args->output_fname : "standard output"); ++ } ++ ++ if ( args->prefix ) ++@@ -228,9 +230,15 @@ ++ if ( args->isec_op==OP_VENN && ret==3 ) ++ { ++ if ( !args->nwrite || args->write[0] ) ++- bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0)); +++ { +++ if ( bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 ) +++ error("[%s] Error: cannot write\n", __func__); +++ } ++ if ( !args->nwrite || args->write[1] ) ++- bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1)); +++ { +++ if ( bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 ) +++ error("[%s] Error: cannot write\n", __func__); +++ } ++ } ++ else ++ { ++@@ -238,13 +246,13 @@ ++ { ++ if ( !bcf_sr_has_line(files,i) ) continue; ++ if ( args->write && !args->write[i] ) continue; ++- bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); +++ if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 ) error("[%s] Error: cannot write\n", __func__); ++ } ++ } ++ } ++ } ++ if ( str.s ) free(str.s); ++- if ( out_fh ) hts_close(out_fh); +++ if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); ++ } ++ ++ static void add_filter(args_t *args, char *expr, int logic) ++@@ -354,7 +362,7 @@ ++ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ ++ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ ++ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ ++- bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ +++ if ( bcf_hdr_write(args->fh_out[i], args->files->readers[j].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fnames[i]); \ ++ } ++ if ( !args->nwrite || args->write[0] ) ++ { ++@@ -427,7 +435,7 @@ ++ for (i=0; ifnames[i] ) continue; ++- hts_close(args->fh_out[i]); +++ if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]); ++ if ( args->output_type==FT_VCF_GZ ) ++ { ++ tbx_conf_t conf = tbx_conf_vcf; ++@@ -467,7 +475,7 @@ ++ fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Examples:\n"); ++@@ -480,6 +488,9 @@ ++ fprintf(bcftools_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); ++ fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); ++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, " # Extract and write records from C found in A and C but not in B\n"); +++ fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz C.vcf.gz -p dir -n~101 -w 3\n"); +++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, " # Extract records private to A or B comparing by position only\n"); ++ fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); ++ fprintf(bcftools_stderr, "\n"); ++@@ -542,7 +553,9 @@ ++ else error("The --collapse string \"%s\" not recognised.\n", optarg); ++ break; ++ case 'f': args->files->apply_filters = optarg; break; ++- case 'C': args->isec_op = OP_COMPLEMENT; break; +++ case 'C': +++ if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); +++ args->isec_op = OP_COMPLEMENT; break; ++ case 'r': args->regions_list = optarg; break; ++ case 'R': args->regions_list = optarg; regions_is_file = 1; break; ++ case 't': args->targets_list = optarg; break; ++@@ -553,6 +566,8 @@ ++ case 'e': add_filter(args, optarg, FLT_EXCLUDE); break; ++ case 'n': ++ { +++ if ( args->isec_op!=0 && args->isec_op==OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); +++ if ( args->isec_op!=0 ) error("Error: -n should be given only once.\n"); ++ char *p = optarg; ++ if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } ++ else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } ++@@ -567,7 +582,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(); +++ case '?': 
usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfmerge.c +++++ python-pysam/bcftools/vcfmerge.c ++@@ -1,6 +1,6 @@ ++ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. ++ ++- Copyright (C) 2012-2016 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -28,6 +28,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -84,7 +85,7 @@ ++ typedef struct ++ { ++ bcf1_t *line; ++- int end, active; +++ int end, active; // end: 0-based INFO/END ++ } ++ gvcf_aux_t; ++ ++@@ -121,13 +122,16 @@ ++ int nfmt_map; // number of rows in the fmt_map array ++ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes ++ void *tmp_arr; ++- int ntmp_arr; +++ size_t ntmp_arr; ++ buffer_t *buf; ++ AGR_info_t *AGR_info; ++ int nAGR_info, mAGR_info; ++ bcf_srs_t *files; ++- int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present ++- gvcf_aux_t *gvcf; // buffer of gVCF lines +++ int gvcf_min, // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present +++ gvcf_break; // 0-based position of a next record which breaks a gVCF block +++ gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line +++ int nout_smpl; +++ kstring_t *str; ++ } ++ maux_t; ++ ++@@ -397,7 +401,7 @@ ++ { ++ int msize = args->maux->ntmp_arr / rule->type_size; ++ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); ++- if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); +++ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%"PRId64" .. 
%d\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); ++ args->maux->ntmp_arr = msize * rule->type_size; ++ ++ rule->nblocks++; ++@@ -416,7 +420,7 @@ ++ int i, j; ++ if ( var_len==BCF_VL_A ) ++ { ++- if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ args->maux->nagr_map = ret; ++ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); ++ // create mapping from source file ALT indexes to dst file indexes ++@@ -425,7 +429,7 @@ ++ } ++ else if ( var_len==BCF_VL_R ) ++ { ++- if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ args->maux->nagr_map = ret; ++ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); ++ for (i=0; imaux->agr_map[i] = als->map[i]; ++@@ -460,7 +464,7 @@ ++ else ++ { ++ if ( rule->nblocks>1 && ret!=rule->block_size ) ++- error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ error("Mismatch in number of values for INFO/%s at %s:%"PRId64"\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ rule->block_size = ret; ++ args->maux->nagr_map = 0; ++ } ++@@ -501,20 +505,24 @@ ++ int i; ++ for (i=0; isamples[i]; ++- if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) +++ char *rmme = NULL, *name = hr->samples[i]; +++ while ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) ++ { ++ // there is a sample with the same name ++ if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name); ++ ++- int len = strlen(hr->samples[i]) + 
strlen(clash_prefix) + 1; ++- name = (char*) malloc(sizeof(char)*(len+1)); ++- sprintf(name,"%s:%s",clash_prefix,hr->samples[i]); ++- bcf_hdr_add_sample(hw,name); ++- free(name); +++ // Resolve conflicting samples names. For example, replace: +++ // A + A with A,2:A +++ // A,2:A + A with A,2:A,2:2:A +++ +++ int len = strlen(name) + strlen(clash_prefix) + 1; +++ char *tmp = (char*) malloc(sizeof(char)*(len+1)); +++ sprintf(tmp,"%s:%s",clash_prefix,name); +++ free(rmme); +++ rmme = name = tmp; ++ } ++- else ++- bcf_hdr_add_sample(hw,name); +++ bcf_hdr_add_sample(hw,name); +++ free(rmme); ++ } ++ } ++ ++@@ -677,6 +685,8 @@ ++ int i, n_smpl = 0; ++ for (i=0; in; i++) ++ n_smpl += bcf_hdr_nsamples(files->readers[i].header); +++ ma->nout_smpl = n_smpl; +++ assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); ++ if ( args->do_gvcf ) ++ { ++ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); ++@@ -688,11 +698,14 @@ ++ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); ++ for (i=0; in; i++) ++ ma->buf[i].rid = -1; +++ ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); ++ return ma; ++ } ++ void maux_destroy(maux_t *ma) ++ { ++ int i,j; +++ for (i=0; inout_smpl; i++) free(ma->str[i].s); +++ free(ma->str); ++ for (i=0; imals; i++) ++ { ++ free(ma->als[i]); ++@@ -776,7 +789,7 @@ ++ } ++ ma->buf[i].end = j; ++ ma->buf[i].cur = -1; ++- if ( ma->buf[i].beg < ma->buf[i].end ) +++ if ( ma->buf[i].beg < ma->buf[i].end ) ++ { ++ ma->buf[i].lines = ma->files->readers[i].buffer; ++ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record ++@@ -1008,7 +1021,7 @@ ++ int end_src = start_src; ++ while ( end_srcmbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s; ++ } ++ else ++- error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1); +++ error("Not ready for type [%d]: %s at %"PRId64"\n", info->type,agr->hdr_tag,(int64_t) line->pos+1); ++ } ++ ++ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || 
info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT ) ++@@ -1137,7 +1150,7 @@ ++ { ++ int ret = copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom); ++ if ( ret ) ++- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); +++ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); ++ } ++ } ++ else ++@@ -1153,7 +1166,7 @@ ++ int knew = bcf_alleles2gt(inew,jnew); ++ int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew); ++ if ( ret ) ++- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); +++ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); ++ } ++ } ++ } ++@@ -1227,7 +1240,7 @@ ++ } ++ kitr = kh_get(strdict, tmph, key); ++ int idx = kh_val(tmph, kitr); ++- if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); +++ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%"PRId64"\n", key,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); ++ continue; ++ } ++@@ -1318,6 +1331,7 @@ ++ bcf_hdr_t *out_hdr = args->out_hdr; ++ maux_t *ma = args->maux; ++ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; ++ ++ int nsize = 0, msize = sizeof(int32_t); ++ for (i=0; inreaders; i++) ++@@ -1333,6 +1347,13 @@ ++ { ++ ma->ntmp_arr = nsamples*nsize*msize; ++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) 
out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } ++ } ++ memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); ++ ++@@ -1412,15 +1433,126 @@ ++ bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize); ++ } ++ +++void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf1_t *out, int length, int nsize) +++{ +++ bcf_srs_t *files = args->files; +++ bcf_hdr_t *out_hdr = args->out_hdr; +++ maux_t *ma = args->maux; +++ int i,j, nsamples = bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; +++ +++ // initialize empty strings, a dot for each value, e.g. ".,.,." +++ int nmax = 0; +++ for (i=0; istr[i]; +++ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR ) +++ { +++ str->l = 1; +++ ks_resize(str, str->l+1); +++ str->s[0] = '.'; +++ } +++ else +++ { +++ str->l = nsize*2 - 1; +++ ks_resize(str, str->l+1); +++ str->s[0] = '.'; +++ for (j=1; js[j*2-1] = ',', str->s[j*2] = '.'; +++ } +++ str->s[str->l] = 0; +++ if ( nmax < str->l ) nmax = str->l; +++ } +++ +++ // fill in values for each sample +++ int ismpl = 0; +++ for (i=0; inreaders; i++) +++ { +++ bcf_sr_t *reader = &files->readers[i]; +++ bcf_hdr_t *hdr = reader->header; +++ bcf_fmt_t *fmt_ori = fmt_map[i]; +++ if ( !fmt_ori ) +++ { +++ // the field is not present in this file +++ ismpl += bcf_hdr_nsamples(hdr); +++ continue; +++ } +++ +++ bcf1_t *line = maux_get_line(args, i); +++ int irec = ma->buf[i].cur; +++ char *src = (char*) fmt_ori->p; +++ +++ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) +++ { +++ // alleles unchanged, copy over +++ for (j=0; jstr[ismpl++]; +++ str->l = 0; +++ kputsn(src, fmt_ori->n, str); +++ if ( nmax < str->l ) nmax = str->l; +++ src += fmt_ori->n; +++ } +++ continue; +++ } +++ // NB, what is below is not the fastest way, copy_string_field() keeps +++ // finding the indexes repeatedly at multiallelic sites +++ if ( length==BCF_VL_A || length==BCF_VL_R ) +++ { +++ int 
ifrom = length==BCF_VL_A ? 1 : 0; +++ for (j=0; jstr[ismpl++]; +++ int iori,inew; +++ for (iori=ifrom; iorin_allele; iori++) +++ { +++ inew = ma->buf[i].rec[irec].map[iori] - ifrom; +++ int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); +++ if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); +++ } +++ src += fmt_ori->size; +++ } +++ continue; +++ } +++ assert( length==BCF_VL_G ); +++ error("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n" +++ "Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n" +++ "a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n" +++ "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); +++ } +++ // update the record +++ if ( ma->ntmp_arr < nsamples*nmax ) +++ { +++ ma->ntmp_arr = nsamples*nmax; +++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } +++ } +++ char *tgt = (char*) ma->tmp_arr; +++ for (i=0; istr[i].s, ma->str[i].l); +++ if ( ma->str[i].l < nmax ) memset(tgt + ma->str[i].l, 0, nmax - ma->str[i].l); +++ tgt += nmax; +++ } +++ bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax); +++} +++ ++ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) ++ { ++ bcf_srs_t *files = args->files; ++ bcf_hdr_t *out_hdr = args->out_hdr; ++ maux_t *ma = args->maux; ++ int i, ismpl = 0, nsamples = 
bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; ++ ++ const char *key = NULL; ++- int nsize = 0, length = BCF_VL_FIXED, type = -1; +++ size_t nsize = 0, length = BCF_VL_FIXED; +++ int type = -1; ++ for (i=0; inreaders; i++) ++ { ++ if ( !maux_get_line(args,i) ) continue; ++@@ -1447,12 +1579,24 @@ ++ } ++ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; ++ } +++ if ( type==BCF_BT_CHAR ) +++ { +++ merge_format_string(args, key, fmt_map, out, length, nsize); +++ return; +++ } ++ ++- int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); +++ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); ++ if ( ma->ntmp_arr < nsamples*nsize*msize ) ++ { ++ ma->ntmp_arr = nsamples*nsize*msize; ++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } ++ } ++ ++ // Fill the temp array for all samples by collecting values from all files ++@@ -1463,6 +1607,7 @@ ++ bcf_fmt_t *fmt_ori = fmt_map[i]; ++ bcf1_t *line = maux_get_line(args, i); ++ int irec = ma->buf[i].cur; +++ ++ if ( fmt_ori ) ++ { ++ type = fmt_ori->type; ++@@ -1471,23 +1616,23 @@ ++ { ++ // if all fields are missing then n==1 is valid ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=G, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=G, but found\n" ++ "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ else if ( length==BCF_VL_A ) ++ { ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=A, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=A, but found\n" ++ "%d values and %d alleles. See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ else if ( length==BCF_VL_R ) ++ { ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=R, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=R, but found\n" ++ "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ } ++ ++@@ -1619,15 +1764,12 @@ ++ case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; ++ case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; ++ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; ++- case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break; ++ default: error("Unexpected case: %d, %s\n", type, key); ++ } ++ #undef BRANCH ++ } ++ if ( type==BCF_BT_FLOAT ) ++ bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); ++- else if ( type==BCF_BT_CHAR ) ++- bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); ++ else ++ bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize); ++ } ++@@ -1718,6 +1860,7 @@ ++ { ++ if ( !gaux[i].active ) continue; ++ bcf1_t *line = maux_get_line(args, i); +++ if ( !line ) continue; ++ int irec = maux->buf[i].cur; ++ ++ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); ++@@ -1739,7 +1882,7 @@ ++ if ( !maux->als ) ++ { ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); ++- error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); +++ error("Failed to merge alleles at %s:%"PRId64"\n",bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ } ++ } ++ } ++@@ -1748,6 +1891,7 @@ ++ /* ++ Output staged gVCF blocks, end is the last position of the block. 
Assuming ++ gaux[i].active flags are set and maux_get_line returns correct lines. +++ Both start,end coordinates are 0-based. ++ */ ++ void gvcf_write_block(args_t *args, int start, int end) ++ { ++@@ -1757,7 +1901,7 @@ ++ assert(gaux); ++ ++ // Update POS ++- int min = INT_MAX; +++ int min = INT_MAX; // the minimum active gVCF INFO/END (0-based) ++ char ref = 'N'; ++ for (i=0; ifiles->nreaders; i++) ++ { ++@@ -1778,7 +1922,7 @@ ++ if ( min > gaux[i].end ) min = gaux[i].end; ++ } ++ // Check for valid gVCF blocks in this region ++- if ( min==INT_MAX ) +++ if ( min==INT_MAX ) // this probably should not happen ++ { ++ assert(0); ++ maux->gvcf_min = 0; ++@@ -1814,7 +1958,7 @@ ++ } ++ else ++ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); ++- bcf_write1(args->out_fh, args->out_hdr, out); +++ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ bcf_clear1(out); ++ ++ ++@@ -1872,7 +2016,7 @@ ++ } ++ ++ // When called on a region, trim the blocks accordingly ++- int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; +++ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; // the start of a new gvcf block to output ++ if ( args->regs ) ++ { ++ int rstart = -1, rend = -1; ++@@ -1892,7 +2036,7 @@ ++ // does the block end before the new line or is it interrupted? ++ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; ++ if ( start > tmp-1 ) break; ++- gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based +++ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based, passing 0-based coordinates ++ start = tmp; ++ } ++ } ++@@ -1901,6 +2045,7 @@ ++ Check incoming lines for new gVCF blocks, set pointer to the current source ++ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be ++ called only after maux_reset as it relies on updated maux buffers. 
+++ The coordinate is 0-based ++ */ ++ void gvcf_stage(args_t *args, int pos) ++ { ++@@ -1935,8 +2080,16 @@ ++ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); ++ if ( ret==1 ) ++ { +++ if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END +++ { +++ maux->gvcf_break = line->pos; +++ continue; +++ } +++ if ( end[0] <= line->pos ) error("Error: Incorrect END at %s:%"PRId64" .. END=%d\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,end[0]); +++ ++ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with ++ // an empty record: the gaux line must be kept until we reach its END. +++ ++ gaux[i].active = 1; ++ gaux[i].end = end[0] - 1; ++ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); ++@@ -1982,7 +2135,15 @@ ++ { ++ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt ++ // to use the old lines via maux_get_line() ++- if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; +++ if ( ma->gvcf ) +++ { +++ if ( ma->gvcf[ir].active ) +++ { +++ if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; +++ else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block +++ } +++ if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; +++ } ++ ++ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); ++ if ( !reader->nbuffer ) continue; // nothing to clean ++@@ -2043,14 +2204,15 @@ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); ++ const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid); ++ fprintf(stderr,"\t"); ++- for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); +++ for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(stderr," %s:%"PRId64,chr,(int64_t) maux->buf[i].lines[j]->pos+1); ++ } ++ fprintf(stderr,"\n"); ++ } +++ fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min); ++ for (i=0; ifiles->nreaders; i++) ++ { ++ fprintf(stderr,"reader %d:\tgvcf_active=%d", 
i,maux->gvcf[i].active); ++- if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); +++ if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); ++ fprintf(stderr,"\n"); ++ } ++ fprintf(stderr,"\n"); ++@@ -2185,7 +2347,7 @@ ++ } ++ // normalize alleles ++ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); ++- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",maux->chr,line->pos+1,reader->fname); +++ if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); ++ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); ++ for (k=1; kn_allele; k++) ++ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files ++@@ -2286,33 +2448,46 @@ ++ if ( args->do_gvcf ) ++ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); ++ merge_format(args, out); ++- bcf_write1(args->out_fh, args->out_hdr, out); +++ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ bcf_clear1(out); ++ } ++ ++ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) ++ { ++ kstring_t str = {0,0,0}; ++- ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()); ++- bcf_hdr_append(hdr,str.s); +++ int e = 0; +++ if (ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()) < 0) +++ goto fail; +++ if (bcf_hdr_append(hdr,str.s) < 0) +++ goto fail; ++ ++ str.l = 0; ++- ksprintf(&str,"##%sCommand=%s", cmd, argv[0]); +++ e |= ksprintf(&str,"##%sCommand=%s", cmd, argv[0]) < 0; ++ int i; ++ for (i=1; ifiles->nreaders; i++) ++ { ++- char buf[10]; snprintf(buf,10,"%d",i+1); +++ char buf[24]; snprintf(buf,sizeof buf,"%d",i+1); ++ merge_headers(args->out_hdr, 
args->files->readers[i].header,buf,args->force_samples); ++ } ++ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); ++- bcf_hdr_sync(args->out_hdr); +++ if (bcf_hdr_sync(args->out_hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ info_rules_init(args); ++ ++ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); ++- bcf_hdr_write(args->out_fh, args->out_hdr); +++ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->header_only ) ++ { ++ bcf_hdr_destroy(args->out_hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ return; ++ } ++ ++@@ -2379,7 +2555,7 @@ ++ info_rules_destroy(args); ++ maux_destroy(args->maux); ++ bcf_hdr_destroy(args->out_hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); ++ bcf_destroy1(args->out_line); ++ kh_destroy(strdict, args->tmph); ++ if ( args->tmps.m ) free(args->tmps.s); ++@@ -2410,7 +2586,7 @@ ++ fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); ++ fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); ++ fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++ } ++@@ -2497,7 +2673,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfmerge.c.pysam.c +++++ python-pysam/bcftools/vcfmerge.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. ++ ++- Copyright (C) 2012-2016 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -30,6 +30,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -86,7 +87,7 @@ ++ typedef struct ++ { ++ bcf1_t *line; ++- int end, active; +++ int end, active; // end: 0-based INFO/END ++ } ++ gvcf_aux_t; ++ ++@@ -123,13 +124,16 @@ ++ int nfmt_map; // number of rows in the fmt_map array ++ int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes ++ void *tmp_arr; ++- int ntmp_arr; +++ size_t ntmp_arr; ++ buffer_t *buf; ++ AGR_info_t *AGR_info; ++ int nAGR_info, mAGR_info; ++ bcf_srs_t *files; ++- int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present ++- gvcf_aux_t *gvcf; // buffer of gVCF lines +++ int gvcf_min, // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present +++ gvcf_break; // 0-based position of a next record which breaks a gVCF block +++ gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line +++ int nout_smpl; +++ kstring_t *str; ++ } ++ maux_t; ++ ++@@ -399,7 +403,7 @@ ++ { ++ int msize = args->maux->ntmp_arr / rule->type_size; ++ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); ++- if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); +++ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%"PRId64" .. 
%d\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); ++ args->maux->ntmp_arr = msize * rule->type_size; ++ ++ rule->nblocks++; ++@@ -418,7 +422,7 @@ ++ int i, j; ++ if ( var_len==BCF_VL_A ) ++ { ++- if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ args->maux->nagr_map = ret; ++ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); ++ // create mapping from source file ALT indexes to dst file indexes ++@@ -427,7 +431,7 @@ ++ } ++ else if ( var_len==BCF_VL_R ) ++ { ++- if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ args->maux->nagr_map = ret; ++ hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); ++ for (i=0; imaux->agr_map[i] = als->map[i]; ++@@ -462,7 +466,7 @@ ++ else ++ { ++ if ( rule->nblocks>1 && ret!=rule->block_size ) ++- error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); +++ error("Mismatch in number of values for INFO/%s at %s:%"PRId64"\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ rule->block_size = ret; ++ args->maux->nagr_map = 0; ++ } ++@@ -503,20 +507,24 @@ ++ int i; ++ for (i=0; isamples[i]; ++- if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) +++ char *rmme = NULL, *name = hr->samples[i]; +++ while ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) ++ { ++ // there is a sample with the same name ++ if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name); ++ ++- int len = strlen(hr->samples[i]) + 
strlen(clash_prefix) + 1; ++- name = (char*) malloc(sizeof(char)*(len+1)); ++- sprintf(name,"%s:%s",clash_prefix,hr->samples[i]); ++- bcf_hdr_add_sample(hw,name); ++- free(name); +++ // Resolve conflicting samples names. For example, replace: +++ // A + A with A,2:A +++ // A,2:A + A with A,2:A,2:2:A +++ +++ int len = strlen(name) + strlen(clash_prefix) + 1; +++ char *tmp = (char*) malloc(sizeof(char)*(len+1)); +++ sprintf(tmp,"%s:%s",clash_prefix,name); +++ free(rmme); +++ rmme = name = tmp; ++ } ++- else ++- bcf_hdr_add_sample(hw,name); +++ bcf_hdr_add_sample(hw,name); +++ free(rmme); ++ } ++ } ++ ++@@ -679,6 +687,8 @@ ++ int i, n_smpl = 0; ++ for (i=0; in; i++) ++ n_smpl += bcf_hdr_nsamples(files->readers[i].header); +++ ma->nout_smpl = n_smpl; +++ assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); ++ if ( args->do_gvcf ) ++ { ++ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); ++@@ -690,11 +700,14 @@ ++ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); ++ for (i=0; in; i++) ++ ma->buf[i].rid = -1; +++ ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); ++ return ma; ++ } ++ void maux_destroy(maux_t *ma) ++ { ++ int i,j; +++ for (i=0; inout_smpl; i++) free(ma->str[i].s); +++ free(ma->str); ++ for (i=0; imals; i++) ++ { ++ free(ma->als[i]); ++@@ -778,7 +791,7 @@ ++ } ++ ma->buf[i].end = j; ++ ma->buf[i].cur = -1; ++- if ( ma->buf[i].beg < ma->buf[i].end ) +++ if ( ma->buf[i].beg < ma->buf[i].end ) ++ { ++ ma->buf[i].lines = ma->files->readers[i].buffer; ++ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record ++@@ -1010,7 +1023,7 @@ ++ int end_src = start_src; ++ while ( end_srcmbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s; ++ } ++ else ++- error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1); +++ error("Not ready for type [%d]: %s at %"PRId64"\n", info->type,agr->hdr_tag,(int64_t) line->pos+1); ++ } ++ ++ if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || 
info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT ) ++@@ -1139,7 +1152,7 @@ ++ { ++ int ret = copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom); ++ if ( ret ) ++- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); +++ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); ++ } ++ } ++ else ++@@ -1155,7 +1168,7 @@ ++ int knew = bcf_alleles2gt(inew,jnew); ++ int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew); ++ if ( ret ) ++- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); +++ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); ++ } ++ } ++ } ++@@ -1229,7 +1242,7 @@ ++ } ++ kitr = kh_get(strdict, tmph, key); ++ int idx = kh_val(tmph, kitr); ++- if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); +++ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%"PRId64"\n", key,bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); ++ continue; ++ } ++@@ -1320,6 +1333,7 @@ ++ bcf_hdr_t *out_hdr = args->out_hdr; ++ maux_t *ma = args->maux; ++ int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; ++ ++ int nsize = 0, msize = sizeof(int32_t); ++ for (i=0; inreaders; i++) ++@@ -1335,6 +1349,13 @@ ++ { ++ ma->ntmp_arr = nsamples*nsize*msize; ++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) 
out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } ++ } ++ memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); ++ ++@@ -1414,15 +1435,126 @@ ++ bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize); ++ } ++ +++void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf1_t *out, int length, int nsize) +++{ +++ bcf_srs_t *files = args->files; +++ bcf_hdr_t *out_hdr = args->out_hdr; +++ maux_t *ma = args->maux; +++ int i,j, nsamples = bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; +++ +++ // initialize empty strings, a dot for each value, e.g. ".,.,." +++ int nmax = 0; +++ for (i=0; istr[i]; +++ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR ) +++ { +++ str->l = 1; +++ ks_resize(str, str->l+1); +++ str->s[0] = '.'; +++ } +++ else +++ { +++ str->l = nsize*2 - 1; +++ ks_resize(str, str->l+1); +++ str->s[0] = '.'; +++ for (j=1; js[j*2-1] = ',', str->s[j*2] = '.'; +++ } +++ str->s[str->l] = 0; +++ if ( nmax < str->l ) nmax = str->l; +++ } +++ +++ // fill in values for each sample +++ int ismpl = 0; +++ for (i=0; inreaders; i++) +++ { +++ bcf_sr_t *reader = &files->readers[i]; +++ bcf_hdr_t *hdr = reader->header; +++ bcf_fmt_t *fmt_ori = fmt_map[i]; +++ if ( !fmt_ori ) +++ { +++ // the field is not present in this file +++ ismpl += bcf_hdr_nsamples(hdr); +++ continue; +++ } +++ +++ bcf1_t *line = maux_get_line(args, i); +++ int irec = ma->buf[i].cur; +++ char *src = (char*) fmt_ori->p; +++ +++ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) +++ { +++ // alleles unchanged, copy over +++ for (j=0; jstr[ismpl++]; +++ str->l = 0; +++ kputsn(src, fmt_ori->n, str); +++ if ( nmax < str->l ) nmax = str->l; +++ src += fmt_ori->n; +++ } +++ continue; +++ } +++ // NB, what is below is not the fastest way, copy_string_field() keeps +++ // finding the indexes repeatedly at multiallelic sites +++ if ( length==BCF_VL_A || length==BCF_VL_R ) +++ { +++ int 
ifrom = length==BCF_VL_A ? 1 : 0; +++ for (j=0; jstr[ismpl++]; +++ int iori,inew; +++ for (iori=ifrom; iorin_allele; iori++) +++ { +++ inew = ma->buf[i].rec[irec].map[iori] - ifrom; +++ int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); +++ if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); +++ } +++ src += fmt_ori->size; +++ } +++ continue; +++ } +++ assert( length==BCF_VL_G ); +++ error("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n" +++ "Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n" +++ "a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n" +++ "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); +++ } +++ // update the record +++ if ( ma->ntmp_arr < nsamples*nmax ) +++ { +++ ma->ntmp_arr = nsamples*nmax; +++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } +++ } +++ char *tgt = (char*) ma->tmp_arr; +++ for (i=0; istr[i].s, ma->str[i].l); +++ if ( ma->str[i].l < nmax ) memset(tgt + ma->str[i].l, 0, nmax - ma->str[i].l); +++ tgt += nmax; +++ } +++ bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax); +++} +++ ++ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) ++ { ++ bcf_srs_t *files = args->files; ++ bcf_hdr_t *out_hdr = args->out_hdr; ++ maux_t *ma = args->maux; ++ int i, ismpl = 0, nsamples = 
bcf_hdr_nsamples(out_hdr); +++ static int warned = 0; ++ ++ const char *key = NULL; ++- int nsize = 0, length = BCF_VL_FIXED, type = -1; +++ size_t nsize = 0, length = BCF_VL_FIXED; +++ int type = -1; ++ for (i=0; inreaders; i++) ++ { ++ if ( !maux_get_line(args,i) ) continue; ++@@ -1449,12 +1581,24 @@ ++ } ++ if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; ++ } +++ if ( type==BCF_BT_CHAR ) +++ { +++ merge_format_string(args, key, fmt_map, out, length, nsize); +++ return; +++ } ++ ++- int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); +++ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); ++ if ( ma->ntmp_arr < nsamples*nsize*msize ) ++ { ++ ma->ntmp_arr = nsamples*nsize*msize; ++ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); +++ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); +++ if ( ma->ntmp_arr > 2147483647 ) +++ { +++ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); +++ warned = 1; +++ return; +++ } ++ } ++ ++ // Fill the temp array for all samples by collecting values from all files ++@@ -1465,6 +1609,7 @@ ++ bcf_fmt_t *fmt_ori = fmt_map[i]; ++ bcf1_t *line = maux_get_line(args, i); ++ int irec = ma->buf[i].cur; +++ ++ if ( fmt_ori ) ++ { ++ type = fmt_ori->type; ++@@ -1473,23 +1618,23 @@ ++ { ++ // if all fields are missing then n==1 is valid ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=G, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=G, but found\n" ++ "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ else if ( length==BCF_VL_A ) ++ { ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=A, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=A, but found\n" ++ "%d values and %d alleles. See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ else if ( length==BCF_VL_R ) ++ { ++ if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori ) ++- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=R, but found\n" +++ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=R, but found\n" ++ "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", ++- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); +++ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); ++ } ++ } ++ ++@@ -1621,15 +1766,12 @@ ++ case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; ++ case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; ++ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; ++- case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break; ++ default: error("Unexpected case: %d, %s\n", type, key); ++ } ++ #undef BRANCH ++ } ++ if ( type==BCF_BT_FLOAT ) ++ bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); ++- else if ( type==BCF_BT_CHAR ) ++- bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); ++ else ++ bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize); ++ } ++@@ -1720,6 +1862,7 @@ ++ { ++ if ( !gaux[i].active ) continue; ++ bcf1_t *line = maux_get_line(args, i); +++ if ( !line ) continue; ++ int irec = maux->buf[i].cur; ++ ++ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); ++@@ -1741,7 +1884,7 @@ ++ if ( !maux->als ) ++ { ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); ++- error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); +++ error("Failed to merge alleles at %s:%"PRId64"\n",bcf_seqname(hdr,line),(int64_t) line->pos+1); ++ } ++ } ++ } ++@@ -1750,6 +1893,7 @@ ++ /* ++ Output staged gVCF blocks, end is the last position of the block. 
Assuming ++ gaux[i].active flags are set and maux_get_line returns correct lines. +++ Both start,end coordinates are 0-based. ++ */ ++ void gvcf_write_block(args_t *args, int start, int end) ++ { ++@@ -1759,7 +1903,7 @@ ++ assert(gaux); ++ ++ // Update POS ++- int min = INT_MAX; +++ int min = INT_MAX; // the minimum active gVCF INFO/END (0-based) ++ char ref = 'N'; ++ for (i=0; ifiles->nreaders; i++) ++ { ++@@ -1780,7 +1924,7 @@ ++ if ( min > gaux[i].end ) min = gaux[i].end; ++ } ++ // Check for valid gVCF blocks in this region ++- if ( min==INT_MAX ) +++ if ( min==INT_MAX ) // this probably should not happen ++ { ++ assert(0); ++ maux->gvcf_min = 0; ++@@ -1816,7 +1960,7 @@ ++ } ++ else ++ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); ++- bcf_write1(args->out_fh, args->out_hdr, out); +++ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ bcf_clear1(out); ++ ++ ++@@ -1874,7 +2018,7 @@ ++ } ++ ++ // When called on a region, trim the blocks accordingly ++- int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; +++ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; // the start of a new gvcf block to output ++ if ( args->regs ) ++ { ++ int rstart = -1, rend = -1; ++@@ -1894,7 +2038,7 @@ ++ // does the block end before the new line or is it interrupted? ++ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; ++ if ( start > tmp-1 ) break; ++- gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based +++ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based, passing 0-based coordinates ++ start = tmp; ++ } ++ } ++@@ -1903,6 +2047,7 @@ ++ Check incoming lines for new gVCF blocks, set pointer to the current source ++ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be ++ called only after maux_reset as it relies on updated maux buffers. 
+++ The coordinate is 0-based ++ */ ++ void gvcf_stage(args_t *args, int pos) ++ { ++@@ -1937,8 +2082,16 @@ ++ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); ++ if ( ret==1 ) ++ { +++ if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END +++ { +++ maux->gvcf_break = line->pos; +++ continue; +++ } +++ if ( end[0] <= line->pos ) error("Error: Incorrect END at %s:%"PRId64" .. END=%d\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,end[0]); +++ ++ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with ++ // an empty record: the gaux line must be kept until we reach its END. +++ ++ gaux[i].active = 1; ++ gaux[i].end = end[0] - 1; ++ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); ++@@ -1984,7 +2137,15 @@ ++ { ++ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt ++ // to use the old lines via maux_get_line() ++- if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; +++ if ( ma->gvcf ) +++ { +++ if ( ma->gvcf[ir].active ) +++ { +++ if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; +++ else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block +++ } +++ if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; +++ } ++ ++ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); ++ if ( !reader->nbuffer ) continue; // nothing to clean ++@@ -2045,14 +2206,15 @@ ++ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); ++ const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid); ++ fprintf(bcftools_stderr,"\t"); ++- for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(bcftools_stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); +++ for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(bcftools_stderr," %s:%"PRId64,chr,(int64_t) maux->buf[i].lines[j]->pos+1); ++ } ++ fprintf(bcftools_stderr,"\n"); ++ } +++ fprintf(bcftools_stderr,"gvcf_min=%d\n", args->maux->gvcf_min); ++ for (i=0; ifiles->nreaders; i++) ++ { ++ 
fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); ++- if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); +++ if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); ++ fprintf(bcftools_stderr,"\n"); ++ } ++ fprintf(bcftools_stderr,"\n"); ++@@ -2187,7 +2349,7 @@ ++ } ++ // normalize alleles ++ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); ++- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",maux->chr,line->pos+1,reader->fname); +++ if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); ++ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); ++ for (k=1; kn_allele; k++) ++ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files ++@@ -2288,33 +2450,46 @@ ++ if ( args->do_gvcf ) ++ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); ++ merge_format(args, out); ++- bcf_write1(args->out_fh, args->out_hdr, out); +++ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ bcf_clear1(out); ++ } ++ ++ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) ++ { ++ kstring_t str = {0,0,0}; ++- ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()); ++- bcf_hdr_append(hdr,str.s); +++ int e = 0; +++ if (ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()) < 0) +++ goto fail; +++ if (bcf_hdr_append(hdr,str.s) < 0) +++ goto fail; ++ ++ str.l = 0; ++- ksprintf(&str,"##%sCommand=%s", cmd, argv[0]); +++ e |= ksprintf(&str,"##%sCommand=%s", cmd, argv[0]) < 0; ++ int i; ++ for (i=1; ifiles->nreaders; i++) ++ { ++- char buf[10]; snprintf(buf,10,"%d",i+1); 
+++ char buf[24]; snprintf(buf,sizeof buf,"%d",i+1); ++ merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples); ++ } ++ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); ++- bcf_hdr_sync(args->out_hdr); +++ if (bcf_hdr_sync(args->out_hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ } ++ info_rules_init(args); ++ ++ bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); ++- bcf_hdr_write(args->out_fh, args->out_hdr); +++ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ if ( args->header_only ) ++ { ++ bcf_hdr_destroy(args->out_hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ return; ++ } ++ ++@@ -2381,7 +2557,7 @@ ++ info_rules_destroy(args); ++ maux_destroy(args->maux); ++ bcf_hdr_destroy(args->out_hdr); ++- hts_close(args->out_fh); +++ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); ++ bcf_destroy1(args->out_line); ++ kh_destroy(strdict, args->tmph); ++ if ( args->tmps.m ) free(args->tmps.s); ++@@ -2412,7 +2588,7 @@ ++ fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); ++ fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); ++ fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++@@ -2499,7 +2675,7 @@ ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfnorm.c +++++ python-pysam/bcftools/vcfnorm.c ++@@ -1,6 +1,6 @@ ++ /* vcfnorm.c -- Left-align and normalize indels. ++ ++- Copyright (C) 2013-2017 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -38,10 +39,10 @@ ++ #include "bcftools.h" ++ #include "rbuf.h" ++ ++-#define CHECK_REF_EXIT 0 ++-#define CHECK_REF_WARN 1 ++-#define CHECK_REF_SKIP 2 ++-#define CHECK_REF_FIX 4 +++#define CHECK_REF_EXIT 1 +++#define CHECK_REF_WARN 2 +++#define CHECK_REF_SKIP 4 +++#define CHECK_REF_FIX 8 ++ ++ #define MROWS_SPLIT 1 ++ #define MROWS_MERGE 2 ++@@ -61,6 +62,13 @@ ++ char *ref, *alt; ++ void *hash; ++ } +++cmpals1_t; +++ +++typedef struct +++{ +++ cmpals1_t *cmpals; +++ int ncmpals, mcmpals; +++} ++ cmpals_t; ++ ++ typedef struct ++@@ -83,14 +91,13 @@ ++ int aln_win; // the realignment window size (maximum repeat size) ++ bcf_srs_t *files; // using the synced reader only for -r option ++ bcf_hdr_t *hdr; ++- cmpals_t *cmpals; ++- int ncmpals, mcmpals; +++ cmpals_t cmpals_in, cmpals_out; ++ faidx_t *fai; ++ struct { int tot, set, swap; } nref; ++ char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; ++ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; ++ int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; ++- int record_cmd_line; +++ int record_cmd_line, force, force_warned; ++ } ++ args_t; ++ ++@@ -137,7 +144,7 @@ ++ } ++ ++ char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ replace_iupac_codes(ref,len); ++ ++ args->nref.tot++; ++@@ -248,7 +255,7 @@ ++ int i, j, nals = line->n_allele, nals_ori = line->n_allele; ++ for (i=1, j=1; in_allele; i++) ++ { ++- if ( strcmp(line->d.allele[0],line->d.allele[i]) ) +++ if ( strcasecmp(line->d.allele[0],line->d.allele[i]) ) ++ { ++ 
args->tmp_arr1[i] = j++; ++ continue; ++@@ -295,7 +302,7 @@ ++ // Sanity check REF ++ int i, nref, reflen = strlen(line->d.allele[0]); ++ char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); ++ seq_to_upper(ref,0); ++ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N ++ ++@@ -303,18 +310,18 @@ ++ if ( has_non_acgtn(line->d.allele[0],reflen) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); +++ error("Non-ACGTN reference allele at %s:%"PRId64" .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); +++ fprintf(stderr,"NON_ACGTN_REF\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0]); ++ free(ref); ++ return ERR_REF_MISMATCH; ++ } ++ if ( strcasecmp(ref,line->d.allele[0]) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); +++ error("Reference allele mismatch at %s:%"PRId64" .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); +++ fprintf(stderr,"REF_MISMATCH\t%s\t%"PRId64"\t%s\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0],ref); ++ free(ref); ++ return ERR_REF_MISMATCH; ++ } ++@@ -342,9 +349,9 @@ ++ if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); +++ error("Non-ACGTN alternate allele at %s:%"PRId64" .. VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); +++ fprintf(stderr,"NON_ACGTN_ALT\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); ++ return ERR_REF_MISMATCH; ++ } ++ ++@@ -352,7 +359,7 @@ ++ kputs(line->d.allele[i], &als[i]); ++ seq_to_upper(als[i].s,0); ++ ++- if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; +++ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; ++ } ++ ++ // trim from right ++@@ -363,7 +370,7 @@ ++ int min_len = als[0].l; ++ for (i=1; in_allele; i++) ++ { ++- if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; +++ if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; ++ if ( als[i].l < min_len ) min_len = als[i].l; ++ } ++ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed ++@@ -380,7 +387,7 @@ ++ int npad = line->pos >= args->aln_win ? 
args->aln_win : line->pos; ++ free(ref); ++ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); ++ replace_iupac_codes(ref,nref); ++ for (i=0; in_allele; i++) ++ { ++@@ -420,7 +427,7 @@ ++ ++ // Have the alleles changed? ++ als[0].s[ als[0].l ] = 0; // in order for strcmp to work ++- if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK; +++ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; ++ ++ // Create new block of alleles and update ++ args->tmp_als_str.l = 0; ++@@ -459,23 +466,68 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if ( ret!=src->n_allele-1 ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ +++ } \ ++ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ ++ } \ ++ else if ( len==BCF_VL_R ) \ ++ { \ ++ if ( ret!=src->n_allele ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- 
tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ +++ } \ ++ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ ++ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ ++ } \ ++ else if ( len==BCF_VL_G ) \ ++ { \ ++ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ } \ ++ if ( ialt!=0 ) \ ++ { \ ++ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ ++@@ -620,8 +672,23 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if ( nvals!=(src->n_allele-1)*nsmpl ) \ ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, 
found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ } \ ++ nvals /= nsmpl; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++ for (i=0; in_allele*nsmpl ) \ ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. 
Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \ +++ } \ ++ nvals /= nsmpl; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++ for (i=0; in_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \ ++- error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ +++ } \ ++ nvals /= nsmpl; \ ++ int all_haploid = nvals==src->n_allele ? 
1 : 0; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++@@ -704,6 +801,7 @@ ++ { ++ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); ++ int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); +++ if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic ++ assert( ret>0 ); ++ ++ kstring_t str; ++@@ -760,9 +858,25 @@ ++ if ( *se==',' ) nfields++; ++ se++; ++ } +++ if ( nfields==1 && se-ptr==1 && *ptr=='.' ) continue; // missing value ++ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ { +++ if ( args->force && !args->force_warned ) +++ { +++ fprintf(stderr, +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. Removing the field.\n" +++ " (This warning is printed only once.)\n", +++ tag,bcf_seqname(args->hdr,src),(int64_t)src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ args->force_warned = 1; +++ } +++ if ( args->force ) +++ { +++ bcf_update_format_char(args->hdr,dst,tag,NULL,0); +++ return; +++ } +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ } ++ ++ int len = 0; ++ if ( nfields==src->n_allele ) // haploid ++@@ -888,7 +1002,7 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if (nvals_ori!=lines[0]->n_allele - 1) \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d 
vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ ++ int nvals = dst->n_allele - 1; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++ vals = (type_t*) args->tmp_arr1; \ ++@@ -899,7 +1013,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele-1) \ ++- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (k=0; kn_allele) \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ ++ int nvals = dst->n_allele; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++ vals = (type_t*) args->tmp_arr1; \ ++@@ -923,7 +1037,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele) \ ++- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (k=0; kn_allele*(lines[0]->n_allele+1)/2) { \ ++ fprintf(stderr, 
"todo: merge Number=G INFO fields for haploid sites\n"); \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ ++ } \ ++ int nvals = dst->n_allele*(dst->n_allele+1)/2; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++@@ -950,7 +1064,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \ ++- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ int ia,ib; \ ++ k = 0; \ ++@@ -1062,7 +1176,7 @@ ++ int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2); ++ args->ntmp_arr2 = ntmp2 * 4; ++ ngts2 /= nsmpl; ++- if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); +++ if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); ++ ++ int32_t *gt = (int32_t*) args->tmp_arr1; ++ int32_t *gt2 = (int32_t*) args->tmp_arr2; ++@@ -1076,7 +1190,7 @@ ++ else ++ { ++ int ial = bcf_gt_allele(gt2[k]); ++- if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index 
%d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); +++ if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); ++ gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); ++ } ++ } ++@@ -1123,7 +1237,7 @@ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ nvals2 /= nsmpl; \ ++ if (nvals2!=lines[i]->n_allele-1) \ ++- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ nvals2 /= nsmpl; \ ++ if (nvals2!=lines[i]->n_allele) \ ++- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jn_allele*(lines[i]->n_allele+1)/2; \ ++ int line_diploid = nvals2==ndiploid ? 
1 : 0; \ ++ if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \ ++- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jn_allele*(dst->n_allele+1)/2; ++ } ++- else error("The field %s at %s:%d neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1); +++ else error("The field %s at %s:%"PRId64" neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); ++ ++ kstring_t *tmp = &args->tmp_str[i]; ++ kputc('.',tmp); ++@@ -1415,7 +1529,7 @@ ++ args->maps[i].nals = lines[i]->n_allele; ++ hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); ++ args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); ++- if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1); +++ if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); ++ } ++ bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); ++ for (i=0; inals; i++) ++@@ -1533,11 +1647,11 @@ ++ } ++ return NULL; ++ } ++-static void cmpals_add(args_t *args, bcf1_t *rec) +++static void cmpals_add(cmpals_t *ca, bcf1_t *rec) ++ { ++- args->ncmpals++; ++- hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); ++- cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; +++ ca->ncmpals++; +++ hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals); +++ cmpals1_t *cmpals = ca->cmpals + ca->ncmpals - 1; ++ free(cmpals->ref); ++ cmpals->ref = strdup(rec->d.allele[0]); ++ cmpals->n = rec->n_allele; ++@@ -1555,21 +1669,21 
@@ ++ khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); ++ } ++ } ++-static int cmpals_match(args_t *args, bcf1_t *rec) +++static int cmpals_match(cmpals_t *ca, bcf1_t *rec) ++ { ++ int i, j; ++- for (i=0; incmpals; i++) +++ for (i=0; incmpals; i++) ++ { ++- cmpals_t *cmpals = args->cmpals + i; +++ cmpals1_t *cmpals = ca->cmpals + i; ++ if ( rec->n_allele != cmpals->n ) continue; ++ ++ // NB. assuming both are normalized ++- if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; +++ if ( strcasecmp(rec->d.allele[0], cmpals->ref) ) continue; ++ ++ // the most frequent case ++ if ( rec->n_allele==2 ) ++ { ++- if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; +++ if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue; ++ return 1; ++ } ++ ++@@ -1579,21 +1693,20 @@ ++ if ( jn_allele ) continue; ++ return 1; ++ } ++- cmpals_add(args, rec); ++ return 0; ++ } ++-static void cmpals_reset(args_t *args) { args->ncmpals = 0; } ++-static void cmpals_destroy(args_t *args) +++static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; } +++static void cmpals_destroy(cmpals_t *ca) ++ { ++ int i; ++- for (i=0; imcmpals; i++) +++ for (i=0; imcmpals; i++) ++ { ++- cmpals_t *cmpals = args->cmpals + i; +++ cmpals1_t *cmpals = ca->cmpals + i; ++ free(cmpals->ref); ++ free(cmpals->alt); ++ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); ++ } ++- free(args->cmpals); +++ free(ca->cmpals); ++ } ++ ++ static void flush_buffer(args_t *args, htsFile *file, int n) ++@@ -1608,7 +1721,8 @@ ++ { ++ if ( mrows_ready_to_flush(args, args->lines[k]) ) ++ { ++- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); +++ while ( (line=mrows_flush(args)) ) +++ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ int merge = 1; ++ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) ++@@ -1629,23 +1743,24 @@ ++ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // 
rmdup by position only ++ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; ++ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; +++ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue; ++ } ++ else ++ { ++ prev_rid = args->lines[k]->rid; ++ prev_pos = args->lines[k]->pos; ++ prev_type = 0; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out); ++ } ++ prev_type |= line_type; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); ++ } ++- bcf_write1(file, args->hdr, args->lines[k]); +++ if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) ++ { ++- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); +++ while ( (line=mrows_flush(args)) ) +++ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -1669,7 +1784,8 @@ ++ ++ static void destroy_data(args_t *args) ++ { ++- cmpals_destroy(args); +++ cmpals_destroy(&args->cmpals_in); +++ cmpals_destroy(&args->cmpals_out); ++ int i; ++ for (i=0; irbuf.m; i++) ++ if ( args->lines[i] ) bcf_destroy1(args->lines[i]); ++@@ -1727,9 +1843,9 @@ ++ if ( args->check_ref & CHECK_REF_FIX ) ++ fix_dup_alt(args, line); ++ else if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Duplicate alleles at %s:%"PRId64"; run with -cw to turn 
the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ fprintf(stderr,"ALT_DUP\t%s\t%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ } ++ } ++ } ++@@ -1754,7 +1870,7 @@ ++ if ( args->n_threads ) ++ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); ++ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); ++- bcf_hdr_write(out, args->hdr); +++ if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++ int prev_rid = -1, prev_pos = -1, prev_type = 0; ++ while ( bcf_sr_next_line(args->files) ) ++@@ -1770,17 +1886,17 @@ ++ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only ++ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; ++ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; +++ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue; ++ } ++ else ++ { ++ prev_rid = line->rid; ++ prev_pos = line->pos; ++ prev_type = 0; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in); ++ } ++ prev_type |= line_type; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line); ++ } ++ ++ // still on the same chromosome? ++@@ -1819,7 +1935,7 @@ ++ if ( j>0 ) flush_buffer(args, out, j); ++ } ++ flush_buffer(args, out, args->rbuf.n); ++- hts_close(out); +++ if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); ++ ++ fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); ++ if ( args->check_ref & CHECK_REF_FIX ) ++@@ -1837,8 +1953,9 @@ ++ fprintf(stderr, "Options:\n"); ++ fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); ++ fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); ++- fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); ++- fprintf(stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); +++ fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); +++ fprintf(stderr, " -f, --fasta-ref reference sequence\n"); +++ fprintf(stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); ++ fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); ++ fprintf(stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); ++@@ -1849,9 +1966,16 @@ ++ fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); ++ fprintf(stderr, "\n"); +++ fprintf(stderr, "Examples:\n"); +++ fprintf(stderr, " # normalize and left-align indels\n"); +++ fprintf(stderr, " bcftools 
norm -f ref.fa in.vcf\n"); +++ fprintf(stderr, "\n"); +++ fprintf(stderr, " # split multi-allelic sites\n"); +++ fprintf(stderr, " bcftools norm -m- in.vcf\n"); +++ fprintf(stderr, "\n"); ++ exit(1); ++ } ++ ++@@ -1875,6 +1999,7 @@ ++ static struct option loptions[] = ++ { ++ {"help",no_argument,NULL,'h'}, +++ {"force",no_argument,NULL,7}, ++ {"fasta-ref",required_argument,NULL,'f'}, ++ {"do-not-normalize",no_argument,NULL,'N'}, ++ {"multiallelics",required_argument,NULL,'m'}, ++@@ -1904,6 +2029,7 @@ ++ else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; ++ else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; ++ else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; +++ else if ( !strcmp("exact",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; ++ else error("The argument to -d not recognised: %s\n", optarg); ++ break; ++ case 'm': ++@@ -1951,8 +2077,9 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 7 : args->force = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -1966,7 +2093,8 @@ ++ else fname = argv[optind]; ++ ++ if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); ++- if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); +++ if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; +++ if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); ++ ++ if ( args->region ) ++ { ++@@ -1980,7 +2108,7 @@ ++ } ++ ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read 
from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); ++ init_data(args); ++ normalize_vcf(args); ++--- python-pysam.orig/bcftools/vcfnorm.c.pysam.c +++++ python-pysam/bcftools/vcfnorm.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* vcfnorm.c -- Left-align and normalize indels. ++ ++- Copyright (C) 2013-2017 Genome Research Ltd. +++ Copyright (C) 2013-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -33,6 +33,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -40,10 +41,10 @@ ++ #include "bcftools.h" ++ #include "rbuf.h" ++ ++-#define CHECK_REF_EXIT 0 ++-#define CHECK_REF_WARN 1 ++-#define CHECK_REF_SKIP 2 ++-#define CHECK_REF_FIX 4 +++#define CHECK_REF_EXIT 1 +++#define CHECK_REF_WARN 2 +++#define CHECK_REF_SKIP 4 +++#define CHECK_REF_FIX 8 ++ ++ #define MROWS_SPLIT 1 ++ #define MROWS_MERGE 2 ++@@ -63,6 +64,13 @@ ++ char *ref, *alt; ++ void *hash; ++ } +++cmpals1_t; +++ +++typedef struct +++{ +++ cmpals1_t *cmpals; +++ int ncmpals, mcmpals; +++} ++ cmpals_t; ++ ++ typedef struct ++@@ -85,14 +93,13 @@ ++ int aln_win; // the realignment window size (maximum repeat size) ++ bcf_srs_t *files; // using the synced reader only for -r option ++ bcf_hdr_t *hdr; ++- cmpals_t *cmpals; ++- int ncmpals, mcmpals; +++ cmpals_t cmpals_in, cmpals_out; ++ faidx_t *fai; ++ struct { int tot, set, swap; } nref; ++ char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; ++ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; ++ int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; ++- int record_cmd_line; +++ int record_cmd_line, force, force_warned; ++ } ++ args_t; ++ ++@@ -139,7 +146,7 @@ ++ } ++ ++ char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len); ++- if ( !ref ) 
error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ replace_iupac_codes(ref,len); ++ ++ args->nref.tot++; ++@@ -250,7 +257,7 @@ ++ int i, j, nals = line->n_allele, nals_ori = line->n_allele; ++ for (i=1, j=1; in_allele; i++) ++ { ++- if ( strcmp(line->d.allele[0],line->d.allele[i]) ) +++ if ( strcasecmp(line->d.allele[0],line->d.allele[i]) ) ++ { ++ args->tmp_arr1[i] = j++; ++ continue; ++@@ -297,7 +304,7 @@ ++ // Sanity check REF ++ int i, nref, reflen = strlen(line->d.allele[0]); ++ char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); ++ seq_to_upper(ref,0); ++ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N ++ ++@@ -305,18 +312,18 @@ ++ if ( has_non_acgtn(line->d.allele[0],reflen) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); +++ error("Non-ACGTN reference allele at %s:%"PRId64" .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(bcftools_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); +++ fprintf(bcftools_stderr,"NON_ACGTN_REF\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0]); ++ free(ref); ++ return ERR_REF_MISMATCH; ++ } ++ if ( strcasecmp(ref,line->d.allele[0]) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); +++ error("Reference allele mismatch at %s:%"PRId64" .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(bcftools_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); +++ fprintf(bcftools_stderr,"REF_MISMATCH\t%s\t%"PRId64"\t%s\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0],ref); ++ free(ref); ++ return ERR_REF_MISMATCH; ++ } ++@@ -344,9 +351,9 @@ ++ if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) ++ { ++ if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); +++ error("Non-ACGTN alternate allele at %s:%"PRId64" .. 
VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); ++ if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(bcftools_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); +++ fprintf(bcftools_stderr,"NON_ACGTN_ALT\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); ++ return ERR_REF_MISMATCH; ++ } ++ ++@@ -354,7 +361,7 @@ ++ kputs(line->d.allele[i], &als[i]); ++ seq_to_upper(als[i].s,0); ++ ++- if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; +++ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; ++ } ++ ++ // trim from right ++@@ -365,7 +372,7 @@ ++ int min_len = als[0].l; ++ for (i=1; in_allele; i++) ++ { ++- if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; +++ if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; ++ if ( als[i].l < min_len ) min_len = als[i].l; ++ } ++ if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed ++@@ -382,7 +389,7 @@ ++ int npad = line->pos >= args->aln_win ? args->aln_win : line->pos; ++ free(ref); ++ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); ++- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1); +++ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); ++ replace_iupac_codes(ref,nref); ++ for (i=0; in_allele; i++) ++ { ++@@ -422,7 +429,7 @@ ++ ++ // Have the alleles changed? 
++ als[0].s[ als[0].l ] = 0; // in order for strcmp to work ++- if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK; +++ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; ++ ++ // Create new block of alleles and update ++ args->tmp_als_str.l = 0; ++@@ -461,23 +468,68 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if ( ret!=src->n_allele-1 ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ +++ } \ ++ bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ ++ } \ ++ else if ( len==BCF_VL_R ) \ ++ { \ ++ if ( ret!=src->n_allele ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of 
fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ +++ } \ ++ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ ++ bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ ++ } \ ++ else if ( len==BCF_VL_G ) \ ++ { \ ++ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ ++- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ +++ } \ ++ if ( ialt!=0 ) \ ++ { \ ++ vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ ++@@ -622,8 +674,23 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if ( nvals!=(src->n_allele-1)*nsmpl ) \ ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. 
Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ } \ ++ nvals /= nsmpl; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++ for (i=0; in_allele*nsmpl ) \ ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \ +++ } \ ++ nvals /= nsmpl; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++ for (i=0; in_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \ ++- error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ +++ { \ +++ if ( args->force && !args->force_warned ) \ +++ { \ +++ fprintf(bcftools_stderr, \ +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. 
Removing the field.\n" \ +++ " (This warning is printed only once.)\n", \ +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ +++ args->force_warned = 1; \ +++ } \ +++ if ( args->force ) \ +++ { \ +++ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ +++ return; \ +++ } \ +++ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ +++ } \ ++ nvals /= nsmpl; \ ++ int all_haploid = nvals==src->n_allele ? 1 : 0; \ ++ type_t *src_vals = vals, *dst_vals = vals; \ ++@@ -706,6 +803,7 @@ ++ { ++ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); ++ int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); +++ if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic ++ assert( ret>0 ); ++ ++ kstring_t str; ++@@ -762,9 +860,25 @@ ++ if ( *se==',' ) nfields++; ++ se++; ++ } +++ if ( nfields==1 && se-ptr==1 && *ptr=='.' ) continue; // missing value ++ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) ++- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", ++- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ { +++ if ( args->force && !args->force_warned ) +++ { +++ fprintf(bcftools_stderr, +++ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. 
Removing the field.\n" +++ " (This warning is printed only once.)\n", +++ tag,bcf_seqname(args->hdr,src),(int64_t)src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ args->force_warned = 1; +++ } +++ if ( args->force ) +++ { +++ bcf_update_format_char(args->hdr,dst,tag,NULL,0); +++ return; +++ } +++ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", +++ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); +++ } ++ ++ int len = 0; ++ if ( nfields==src->n_allele ) // haploid ++@@ -890,7 +1004,7 @@ ++ if ( len==BCF_VL_A ) \ ++ { \ ++ if (nvals_ori!=lines[0]->n_allele - 1) \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ ++ int nvals = dst->n_allele - 1; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++ vals = (type_t*) args->tmp_arr1; \ ++@@ -901,7 +1015,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele-1) \ ++- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (k=0; kn_allele) \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", 
bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ ++ int nvals = dst->n_allele; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++ vals = (type_t*) args->tmp_arr1; \ ++@@ -925,7 +1039,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele) \ ++- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (k=0; kn_allele*(lines[0]->n_allele+1)/2) { \ ++ fprintf(bcftools_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ ++- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ +++ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ ++ } \ ++ int nvals = dst->n_allele*(dst->n_allele+1)/2; \ ++ ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ ++@@ -952,7 +1066,7 @@ ++ if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \ ++- 
error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ int ia,ib; \ ++ k = 0; \ ++@@ -1064,7 +1178,7 @@ ++ int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2); ++ args->ntmp_arr2 = ntmp2 * 4; ++ ngts2 /= nsmpl; ++- if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); +++ if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); ++ ++ int32_t *gt = (int32_t*) args->tmp_arr1; ++ int32_t *gt2 = (int32_t*) args->tmp_arr2; ++@@ -1078,7 +1192,7 @@ ++ else ++ { ++ int ial = bcf_gt_allele(gt2[k]); ++- if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); +++ if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); ++ gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); ++ } ++ } ++@@ -1125,7 +1239,7 @@ ++ args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ nvals2 /= nsmpl; \ ++ if (nvals2!=lines[i]->n_allele-1) \ ++- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jntmp_arr2 = ntmp2 * sizeof(type_t); \ ++ nvals2 /= nsmpl; \ ++ if (nvals2!=lines[i]->n_allele) \ ++- error("vcfnorm: could not merge FORMAT tag %s at 
position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jn_allele*(lines[i]->n_allele+1)/2; \ ++ int line_diploid = nvals2==ndiploid ? 1 : 0; \ ++ if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \ ++- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ +++ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ ++ vals = (type_t*) args->tmp_arr1; \ ++ vals2 = (type_t*) args->tmp_arr2; \ ++ for (j=0; jn_allele*(dst->n_allele+1)/2; ++ } ++- else error("The field %s at %s:%d neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1); +++ else error("The field %s at %s:%"PRId64" neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); ++ ++ kstring_t *tmp = &args->tmp_str[i]; ++ kputc('.',tmp); ++@@ -1417,7 +1531,7 @@ ++ args->maps[i].nals = lines[i]->n_allele; ++ hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); ++ args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); ++- if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1); +++ if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); ++ } ++ bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); ++ for (i=0; inals; i++) ++@@ -1535,11 +1649,11 @@ ++ } ++ return NULL; ++ } ++-static void cmpals_add(args_t *args, bcf1_t *rec) +++static void cmpals_add(cmpals_t *ca, bcf1_t *rec) ++ { ++- 
args->ncmpals++; ++- hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); ++- cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; +++ ca->ncmpals++; +++ hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals); +++ cmpals1_t *cmpals = ca->cmpals + ca->ncmpals - 1; ++ free(cmpals->ref); ++ cmpals->ref = strdup(rec->d.allele[0]); ++ cmpals->n = rec->n_allele; ++@@ -1557,21 +1671,21 @@ ++ khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); ++ } ++ } ++-static int cmpals_match(args_t *args, bcf1_t *rec) +++static int cmpals_match(cmpals_t *ca, bcf1_t *rec) ++ { ++ int i, j; ++- for (i=0; incmpals; i++) +++ for (i=0; incmpals; i++) ++ { ++- cmpals_t *cmpals = args->cmpals + i; +++ cmpals1_t *cmpals = ca->cmpals + i; ++ if ( rec->n_allele != cmpals->n ) continue; ++ ++ // NB. assuming both are normalized ++- if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; +++ if ( strcasecmp(rec->d.allele[0], cmpals->ref) ) continue; ++ ++ // the most frequent case ++ if ( rec->n_allele==2 ) ++ { ++- if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; +++ if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue; ++ return 1; ++ } ++ ++@@ -1581,21 +1695,20 @@ ++ if ( jn_allele ) continue; ++ return 1; ++ } ++- cmpals_add(args, rec); ++ return 0; ++ } ++-static void cmpals_reset(args_t *args) { args->ncmpals = 0; } ++-static void cmpals_destroy(args_t *args) +++static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; } +++static void cmpals_destroy(cmpals_t *ca) ++ { ++ int i; ++- for (i=0; imcmpals; i++) +++ for (i=0; imcmpals; i++) ++ { ++- cmpals_t *cmpals = args->cmpals + i; +++ cmpals1_t *cmpals = ca->cmpals + i; ++ free(cmpals->ref); ++ free(cmpals->alt); ++ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); ++ } ++- free(args->cmpals); +++ free(ca->cmpals); ++ } ++ ++ static void flush_buffer(args_t *args, htsFile *file, int n) ++@@ -1610,7 +1723,8 @@ ++ { ++ if ( mrows_ready_to_flush(args, args->lines[k]) ) ++ { ++- while ( 
(line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); +++ while ( (line=mrows_flush(args)) ) +++ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ int merge = 1; ++ if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) ++@@ -1631,23 +1745,24 @@ ++ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only ++ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; ++ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; +++ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue; ++ } ++ else ++ { ++ prev_rid = args->lines[k]->rid; ++ prev_pos = args->lines[k]->pos; ++ prev_type = 0; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out); ++ } ++ prev_type |= line_type; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); ++ } ++- bcf_write1(file, args->hdr, args->lines[k]); +++ if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) ++ { ++- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); +++ while ( (line=mrows_flush(args)) ) +++ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -1671,7 +1786,8 @@ ++ ++ static void destroy_data(args_t *args) ++ { ++- cmpals_destroy(args); +++ cmpals_destroy(&args->cmpals_in); +++ cmpals_destroy(&args->cmpals_out); ++ int i; ++ for (i=0; irbuf.m; i++) ++ if 
( args->lines[i] ) bcf_destroy1(args->lines[i]); ++@@ -1729,9 +1845,9 @@ ++ if ( args->check_ref & CHECK_REF_FIX ) ++ fix_dup_alt(args, line); ++ else if ( args->check_ref==CHECK_REF_EXIT ) ++- error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); +++ error("Duplicate alleles at %s:%"PRId64"; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ else if ( args->check_ref & CHECK_REF_WARN ) ++- fprintf(bcftools_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); +++ fprintf(bcftools_stderr,"ALT_DUP\t%s\t%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); ++ } ++ } ++ } ++@@ -1756,7 +1872,7 @@ ++ if ( args->n_threads ) ++ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); ++ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); ++- bcf_hdr_write(out, args->hdr); +++ if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ ++ int prev_rid = -1, prev_pos = -1, prev_type = 0; ++ while ( bcf_sr_next_line(args->files) ) ++@@ -1772,17 +1888,17 @@ ++ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only ++ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; ++ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; +++ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue; ++ } ++ else ++ { ++ prev_rid = line->rid; ++ prev_pos = line->pos; ++ prev_type = 0; ++- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in); ++ } ++ prev_type |= line_type; ++- if ( args->rmdup 
& BCF_SR_PAIR_EXACT ) cmpals_add(args, line); +++ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line); ++ } ++ ++ // still on the same chromosome? ++@@ -1821,7 +1937,7 @@ ++ if ( j>0 ) flush_buffer(args, out, j); ++ } ++ flush_buffer(args, out, args->rbuf.n); ++- hts_close(out); +++ if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ ++ fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); ++ if ( args->check_ref & CHECK_REF_FIX ) ++@@ -1839,8 +1955,9 @@ ++ fprintf(bcftools_stderr, "Options:\n"); ++ fprintf(bcftools_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); ++ fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); ++- fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); ++- fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); +++ fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); +++ fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence\n"); +++ fprintf(bcftools_stderr, " --force try to proceed even if malformed tags are encountered. 
Experimental, use at your own risk\n"); ++ fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); ++ fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(bcftools_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); ++@@ -1851,9 +1968,16 @@ ++ fprintf(bcftools_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); ++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, "Examples:\n"); +++ fprintf(bcftools_stderr, " # normalize and left-align indels\n"); +++ fprintf(bcftools_stderr, " bcftools norm -f ref.fa in.vcf\n"); +++ fprintf(bcftools_stderr, "\n"); +++ fprintf(bcftools_stderr, " # split multi-allelic sites\n"); +++ fprintf(bcftools_stderr, " bcftools norm -m- in.vcf\n"); +++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++ ++@@ -1877,6 +2001,7 @@ ++ static struct option loptions[] = ++ { ++ {"help",no_argument,NULL,'h'}, +++ {"force",no_argument,NULL,7}, ++ {"fasta-ref",required_argument,NULL,'f'}, ++ {"do-not-normalize",no_argument,NULL,'N'}, ++ {"multiallelics",required_argument,NULL,'m'}, ++@@ -1906,6 +2031,7 @@ ++ else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; ++ else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; ++ else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; +++ else 
if ( !strcmp("exact",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; ++ else error("The argument to -d not recognised: %s\n", optarg); ++ break; ++ case 'm': ++@@ -1953,8 +2079,9 @@ ++ break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; +++ case 7 : args->force = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -1968,7 +2095,8 @@ ++ else fname = argv[optind]; ++ ++ if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); ++- if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); +++ if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; +++ if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); ++ ++ if ( args->region ) ++ { ++@@ -1982,7 +2110,7 @@ ++ } ++ ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); ++ init_data(args); ++ normalize_vcf(args); ++--- python-pysam.orig/bcftools/vcfplugin.c +++++ python-pysam/bcftools/vcfplugin.c ++@@ -38,7 +38,11 @@ ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#else ++ #include +++#endif ++ #include "bcftools.h" ++ #include "vcmp.h" ++ #include "filter.h" ++@@ -154,7 +158,7 @@ ++ { ++ while (1) ++ { ++- size_t len = strcspn(path, ":"); +++ size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); ++ ++ if ( len == 0 ) ++ { ++@@ -185,7 +189,7 @@ ++ } ++ ++ path += len; ++- if ( 
*path == ':' ) path++; +++ if ( *path == HTS_PATH_SEPARATOR_CHAR ) path++; ++ else break; ++ } ++ } ++@@ -207,28 +211,55 @@ ++ ++ void *handle; ++ char *tmp; ++- if ( fname[0]!='/' ) // not an absolute path +++ int is_absolute_path = 0; +++#ifdef _WIN32 +++ // Windows accepts both forward slash (/) and backslash (\) as folder separator +++ // and can have any path prefixed by the drive letter and a colon (:). +++ if ( fname[0]=='/' || fname[0]=='\\') is_absolute_path = 1; +++ else if ( fname[0] && fname[1]==':' && (fname[2]=='/' || fname[2]=='\\') ) is_absolute_path = 1; +++#else +++ if ( fname[0]=='/' ) is_absolute_path = 1; +++#endif +++ if ( !is_absolute_path ) ++ { ++ int i; ++ for (i=0; inplugin_paths; i++) ++ { ++- tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); +++ tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); +++#ifdef _WIN32 +++ handle = LoadLibraryA(tmp); +++#else ++ handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though +++#endif ++ if ( args->verbose > 1 ) ++ { ++- if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); ++- else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp); +++ if ( !handle ) +++#ifdef _WIN32 +++ fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError()); +++#else +++ fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); +++#endif +++ else fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp); ++ } ++ free(tmp); ++ if ( handle ) return handle; ++ } ++ } ++ +++#ifdef _WIN32 +++ handle = LoadLibraryA(fname); +++#else ++ handle = dlopen(fname, RTLD_NOW); +++#endif ++ if ( args->verbose > 1 ) ++ { ++- if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); ++- else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname); +++ if ( !handle ) +++#ifdef _WIN32 +++ fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError()); +++#else +++ fprintf(stderr,"%s:\n\tdlopen .. 
%s\n", fname, dlerror()); +++#endif +++ else fprintf(stderr,"%s:\n\tplugin open .. ok\n", fname); ++ } ++ ++ return handle; ++@@ -264,6 +295,55 @@ ++ return -1; ++ } ++ +++#ifdef _WIN32 +++ plugin->init = (dl_init_f) GetProcAddress(plugin->handle, "init"); +++ if ( plugin->init && args->verbose > 1 ) fprintf(stderr,"\tinit .. ok\n"); +++ +++ plugin->run = (dl_run_f) GetProcAddress(plugin->handle, "run"); +++ if ( plugin->run && args->verbose > 1 ) fprintf(stderr,"\trun .. ok\n"); +++ +++ if ( !plugin->init && !plugin->run ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); +++ else if ( args->verbose > 1 ) fprintf(stderr,"\tinit/run .. not found\n"); +++ return -1; +++ } +++ +++ plugin->version = (dl_version_f) GetProcAddress(plugin->handle, "version"); +++ if ( !plugin->version ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: version string not found\n", plugin->name); +++ else if ( args->verbose > 1 ) fprintf(stderr,"\tversion .. 
not found\n"); +++ return -1; +++ } +++ +++ plugin->about = (dl_about_f) GetProcAddress(plugin->handle, "about"); +++ if ( !plugin->about ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: about string not found\n", plugin->name); +++ return -1; +++ } +++ +++ plugin->usage = (dl_about_f) GetProcAddress(plugin->handle, "usage"); +++ if ( !plugin->usage ) +++ plugin->usage = plugin->about; +++ +++ if ( plugin->run ) return 0; +++ +++ plugin->process = (dl_process_f) GetProcAddress(plugin->handle, "process"); +++ if ( !plugin->process ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: process method not found\n", plugin->name); +++ return -1; +++ } +++ +++ plugin->destroy = (dl_destroy_f) GetProcAddress(plugin->handle, "destroy"); +++ if ( !plugin->destroy ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: destroy method not found\n", plugin->name); +++ return -1; +++ } +++#else ++ dlerror(); ++ plugin->init = (dl_init_f) dlsym(plugin->handle, "init"); ++ char *ret = dlerror(); ++@@ -325,6 +405,7 @@ ++ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret); ++ return -1; ++ } +++#endif ++ ++ return 0; ++ } ++@@ -427,7 +508,7 @@ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++- bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -435,7 +516,11 @@ ++ { ++ free(args->plugin.name); ++ if ( args->plugin.destroy ) args->plugin.destroy(); +++#ifdef _WIN32 +++ FreeLibrary(args->plugin.handle); +++#else ++ dlclose(args->plugin.handle); +++#endif ++ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); ++ if ( args->nplugin_paths>0 ) ++ { ++@@ -445,7 +530,7 @@ ++ } 
++ if ( args->filter ) ++ filter_destroy(args->filter); ++- if (args->out_fh) hts_close(args->out_fh); +++ if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ } ++ ++ static void usage(args_t *args) ++@@ -466,7 +551,7 @@ ++ fprintf(stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(stderr, " -o, --output write output to a file [standard output]\n"); ++ fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); ++- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "Plugin options:\n"); ++ fprintf(stderr, " -h, --help list plugin's options\n"); ++ fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); ++@@ -599,10 +684,16 @@ ++ char *fname = NULL; ++ if ( optind>=argc || argv[optind][0]=='-' ) ++ { ++- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin ++- else usage(args); ++ args->plugin.argc = argc - optind + 1; ++ args->plugin.argv = argv + optind - 1; +++ +++ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin +++ else if ( optind>=argc ) usage(args); +++ else +++ { +++ optind = 1; +++ init_plugin(args); +++ } ++ } ++ else ++ { ++@@ -624,7 +715,7 @@ ++ error("Failed to read the targets: %s\n", args->targets_list); ++ args->files->collapse |= COLLAPSE_SOME; ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++@@ -640,7 +731,7 @@ 
++ if ( line ) ++ { ++ if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode); ++- bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ destroy_data(args); ++--- python-pysam.orig/bcftools/vcfplugin.c.pysam.c +++++ python-pysam/bcftools/vcfplugin.c.pysam.c ++@@ -40,7 +40,11 @@ ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#else ++ #include +++#endif ++ #include "bcftools.h" ++ #include "vcmp.h" ++ #include "filter.h" ++@@ -156,7 +160,7 @@ ++ { ++ while (1) ++ { ++- size_t len = strcspn(path, ":"); +++ size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); ++ ++ if ( len == 0 ) ++ { ++@@ -187,7 +191,7 @@ ++ } ++ ++ path += len; ++- if ( *path == ':' ) path++; +++ if ( *path == HTS_PATH_SEPARATOR_CHAR ) path++; ++ else break; ++ } ++ } ++@@ -209,28 +213,55 @@ ++ ++ void *handle; ++ char *tmp; ++- if ( fname[0]!='/' ) // not an absolute path +++ int is_absolute_path = 0; +++#ifdef _WIN32 +++ // Windows accepts both forward slash (/) and backslash (\) as folder separator +++ // and can have any path prefixed by the drive letter and a colon (:). +++ if ( fname[0]=='/' || fname[0]=='\\') is_absolute_path = 1; +++ else if ( fname[0] && fname[1]==':' && (fname[2]=='/' || fname[2]=='\\') ) is_absolute_path = 1; +++#else +++ if ( fname[0]=='/' ) is_absolute_path = 1; +++#endif +++ if ( !is_absolute_path ) ++ { ++ int i; ++ for (i=0; inplugin_paths; i++) ++ { ++- tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); +++ tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); +++#ifdef _WIN32 +++ handle = LoadLibraryA(tmp); +++#else ++ handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though +++#endif ++ if ( args->verbose > 1 ) ++ { ++- if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. 
%s\n", tmp,dlerror()); ++- else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", tmp); +++ if ( !handle ) +++#ifdef _WIN32 +++ fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError()); +++#else +++ fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); +++#endif +++ else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp); ++ } ++ free(tmp); ++ if ( handle ) return handle; ++ } ++ } ++ +++#ifdef _WIN32 +++ handle = LoadLibraryA(fname); +++#else ++ handle = dlopen(fname, RTLD_NOW); +++#endif ++ if ( args->verbose > 1 ) ++ { ++- if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); ++- else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", fname); +++ if ( !handle ) +++#ifdef _WIN32 +++ fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError()); +++#else +++ fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror()); +++#endif +++ else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname); ++ } ++ ++ return handle; ++@@ -266,6 +297,55 @@ ++ return -1; ++ } ++ +++#ifdef _WIN32 +++ plugin->init = (dl_init_f) GetProcAddress(plugin->handle, "init"); +++ if ( plugin->init && args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit .. ok\n"); +++ +++ plugin->run = (dl_run_f) GetProcAddress(plugin->handle, "run"); +++ if ( plugin->run && args->verbose > 1 ) fprintf(bcftools_stderr,"\trun .. ok\n"); +++ +++ if ( !plugin->init && !plugin->run ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); +++ else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit/run .. not found\n"); +++ return -1; +++ } +++ +++ plugin->version = (dl_version_f) GetProcAddress(plugin->handle, "version"); +++ if ( !plugin->version ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: version string not found\n", plugin->name); +++ else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tversion .. 
not found\n"); +++ return -1; +++ } +++ +++ plugin->about = (dl_about_f) GetProcAddress(plugin->handle, "about"); +++ if ( !plugin->about ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: about string not found\n", plugin->name); +++ return -1; +++ } +++ +++ plugin->usage = (dl_about_f) GetProcAddress(plugin->handle, "usage"); +++ if ( !plugin->usage ) +++ plugin->usage = plugin->about; +++ +++ if ( plugin->run ) return 0; +++ +++ plugin->process = (dl_process_f) GetProcAddress(plugin->handle, "process"); +++ if ( !plugin->process ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: process method not found\n", plugin->name); +++ return -1; +++ } +++ +++ plugin->destroy = (dl_destroy_f) GetProcAddress(plugin->handle, "destroy"); +++ if ( !plugin->destroy ) +++ { +++ if ( exit_on_error ) error("Could not initialize %s: destroy method not found\n", plugin->name); +++ return -1; +++ } +++#else ++ dlerror(); ++ plugin->init = (dl_init_f) dlsym(plugin->handle, "init"); ++ char *ret = dlerror(); ++@@ -327,6 +407,7 @@ ++ if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret); ++ return -1; ++ } +++#endif ++ ++ return 0; ++ } ++@@ -429,7 +510,7 @@ ++ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); ++ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); ++ if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); ++- bcf_hdr_write(args->out_fh, args->hdr_out); +++ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ ++@@ -437,7 +518,11 @@ ++ { ++ free(args->plugin.name); ++ if ( args->plugin.destroy ) args->plugin.destroy(); +++#ifdef _WIN32 +++ FreeLibrary(args->plugin.handle); +++#else ++ dlclose(args->plugin.handle); +++#endif ++ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); ++ if ( args->nplugin_paths>0 ) ++ { ++@@ -447,7 +532,7 @@ ++ } 
++ if ( args->filter ) ++ filter_destroy(args->filter); ++- if (args->out_fh) hts_close(args->out_fh); +++ if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); ++ } ++ ++ static void usage(args_t *args) ++@@ -468,7 +553,7 @@ ++ fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); ++ fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); ++ fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); ++- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "Plugin options:\n"); ++ fprintf(bcftools_stderr, " -h, --help list plugin's options\n"); ++ fprintf(bcftools_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); ++@@ -601,10 +686,16 @@ ++ char *fname = NULL; ++ if ( optind>=argc || argv[optind][0]=='-' ) ++ { ++- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin ++- else usage(args); ++ args->plugin.argc = argc - optind + 1; ++ args->plugin.argv = argv + optind - 1; +++ +++ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin +++ else if ( optind>=argc ) usage(args); +++ else +++ { +++ optind = 1; +++ init_plugin(args); +++ } ++ } ++ else ++ { ++@@ -626,7 +717,7 @@ ++ error("Failed to read the targets: %s\n", args->targets_list); ++ args->files->collapse |= COLLAPSE_SOME; ++ } ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ 
init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++@@ -642,7 +733,7 @@ ++ if ( line ) ++ { ++ if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode); ++- bcf_write1(args->out_fh, args->hdr_out, line); +++ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ } ++ } ++ destroy_data(args); ++--- python-pysam.orig/bcftools/vcfquery.c +++++ python-pysam/bcftools/vcfquery.c ++@@ -128,7 +128,7 @@ ++ if ( args->print_header ) ++ { ++ convert_header(args->convert,&str); ++- fwrite(str.s, str.l, 1, args->out); +++ if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); ++ } ++ ++ int i,max_convert_unpack = convert_max_unpack(args->convert); ++@@ -168,8 +168,7 @@ ++ ++ str.l = 0; ++ convert_line(args->convert, line, &str); ++- if ( str.l ) ++- fwrite(str.s, str.l, 1, args->out); +++ if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); ++ } ++ if ( str.m ) free(str.s); ++ } ++@@ -308,7 +307,7 @@ ++ case 's': args->sample_list = optarg; break; ++ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -324,14 +323,18 @@ ++ { ++ if ( !fname ) error("Missing the VCF file name\n"); ++ args->files = bcf_sr_init(); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ list_columns(args); ++ bcf_sr_destroy(args->files); ++ free(args); ++ return 0; ++ } ++ ++- if ( !args->format_str ) 
usage(); +++ if ( !args->format_str ) +++ { +++ if ( argc==1 && !fname ) usage(); +++ error("Error: Missing the --format option\n"); +++ } ++ args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout; ++ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); ++ ++@@ -349,7 +352,7 @@ ++ } ++ while ( fname ) ++ { ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ fname = ++optind < argc ? argv[optind] : NULL; ++ } ++ init_data(args); ++@@ -357,7 +360,7 @@ ++ free(args->format_str); ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++- fclose(args->out); +++ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out); ++ free(args); ++ return 0; ++ } ++@@ -384,7 +387,10 @@ ++ if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); ++ init_data(args); ++ if ( i==0 ) +++ { ++ prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); +++ prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header); +++ } ++ else ++ { ++ args->print_header = 0; ++@@ -395,7 +401,7 @@ ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++ } ++- fclose(args->out); +++ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fn_out);; ++ destroy_list(fnames, nfiles); ++ destroy_list(prev_samples, prev_nsamples); ++ free(args->format_str); ++--- python-pysam.orig/bcftools/vcfquery.c.pysam.c +++++ python-pysam/bcftools/vcfquery.c.pysam.c ++@@ -130,7 +130,7 @@ ++ if ( args->print_header ) ++ { ++ convert_header(args->convert,&str); ++- fwrite(str.s, str.l, 1, args->out); +++ if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); ++ } ++ ++ int i,max_convert_unpack = convert_max_unpack(args->convert); ++@@ -170,8 +170,7 @@ ++ ++ str.l = 0; ++ convert_line(args->convert, line, &str); ++- if ( str.l ) ++- fwrite(str.s, str.l, 1, args->out); +++ if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); ++ } ++ if ( str.m ) free(str.s); ++ } ++@@ -310,7 +309,7 @@ ++ case 's': args->sample_list = optarg; break; ++ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -326,14 +325,18 @@ ++ { ++ if ( !fname ) error("Missing the VCF file name\n"); ++ args->files = bcf_sr_init(); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ list_columns(args); ++ bcf_sr_destroy(args->files); ++ free(args); ++ return 0; ++ } ++ ++- if ( !args->format_str ) usage(); +++ if ( !args->format_str ) +++ { +++ if ( argc==1 && !fname ) usage(); +++ error("Error: Missing the --format option\n"); +++ } ++ args->out = args->fn_out ? 
fopen(args->fn_out, "w") : bcftools_stdout; ++ if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); ++ ++@@ -351,7 +354,7 @@ ++ } ++ while ( fname ) ++ { ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ fname = ++optind < argc ? argv[optind] : NULL; ++ } ++ init_data(args); ++@@ -359,7 +362,7 @@ ++ free(args->format_str); ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++- fclose(args->out); +++ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out); ++ free(args); ++ return 0; ++ } ++@@ -386,7 +389,10 @@ ++ if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); ++ init_data(args); ++ if ( i==0 ) +++ { ++ prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); +++ prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header); +++ } ++ else ++ { ++ args->print_header = 0; ++@@ -397,7 +403,7 @@ ++ destroy_data(args); ++ bcf_sr_destroy(args->files); ++ } ++- fclose(args->out); +++ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);; ++ destroy_list(fnames, nfiles); ++ destroy_list(prev_samples, prev_nsamples); ++ free(args->format_str); ++--- python-pysam.orig/bcftools/vcfroh.c +++++ python-pysam/bcftools/vcfroh.c ++@@ -130,6 +130,11 @@ ++ return mem; ++ } ++ +++static inline int max255(int i) +++{ +++ return i < 256 ? 
i : 255; +++} +++ ++ static void init_data(args_t *args) ++ { ++ int i; ++@@ -156,7 +161,7 @@ ++ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; ++ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } ++ if ( strcmp("-",args->estimate_AF) ) ++- args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); +++ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE|SMPL_VERBOSE); ++ } ++ ++ if ( args->estimate_AF || args->fake_PLs ) ++@@ -181,7 +186,7 @@ ++ error("Error: The FORMAT/GT tag not found in the header\n"); ++ } ++ ++- args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); +++ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE|SMPL_VERBOSE); ++ if ( args->samples ) ++ { ++ // we may be able to subset to a few samples, for a text VCF this can be a major speedup ++@@ -749,9 +754,9 @@ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++ double prob[3], norm = 0; \ ++- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ prob[0] = args->pl2p[ max255(p[irr]) ]; \ +++ prob[1] = args->pl2p[ max255(p[ira]) ]; \ +++ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ for (j=0; j<3; j++) norm += prob[j]; \ ++ for (j=0; j<3; j++) prob[j] /= norm; \ ++ af += 0.5*prob[1] + prob[2]; \ ++@@ -779,9 +784,9 @@ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++ double prob[3], norm = 0; \ ++- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- prob[1] = p[ira] < 256 ? 
args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ prob[0] = args->pl2p[ max255(p[irr]) ]; \ +++ prob[1] = args->pl2p[ max255(p[ira]) ]; \ +++ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ for (j=0; j<3; j++) norm += prob[j]; \ ++ for (j=0; j<3; j++) prob[j] /= norm; \ ++ af += 0.5*prob[1] + prob[2]; \ ++@@ -827,7 +832,7 @@ ++ if ( ret>0 ) ++ alt_freq = args->AFs[ial-1]; ++ if ( ret==-2 ) ++- error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); +++ error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); ++ } ++ else if ( args->af_fname ) ++ { ++@@ -926,9 +931,9 @@ ++ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++- pdg[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- pdg[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- pdg[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ pdg[0] = args->pl2p[ max255(p[irr]) ]; \ +++ pdg[1] = args->pl2p[ max255(p[ira]) ]; \ +++ pdg[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ } ++ switch (fmt_pl->type) { ++ case BCF_BT_INT8: BRANCH(int8_t); break; ++@@ -1089,7 +1094,7 @@ ++ fprintf(stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(stderr, " --threads number of extra decompression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "HMM Options:\n"); ++ fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); ++@@ -1198,7 +1203,7 @@ ++ } ++ } ++ if ( !args->output_fname ) args->output_fname = "stdout"; ++- if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; +++ if ( !args->output_type || args->output_type==OUTPUT_GZ ) args->output_type |= OUTPUT_ST|OUTPUT_RG; ++ char *fname = NULL; ++ if ( optind==argc ) ++ { ++@@ -1229,7 +1234,7 @@ ++ } ++ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) ++ error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++--- python-pysam.orig/bcftools/vcfroh.c.pysam.c +++++ python-pysam/bcftools/vcfroh.c.pysam.c ++@@ -132,6 +132,11 @@ ++ return mem; ++ } ++ +++static inline int max255(int i) +++{ +++ return i < 256 ? 
i : 255; +++} +++ ++ static void init_data(args_t *args) ++ { ++ int i; ++@@ -158,7 +163,7 @@ ++ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; ++ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } ++ if ( strcmp("-",args->estimate_AF) ) ++- args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); +++ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE|SMPL_VERBOSE); ++ } ++ ++ if ( args->estimate_AF || args->fake_PLs ) ++@@ -183,7 +188,7 @@ ++ error("Error: The FORMAT/GT tag not found in the header\n"); ++ } ++ ++- args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); +++ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE|SMPL_VERBOSE); ++ if ( args->samples ) ++ { ++ // we may be able to subset to a few samples, for a text VCF this can be a major speedup ++@@ -751,9 +756,9 @@ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++ double prob[3], norm = 0; \ ++- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ prob[0] = args->pl2p[ max255(p[irr]) ]; \ +++ prob[1] = args->pl2p[ max255(p[ira]) ]; \ +++ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ for (j=0; j<3; j++) norm += prob[j]; \ ++ for (j=0; j<3; j++) prob[j] /= norm; \ ++ af += 0.5*prob[1] + prob[2]; \ ++@@ -781,9 +786,9 @@ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++ double prob[3], norm = 0; \ ++- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- prob[1] = p[ira] < 256 ? 
args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ prob[0] = args->pl2p[ max255(p[irr]) ]; \ +++ prob[1] = args->pl2p[ max255(p[ira]) ]; \ +++ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ for (j=0; j<3; j++) norm += prob[j]; \ ++ for (j=0; j<3; j++) prob[j] /= norm; \ ++ af += 0.5*prob[1] + prob[2]; \ ++@@ -829,7 +834,7 @@ ++ if ( ret>0 ) ++ alt_freq = args->AFs[ial-1]; ++ if ( ret==-2 ) ++- error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); +++ error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); ++ } ++ else if ( args->af_fname ) ++ { ++@@ -928,9 +933,9 @@ ++ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ ++ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ ++ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ ++- pdg[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ ++- pdg[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ ++- pdg[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ +++ pdg[0] = args->pl2p[ max255(p[irr]) ]; \ +++ pdg[1] = args->pl2p[ max255(p[ira]) ]; \ +++ pdg[2] = args->pl2p[ max255(p[iaa]) ]; \ ++ } ++ switch (fmt_pl->type) { ++ case BCF_BT_INT8: BRANCH(int8_t); break; ++@@ -1091,7 +1096,7 @@ ++ fprintf(bcftools_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++- fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "HMM Options:\n"); ++ fprintf(bcftools_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); ++@@ -1200,7 +1205,7 @@ ++ } ++ } ++ if ( !args->output_fname ) args->output_fname = "bcftools_stdout"; ++- if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; +++ if ( !args->output_type || args->output_type==OUTPUT_GZ ) args->output_type |= OUTPUT_ST|OUTPUT_RG; ++ char *fname = NULL; ++ if ( optind==argc ) ++ { ++@@ -1231,7 +1236,7 @@ ++ } ++ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) ++ error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ while ( bcf_sr_next_line(args->files) ) ++--- python-pysam.orig/bcftools/vcfsom.c +++++ python-pysam/bcftools/vcfsom.c ++@@ -35,6 +35,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" 
++ ++@@ -356,7 +357,7 @@ ++ if ( !som->w ) error("Could not alloc %"PRIu64" bytes [nbin=%d ndim=%d]\n", (uint64_t)(sizeof(double)*som->size),som->nbin,som->ndim); ++ int i; ++ for (i=0; isize*som->kdim; i++) ++- som->w[i] = (double)random()/RAND_MAX; +++ som->w[i] = random(); ++ som->a_idx = (int*) malloc(sizeof(int)*som->ndim); ++ som->b_idx = (int*) malloc(sizeof(int)*som->ndim); ++ som->div = (double*) malloc(sizeof(double)*som->ndim); ++@@ -695,7 +696,7 @@ ++ case 't': args->action = SOM_TRAIN; break; ++ case 'c': args->action = SOM_CLASSIFY; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfsom.c.pysam.c +++++ python-pysam/bcftools/vcfsom.c.pysam.c ++@@ -37,6 +37,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" ++ ++@@ -358,7 +359,7 @@ ++ if ( !som->w ) error("Could not alloc %"PRIu64" bytes [nbin=%d ndim=%d]\n", (uint64_t)(sizeof(double)*som->size),som->nbin,som->ndim); ++ int i; ++ for (i=0; isize*som->kdim; i++) ++- som->w[i] = (double)random()/RAND_MAX; +++ som->w[i] = random(); ++ som->a_idx = (int*) malloc(sizeof(int)*som->ndim); ++ som->b_idx = (int*) malloc(sizeof(int)*som->ndim); ++ som->div = (double*) malloc(sizeof(double)*som->ndim); ++@@ -697,7 +698,7 @@ ++ case 't': args->action = SOM_TRAIN; break; ++ case 'c': args->action = SOM_CLASSIFY; break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfsort.c +++++ python-pysam/bcftools/vcfsort.c ++@@ -29,13 +29,18 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#endif ++ #include ++ #include +++#include ++ #include "kheap.h" ++ #include "bcftools.h" ++ ++@@ -59,6 +64,33 @@ ++ } ++ args_t; ++ +++void clean_files(args_t *args) +++{ +++ 
int i; +++ fprintf(stderr,"Cleaning\n"); +++ for (i=0; inblk; i++) +++ { +++ blk_t *blk = args->blk + i; +++ if ( blk->fname ) +++ { +++ unlink(blk->fname); +++ free(blk->fname); +++ } +++ if ( blk->rec ) +++ bcf_destroy(blk->rec); +++ } +++ rmdir(args->tmp_dir); +++} +++void clean_files_and_throw(args_t *args, const char *format, ...) +++{ +++ va_list ap; +++ va_start(ap, format); +++ vfprintf(stderr, format, ap); +++ va_end(ap); +++ clean_files(args); +++ exit(-1); +++} +++ ++ int cmp_bcf_pos(const void *aptr, const void *bptr) ++ { ++ bcf1_t *a = *((bcf1_t**)aptr); ++@@ -98,18 +130,20 @@ ++ kstring_t str = {0,0,0}; ++ ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); ++ blk->fname = str.s; +++ blk->rec = NULL; +++ blk->fh = NULL; ++ ++ htsFile *fh = hts_open(blk->fname, "wbu"); ++- if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); ++- bcf_hdr_write(fh, args->hdr); +++ if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); +++ if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); ++ ++ int i; ++ for (i=0; inbuf; i++) ++ { ++- bcf_write(fh, args->hdr, args->buf[i]); +++ if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); ++ bcf_destroy(args->buf[i]); ++ } ++- hts_close(fh); +++ if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. 
%s\n", __func__,blk->fname); ++ ++ args->nbuf = 0; ++ args->mem = 0; ++@@ -128,25 +162,27 @@ ++ void sort_blocks(args_t *args) ++ { ++ htsFile *in = hts_open(args->fname, "r"); ++- if ( !in ) error("Could not read %s\n", args->fname); +++ if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); ++ args->hdr = bcf_hdr_read(in); +++ if ( !args->hdr) clean_files_and_throw(args, "Could not read VCF/BCF headers from %s\n", args->fname); ++ ++ while ( 1 ) ++ { ++ bcf1_t *rec = bcf_init(); ++ int ret = bcf_read1(in, args->hdr, rec); ++- if ( ret < -1 ) error("Error encountered while parsing the input\n"); +++ if ( ret < -1 ) clean_files_and_throw(args,"Error encountered while parsing the input\n"); ++ if ( ret == -1 ) ++ { ++ bcf_destroy(rec); ++ break; ++ } +++ if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); ++ buf_push(args, rec); ++ } ++ buf_flush(args); ++ free(args->buf); ++ ++- if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); +++ if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname); ++ } ++ ++ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) ++@@ -159,14 +195,14 @@ ++ } ++ KHEAP_INIT(blk, blk_t*, blk_is_smaller) ++ ++-void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) +++void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) ++ { ++ if ( !blk->fh ) return; ++ int ret = bcf_read(blk->fh, hdr, blk->rec); ++- if ( ret < -1 ) error("Error reading %s\n", blk->fname); +++ if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname); ++ if ( ret == -1 ) ++ { ++- if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); +++ if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); ++ blk->fh = 0; ++ return; ++ } ++@@ -184,33 +220,26 @@ ++ { ++ blk_t *blk = args->blk + i; ++ blk->fh = hts_open(blk->fname, "r"); ++- if 
( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno)); +++ if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); ++ bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); ++ bcf_hdr_destroy(hdr); ++ blk->rec = bcf_init(); ++- blk_read(bhp, args->hdr, blk); +++ blk_read(args, bhp, args->hdr, blk); ++ } ++ ++ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++- bcf_hdr_write(out, args->hdr); +++ if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ while ( bhp->ndat ) ++ { ++ blk_t *blk = bhp->dat[0]; ++- bcf_write(out, args->hdr, blk->rec); +++ if ( bcf_write(out, args->hdr, blk->rec)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ khp_delete(blk, bhp); ++- blk_read(bhp, args->hdr, blk); +++ blk_read(args, bhp, args->hdr, blk); ++ } ++- if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); +++ if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); +++ +++ clean_files(args); ++ ++- fprintf(stderr,"Cleaning\n"); ++- for (i=0; inblk; i++) ++- { ++- blk_t *blk = args->blk + i; ++- unlink(blk->fname); ++- free(blk->fname); ++- bcf_destroy(blk->rec); ++- } ++- rmdir(args->tmp_dir); ++ free(args->blk); ++ khp_destroy(blk, bhp); ++ fprintf(stderr,"Done\n"); ++@@ -226,7 +255,7 @@ ++ fprintf(stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 ++ fprintf(stderr, " -o, --output-file output file name [stdout]\n"); ++ fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++- fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); +++ fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX]\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++ } ++@@ 
-243,22 +272,40 @@ ++ } ++ ++ void mkdir_p(const char *fmt, ...); ++-void init(args_t *args) +++static void init(args_t *args) ++ { ++- if ( !args->tmp_dir ) +++#ifdef _WIN32 +++ char tmp_path[MAX_PATH]; +++ int ret = GetTempPath(MAX_PATH, tmp_path); +++ if (!ret || ret > MAX_PATH) +++ error("Could not get the path to the temporary folder\n"); +++ if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) +++ error("Full path to the temporary folder is too long\n"); +++ strcat(tmp_path, "/bcftools-sort.XXXXXX"); +++ args->tmp_dir = strdup(tmp_path); +++#else +++ args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); +++#endif +++ size_t len = strlen(args->tmp_dir); +++ if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) ++ { ++- args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); ++- char *tmp_dir = mkdtemp(args->tmp_dir); ++- if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++#ifdef _WIN32 +++ int ret = mkdir(mktemp(args->tmp_dir), 0700); +++ if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++#else +++ char *tmp = mkdtemp(args->tmp_dir); +++ if ( !tmp ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++ int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); +++ if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); +++#endif ++ } ++- else ++- { ++- args->tmp_dir = strdup(args->tmp_dir); ++- mkdir_p(args->tmp_dir); +++ else { +++ mkdir_p("%s/",args->tmp_dir); ++ } +++ ++ fprintf(stderr,"Writing to %s\n", args->tmp_dir); ++ } ++-void destroy(args_t *args) +++static void destroy(args_t *args) ++ { ++ bcf_hdr_destroy(args->hdr); ++ free(args->tmp_dir); ++@@ -298,8 +345,8 @@ ++ default: error("The output type \"%s\" not recognised\n", optarg); ++ }; ++ break; ++- case 'h': usage(args); ++- case '?': usage(args); +++ case 'h': +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ 
} ++ } ++--- python-pysam.orig/bcftools/vcfsort.c.pysam.c +++++ python-pysam/bcftools/vcfsort.c.pysam.c ++@@ -31,13 +31,18 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include +++#ifdef _WIN32 +++#include +++#endif ++ #include ++ #include +++#include ++ #include "kheap.h" ++ #include "bcftools.h" ++ ++@@ -61,6 +66,33 @@ ++ } ++ args_t; ++ +++void clean_files(args_t *args) +++{ +++ int i; +++ fprintf(bcftools_stderr,"Cleaning\n"); +++ for (i=0; inblk; i++) +++ { +++ blk_t *blk = args->blk + i; +++ if ( blk->fname ) +++ { +++ unlink(blk->fname); +++ free(blk->fname); +++ } +++ if ( blk->rec ) +++ bcf_destroy(blk->rec); +++ } +++ rmdir(args->tmp_dir); +++} +++void clean_files_and_throw(args_t *args, const char *format, ...) +++{ +++ va_list ap; +++ va_start(ap, format); +++ vfprintf(bcftools_stderr, format, ap); +++ va_end(ap); +++ clean_files(args); +++ exit(-1); +++} +++ ++ int cmp_bcf_pos(const void *aptr, const void *bptr) ++ { ++ bcf1_t *a = *((bcf1_t**)aptr); ++@@ -100,18 +132,20 @@ ++ kstring_t str = {0,0,0}; ++ ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); ++ blk->fname = str.s; +++ blk->rec = NULL; +++ blk->fh = NULL; ++ ++ htsFile *fh = hts_open(blk->fname, "wbu"); ++- if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); ++- bcf_hdr_write(fh, args->hdr); +++ if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); +++ if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); ++ ++ int i; ++ for (i=0; inbuf; i++) ++ { ++- bcf_write(fh, args->hdr, args->buf[i]); +++ if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); ++ bcf_destroy(args->buf[i]); ++ } ++- hts_close(fh); +++ if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. 
%s\n", __func__,blk->fname); ++ ++ args->nbuf = 0; ++ args->mem = 0; ++@@ -130,25 +164,27 @@ ++ void sort_blocks(args_t *args) ++ { ++ htsFile *in = hts_open(args->fname, "r"); ++- if ( !in ) error("Could not read %s\n", args->fname); +++ if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); ++ args->hdr = bcf_hdr_read(in); +++ if ( !args->hdr) clean_files_and_throw(args, "Could not read VCF/BCF headers from %s\n", args->fname); ++ ++ while ( 1 ) ++ { ++ bcf1_t *rec = bcf_init(); ++ int ret = bcf_read1(in, args->hdr, rec); ++- if ( ret < -1 ) error("Error encountered while parsing the input\n"); +++ if ( ret < -1 ) clean_files_and_throw(args,"Error encountered while parsing the input\n"); ++ if ( ret == -1 ) ++ { ++ bcf_destroy(rec); ++ break; ++ } +++ if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); ++ buf_push(args, rec); ++ } ++ buf_flush(args); ++ free(args->buf); ++ ++- if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); +++ if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname); ++ } ++ ++ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) ++@@ -161,14 +197,14 @@ ++ } ++ KHEAP_INIT(blk, blk_t*, blk_is_smaller) ++ ++-void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) +++void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) ++ { ++ if ( !blk->fh ) return; ++ int ret = bcf_read(blk->fh, hdr, blk->rec); ++- if ( ret < -1 ) error("Error reading %s\n", blk->fname); +++ if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname); ++ if ( ret == -1 ) ++ { ++- if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); +++ if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); ++ blk->fh = 0; ++ return; ++ } ++@@ -186,33 +222,26 @@ ++ { ++ blk_t *blk = args->blk + i; ++ blk->fh = hts_open(blk->fname, "r"); ++- if 
( !blk->fh ) error("Could not read %s: %s\n", blk->fname, strerror(errno)); +++ if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); ++ bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); ++ bcf_hdr_destroy(hdr); ++ blk->rec = bcf_init(); ++- blk_read(bhp, args->hdr, blk); +++ blk_read(args, bhp, args->hdr, blk); ++ } ++ ++ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); ++- bcf_hdr_write(out, args->hdr); +++ if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ while ( bhp->ndat ) ++ { ++ blk_t *blk = bhp->dat[0]; ++- bcf_write(out, args->hdr, blk->rec); +++ if ( bcf_write(out, args->hdr, blk->rec)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); ++ khp_delete(blk, bhp); ++- blk_read(bhp, args->hdr, blk); +++ blk_read(args, bhp, args->hdr, blk); ++ } ++- if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); +++ if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); +++ +++ clean_files(args); ++ ++- fprintf(bcftools_stderr,"Cleaning\n"); ++- for (i=0; inblk; i++) ++- { ++- blk_t *blk = args->blk + i; ++- unlink(blk->fname); ++- free(blk->fname); ++- bcf_destroy(blk->rec); ++- } ++- rmdir(args->tmp_dir); ++ free(args->blk); ++ khp_destroy(blk, bhp); ++ fprintf(bcftools_stderr,"Done\n"); ++@@ -228,7 +257,7 @@ ++ fprintf(bcftools_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 ++ fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); ++ fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); ++- fprintf(bcftools_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); +++ fprintf(bcftools_stderr, " -T, --temp-dir temporary files 
[/tmp/bcftools-sort.XXXXXX]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++ } ++@@ -245,22 +274,40 @@ ++ } ++ ++ void mkdir_p(const char *fmt, ...); ++-void init(args_t *args) +++static void init(args_t *args) ++ { ++- if ( !args->tmp_dir ) +++#ifdef _WIN32 +++ char tmp_path[MAX_PATH]; +++ int ret = GetTempPath(MAX_PATH, tmp_path); +++ if (!ret || ret > MAX_PATH) +++ error("Could not get the path to the temporary folder\n"); +++ if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) +++ error("Full path to the temporary folder is too long\n"); +++ strcat(tmp_path, "/bcftools-sort.XXXXXX"); +++ args->tmp_dir = strdup(tmp_path); +++#else +++ args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); +++#endif +++ size_t len = strlen(args->tmp_dir); +++ if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) ++ { ++- args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); ++- char *tmp_dir = mkdtemp(args->tmp_dir); ++- if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++#ifdef _WIN32 +++ int ret = mkdir(mktemp(args->tmp_dir), 0700); +++ if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++#else +++ char *tmp = mkdtemp(args->tmp_dir); +++ if ( !tmp ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); +++ int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); +++ if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); +++#endif ++ } ++- else ++- { ++- args->tmp_dir = strdup(args->tmp_dir); ++- mkdir_p(args->tmp_dir); +++ else { +++ mkdir_p("%s/",args->tmp_dir); ++ } +++ ++ fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir); ++ } ++-void destroy(args_t *args) +++static void destroy(args_t *args) ++ { ++ bcf_hdr_destroy(args->hdr); ++ free(args->tmp_dir); ++@@ -300,8 +347,8 @@ ++ default: error("The output type \"%s\" not recognised\n", optarg); ++ }; ++ break; ++- case 'h': usage(args); ++- case '?': usage(args); +++ 
case 'h': +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++--- python-pysam.orig/bcftools/vcfstats.c +++++ python-pysam/bcftools/vcfstats.c ++@@ -70,7 +70,7 @@ ++ ++ typedef struct ++ { ++- int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; +++ uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; ++ int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons ++ #if HWE_STATS ++ int *af_hwe; ++@@ -88,12 +88,14 @@ ++ int subst[15]; ++ int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; ++ int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; ++- int *smpl_indel_hets, *smpl_indel_homs; +++ int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; ++ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame ++ unsigned long int *smpl_dp; ++ idist_t dp, dp_sites; ++ int nusr; ++ user_stats_t *usr; +++ double *dvaf; // distribution of the mean indel-allele frequency by length: -m_indel,-(m_indel-1),...-1,0,1,..,m_indel +++ uint32_t *nvaf; ++ } ++ stats_t; ++ ++@@ -476,8 +478,10 @@ ++ stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); ++- stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); ++- stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_ins_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_del_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_ins_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_del_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ 
stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); ++@@ -489,6 +493,8 @@ ++ #endif ++ if ( args->exons_fname ) ++ stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int)); +++ stats->nvaf = (uint32_t*) calloc(stats->m_indel*2+1,sizeof(*stats->nvaf)); +++ stats->dvaf = (double*) calloc(stats->m_indel*2+1,sizeof(*stats->dvaf)); ++ } ++ idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step); ++ idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step); ++@@ -558,8 +564,10 @@ ++ free(stats->smpl_homRR); ++ free(stats->smpl_hapRef); ++ free(stats->smpl_hapAlt); ++- free(stats->smpl_indel_homs); ++- free(stats->smpl_indel_hets); +++ free(stats->smpl_ins_homs); +++ free(stats->smpl_del_homs); +++ free(stats->smpl_ins_hets); +++ free(stats->smpl_del_hets); ++ free(stats->smpl_ts); ++ free(stats->smpl_tv); ++ free(stats->smpl_indels); ++@@ -576,6 +584,8 @@ ++ } ++ free(stats->usr); ++ if ( args->exons ) free(stats->smpl_frm_shifts); +++ free(stats->nvaf); +++ free(stats->dvaf); ++ } ++ for (j=0; jnusr; j++) free(args->usr[j].tag); ++ if ( args->af_bins ) bin_destroy(args->af_bins); ++@@ -844,6 +854,34 @@ ++ } ++ } ++ +++static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) +++{ +++ if ( !fmt ) return; +++ +++ float dvaf; +++ #define BRANCH_INT(type_t,missing,vector_end) { \ +++ type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ +++ if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ +++ if ( p[ial]==missing || p[jal]==missing ) return; \ +++ if ( !p[ial] && !p[jal] ) return; \ +++ dvaf = (float)p[ial]/(p[ial]+p[jal]); \ +++ } +++ switch (fmt->type) { +++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: fprintf(stderr, "[E::%s] 
todo: %d\n", __func__, fmt->type); exit(1); break; +++ } +++ #undef BRANCH_INT +++ +++ int len = line->d.var[ial].n; +++ if ( len < -stats->m_indel ) len = -stats->m_indel; +++ else if ( len > stats->m_indel ) len = stats->m_indel; +++ int bin = stats->m_indel + len; +++ stats->nvaf[bin]++; +++ stats->dvaf[bin] += dvaf; +++} +++ ++ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) ++ { ++ bcf_srs_t *files = args->files; ++@@ -854,6 +892,8 @@ ++ ++ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) ++ { +++ bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; +++ ++ int ref = bcf_acgt2int(*line->d.allele[0]); ++ int is, n_nref = 0, i_nref = 0; ++ for (is=0; isfiles->n_smpl; is++) ++@@ -910,8 +950,31 @@ ++ if ( gt != GT_HOM_RR ) ++ { ++ stats->smpl_indels[is]++; ++- if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++; ++- else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++; +++ +++ if ( gt==GT_HET_RA || gt==GT_HET_AA ) +++ { +++ int is_ins = 0, is_del = 0; +++ if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) +++ { +++ if ( line->d.var[ial].n < 0 ) is_del = 1; +++ else is_ins = 1; +++ update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); +++ } +++ if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) +++ { +++ if ( line->d.var[jal].n < 0 ) is_del = 1; +++ else is_ins = 1; +++ update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); +++ } +++ // Note that alt-het genotypes with both ins and del allele are counted twice!! 
+++ if ( is_del ) stats->smpl_del_hets[is]++; +++ if ( is_ins ) stats->smpl_ins_hets[is]++; +++ } +++ else if ( gt==GT_HOM_AA ) +++ { +++ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; +++ else stats->smpl_ins_homs[is]++; +++ } ++ } ++ if ( stats->smpl_frm_shifts ) ++ { ++@@ -959,6 +1022,37 @@ ++ } ++ #undef BRANCH_INT ++ } +++ else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) +++ { +++ #define BRANCH_INT(type_t,missing,vector_end) { \ +++ int is,iv; \ +++ for (is=0; isfiles->n_smpl; is++) \ +++ { \ +++ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ +++ int dp = 0, has_value = 0; \ +++ for (iv=0; ivn; iv++) \ +++ { \ +++ if ( p[iv]==vector_end ) break; \ +++ if ( p[iv]==missing ) continue; \ +++ has_value = 1; \ +++ dp += p[iv]; \ +++ } \ +++ if ( has_value ) \ +++ { \ +++ (*idist(&stats->dp, dp))++; \ +++ stats->smpl_ndp[is]++; \ +++ stats->smpl_dp[is] += dp; \ +++ } \ +++ } \ +++ } +++ switch (fmt_ptr->type) { +++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; +++ } +++ #undef BRANCH_INT +++ } ++ ++ if ( matched==3 ) ++ { ++@@ -968,6 +1062,7 @@ ++ fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; ++ ++ // only the first ALT allele is considered +++ if (args->ntmp_iaf <= 1) return; // Do not consider invariate sites ++ int iaf = args->tmp_iaf[1]; ++ int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); ++ gtcmp_t *af_stats = line_type&VCF_SNP ? 
args->af_gts_snps : args->af_gts_indels; ++@@ -1019,7 +1114,7 @@ ++ { ++ nmm++; ++ bcf_sr_t *reader = &files->readers[0]; ++- printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); +++ printf("DBG\t%s\t%"PRId64"\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,files->samples[is],gt,gt2); ++ } ++ else ++ { ++@@ -1028,7 +1123,7 @@ ++ } ++ } ++ float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; ++- printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); +++ printf("PSD\t%s\t%"PRId64"\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,nm,nmm,nrd); ++ } ++ } ++ } ++@@ -1162,14 +1257,14 @@ ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++- printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); ++- printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); ++- printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); ++- printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); ++- printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); ++- printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); ++- printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); ++- printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); +++ printf("SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); +++ printf("SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); +++ printf("SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); +++ printf("SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); +++ printf("SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); +++ printf("SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); +++ printf("SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); 
+++ printf("SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); ++ } ++ printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); ++ for (id=0; idnstats; id++) ++@@ -1287,14 +1382,33 @@ ++ } ++ } ++ } ++- printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); +++ printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF\n"); ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++ for (i=stats->m_indel-1; i>=0; i--) ++- if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); +++ { +++ if ( !stats->deletions[i] ) continue; +++ // whops, differently organized arrow, dels are together with ins +++ int bin = stats->m_indel - i - 1; +++ printf("IDD\t%d\t%d\t%d\t", id,-i-1,stats->deletions[i]); +++ if ( stats->nvaf && stats->nvaf[bin] ) +++ printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); +++ else +++ printf("0\t."); +++ printf("\n"); +++ } ++ for (i=0; im_indel; i++) ++- if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); +++ { +++ if ( !stats->insertions[i] ) continue; +++ int bin = stats->m_indel + i + 1; +++ printf("IDD\t%d\t%d\t%d\t", id,i+1,stats->insertions[i]); +++ if ( stats->nvaf && stats->nvaf[bin] ) +++ printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); +++ else +++ printf("0\t."); +++ printf("\n"); +++ } ++ } ++ printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); ++ for (id=0; idnstats; id++) ++@@ -1517,8 +1631,8 @@ ++ } ++ } ++ ++- ++- printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); +++ printf("# PSI, Per-Sample Indels. 
Note that alt-het genotypes with both ins and del allele are counted twice, in both nInsHets and nDelHets.\n"); +++ printf("# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms\n"); ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++@@ -1531,9 +1645,8 @@ ++ in = stats->smpl_frm_shifts[i*3 + 1]; ++ out = stats->smpl_frm_shifts[i*3 + 2]; ++ } ++- int nhom = stats->smpl_indel_homs[i]; ++- int nhet = stats->smpl_indel_hets[i]; ++- printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); +++ printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0, +++ stats->smpl_ins_hets[i],stats->smpl_del_hets[i],stats->smpl_ins_homs[i],stats->smpl_del_homs[i]); ++ } ++ } ++ ++@@ -1609,7 +1722,7 @@ ++ fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++ fprintf(stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); ++- fprintf(stderr, " --threads number of extra decompression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); ++ fprintf(stderr, "\n"); ++ exit(1); ++@@ -1686,7 +1799,7 @@ ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -1715,7 +1828,7 @@ ++ while (fname) ++ { ++ if ( !bcf_sr_add_reader(args->files, fname) ) ++- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ 
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ fname = ++optind < argc ? argv[optind] : NULL; ++ } ++ ++--- python-pysam.orig/bcftools/vcfstats.c.pysam.c +++++ python-pysam/bcftools/vcfstats.c.pysam.c ++@@ -72,7 +72,7 @@ ++ ++ typedef struct ++ { ++- int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; +++ uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; ++ int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons ++ #if HWE_STATS ++ int *af_hwe; ++@@ -90,12 +90,14 @@ ++ int subst[15]; ++ int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; ++ int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; ++- int *smpl_indel_hets, *smpl_indel_homs; +++ int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; ++ int *smpl_frm_shifts; // not-applicable, in-frame, out-frame ++ unsigned long int *smpl_dp; ++ idist_t dp, dp_sites; ++ int nusr; ++ user_stats_t *usr; +++ double *dvaf; // distribution of the mean indel-allele frequency by length: -m_indel,-(m_indel-1),...-1,0,1,..,m_indel +++ uint32_t *nvaf; ++ } ++ stats_t; ++ ++@@ -478,8 +480,10 @@ ++ stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); ++- stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); ++- stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_ins_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_del_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_ins_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); +++ stats->smpl_del_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_ts = (int *) 
calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); ++ stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); ++@@ -491,6 +495,8 @@ ++ #endif ++ if ( args->exons_fname ) ++ stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int)); +++ stats->nvaf = (uint32_t*) calloc(stats->m_indel*2+1,sizeof(*stats->nvaf)); +++ stats->dvaf = (double*) calloc(stats->m_indel*2+1,sizeof(*stats->dvaf)); ++ } ++ idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step); ++ idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step); ++@@ -560,8 +566,10 @@ ++ free(stats->smpl_homRR); ++ free(stats->smpl_hapRef); ++ free(stats->smpl_hapAlt); ++- free(stats->smpl_indel_homs); ++- free(stats->smpl_indel_hets); +++ free(stats->smpl_ins_homs); +++ free(stats->smpl_del_homs); +++ free(stats->smpl_ins_hets); +++ free(stats->smpl_del_hets); ++ free(stats->smpl_ts); ++ free(stats->smpl_tv); ++ free(stats->smpl_indels); ++@@ -578,6 +586,8 @@ ++ } ++ free(stats->usr); ++ if ( args->exons ) free(stats->smpl_frm_shifts); +++ free(stats->nvaf); +++ free(stats->dvaf); ++ } ++ for (j=0; jnusr; j++) free(args->usr[j].tag); ++ if ( args->af_bins ) bin_destroy(args->af_bins); ++@@ -846,6 +856,34 @@ ++ } ++ } ++ +++static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) +++{ +++ if ( !fmt ) return; +++ +++ float dvaf; +++ #define BRANCH_INT(type_t,missing,vector_end) { \ +++ type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ +++ if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ +++ if ( p[ial]==missing || p[jal]==missing ) return; \ +++ if ( !p[ial] && !p[jal] ) return; \ +++ dvaf = (float)p[ial]/(p[ial]+p[jal]); \ +++ } +++ switch (fmt->type) { +++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case 
BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break; +++ } +++ #undef BRANCH_INT +++ +++ int len = line->d.var[ial].n; +++ if ( len < -stats->m_indel ) len = -stats->m_indel; +++ else if ( len > stats->m_indel ) len = stats->m_indel; +++ int bin = stats->m_indel + len; +++ stats->nvaf[bin]++; +++ stats->dvaf[bin] += dvaf; +++} +++ ++ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) ++ { ++ bcf_srs_t *files = args->files; ++@@ -856,6 +894,8 @@ ++ ++ if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) ++ { +++ bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; +++ ++ int ref = bcf_acgt2int(*line->d.allele[0]); ++ int is, n_nref = 0, i_nref = 0; ++ for (is=0; isfiles->n_smpl; is++) ++@@ -912,8 +952,31 @@ ++ if ( gt != GT_HOM_RR ) ++ { ++ stats->smpl_indels[is]++; ++- if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++; ++- else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++; +++ +++ if ( gt==GT_HET_RA || gt==GT_HET_AA ) +++ { +++ int is_ins = 0, is_del = 0; +++ if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) +++ { +++ if ( line->d.var[ial].n < 0 ) is_del = 1; +++ else is_ins = 1; +++ update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); +++ } +++ if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) +++ { +++ if ( line->d.var[jal].n < 0 ) is_del = 1; +++ else is_ins = 1; +++ update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); +++ } +++ // Note that alt-het genotypes with both ins and del allele are counted twice!! 
+++ if ( is_del ) stats->smpl_del_hets[is]++; +++ if ( is_ins ) stats->smpl_ins_hets[is]++; +++ } +++ else if ( gt==GT_HOM_AA ) +++ { +++ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; +++ else stats->smpl_ins_homs[is]++; +++ } ++ } ++ if ( stats->smpl_frm_shifts ) ++ { ++@@ -961,6 +1024,37 @@ ++ } ++ #undef BRANCH_INT ++ } +++ else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) +++ { +++ #define BRANCH_INT(type_t,missing,vector_end) { \ +++ int is,iv; \ +++ for (is=0; isfiles->n_smpl; is++) \ +++ { \ +++ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ +++ int dp = 0, has_value = 0; \ +++ for (iv=0; ivn; iv++) \ +++ { \ +++ if ( p[iv]==vector_end ) break; \ +++ if ( p[iv]==missing ) continue; \ +++ has_value = 1; \ +++ dp += p[iv]; \ +++ } \ +++ if ( has_value ) \ +++ { \ +++ (*idist(&stats->dp, dp))++; \ +++ stats->smpl_ndp[is]++; \ +++ stats->smpl_dp[is] += dp; \ +++ } \ +++ } \ +++ } +++ switch (fmt_ptr->type) { +++ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; +++ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; +++ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; +++ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; +++ } +++ #undef BRANCH_INT +++ } ++ ++ if ( matched==3 ) ++ { ++@@ -970,6 +1064,7 @@ ++ fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; ++ ++ // only the first ALT allele is considered +++ if (args->ntmp_iaf <= 1) return; // Do not consider invariate sites ++ int iaf = args->tmp_iaf[1]; ++ int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); ++ gtcmp_t *af_stats = line_type&VCF_SNP ? 
args->af_gts_snps : args->af_gts_indels; ++@@ -1021,7 +1116,7 @@ ++ { ++ nmm++; ++ bcf_sr_t *reader = &files->readers[0]; ++- fprintf(bcftools_stdout, "DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); +++ fprintf(bcftools_stdout, "DBG\t%s\t%"PRId64"\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,files->samples[is],gt,gt2); ++ } ++ else ++ { ++@@ -1030,7 +1125,7 @@ ++ } ++ } ++ float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; ++- fprintf(bcftools_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); +++ fprintf(bcftools_stdout, "PSD\t%s\t%"PRId64"\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,nm,nmm,nrd); ++ } ++ } ++ } ++@@ -1164,14 +1259,14 @@ ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); ++- fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); +++ 
fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); +++ fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); ++ } ++ fprintf(bcftools_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); ++ for (id=0; idnstats; id++) ++@@ -1289,14 +1384,33 @@ ++ } ++ } ++ } ++- fprintf(bcftools_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); +++ fprintf(bcftools_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF\n"); ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++ for (i=stats->m_indel-1; i>=0; i--) ++- if ( stats->deletions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); +++ { +++ if ( !stats->deletions[i] ) continue; +++ // whops, differently organized arrow, dels are together with ins +++ int bin = stats->m_indel - i - 1; +++ fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\t", id,-i-1,stats->deletions[i]); +++ if ( stats->nvaf && stats->nvaf[bin] ) +++ fprintf(bcftools_stdout, "%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); +++ else +++ fprintf(bcftools_stdout, "0\t."); +++ fprintf(bcftools_stdout, "\n"); +++ } ++ for (i=0; im_indel; i++) ++- if ( stats->insertions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); +++ { +++ if ( !stats->insertions[i] ) continue; +++ int bin = stats->m_indel + i + 1; +++ fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\t", id,i+1,stats->insertions[i]); +++ if ( stats->nvaf && stats->nvaf[bin] ) 
+++ fprintf(bcftools_stdout, "%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); +++ else +++ fprintf(bcftools_stdout, "0\t."); +++ fprintf(bcftools_stdout, "\n"); +++ } ++ } ++ fprintf(bcftools_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); ++ for (id=0; idnstats; id++) ++@@ -1519,8 +1633,8 @@ ++ } ++ } ++ ++- ++- fprintf(bcftools_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); +++ fprintf(bcftools_stdout, "# PSI, Per-Sample Indels. Note that alt-het genotypes with both ins and del allele are counted twice, in both nInsHets and nDelHets.\n"); +++ fprintf(bcftools_stdout, "# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms\n"); ++ for (id=0; idnstats; id++) ++ { ++ stats_t *stats = &args->stats[id]; ++@@ -1533,9 +1647,8 @@ ++ in = stats->smpl_frm_shifts[i*3 + 1]; ++ out = stats->smpl_frm_shifts[i*3 + 2]; ++ } ++- int nhom = stats->smpl_indel_homs[i]; ++- int nhet = stats->smpl_indel_hets[i]; ++- fprintf(bcftools_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); +++ fprintf(bcftools_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0, +++ stats->smpl_ins_hets[i],stats->smpl_del_hets[i],stats->smpl_ins_homs[i],stats->smpl_del_homs[i]); ++ } ++ } ++ ++@@ -1611,7 +1724,7 @@ ++ fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); ++ fprintf(bcftools_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); ++- fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); +++ 
fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); ++ fprintf(bcftools_stderr, "\n"); ++ exit(1); ++@@ -1688,7 +1801,7 @@ ++ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 'h': ++- case '?': usage(); +++ case '?': usage(); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -1717,7 +1830,7 @@ ++ while (fname) ++ { ++ if ( !bcf_sr_add_reader(args->files, fname) ) ++- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ fname = ++optind < argc ? argv[optind] : NULL; ++ } ++ ++--- python-pysam.orig/bcftools/vcfview.c +++++ python-pysam/bcftools/vcfview.c ++@@ -32,6 +32,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -85,11 +86,14 @@ ++ ++ if (args->calc_ac && args->update_info) ++ { ++- bcf_hdr_append(args->hdr,"##INFO="); ++- bcf_hdr_append(args->hdr,"##INFO="); +++ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) +++ error_errno("[%s] Failed to add \"AC\" INFO header", __func__); +++ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) +++ error_errno("[%s] Failed to add \"AN\" INFO header", __func__); ++ } ++ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); ++- else bcf_hdr_sync(args->hdr); +++ else if (bcf_hdr_sync(args->hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ ++ // setup sample data ++ if (args->sample_names) ++@@ -452,7 +456,7 @@ ++ if (args->trim_alts) ++ { ++ int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); ++- if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? 
args->hsub : args->hdr, line), line->pos+1); +++ if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1); ++ } ++ if (args->phased) { ++ int phased = bcf_all_phased(args->hdr, line); ++@@ -503,10 +507,10 @@ ++ fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++ fprintf(stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); ++ fprintf(stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); ++- fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); +++ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Subset options:\n"); ++- fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); +++ fprintf(stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); ++ fprintf(stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); ++ fprintf(stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); ++ fprintf(stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); ++@@ -694,7 +698,7 @@ ++ } ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", optarg); ++ } ++ } ++@@ -737,12 +741,14 @@ ++ } ++ ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, 
fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); ++ if (args->print_header) ++- bcf_hdr_write(args->out, out_hdr); +++ { +++ if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); +++ } ++ else if ( args->output_type & FT_BCF ) ++ error("BCF output requires header, cannot proceed with -H\n"); ++ ++@@ -753,8 +759,7 @@ ++ { ++ bcf1_t *line = args->files->readers[0].buffer[0]; ++ if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); ++- if ( subset_vcf(args, line) ) ++- bcf_write1(args->out, out_hdr, line); +++ if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); ++ } ++ ret = args->files->errnum; ++ if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); ++--- python-pysam.orig/bcftools/vcfview.c.pysam.c +++++ python-pysam/bcftools/vcfview.c.pysam.c ++@@ -34,6 +34,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -87,11 +88,14 @@ ++ ++ if (args->calc_ac && args->update_info) ++ { ++- bcf_hdr_append(args->hdr,"##INFO="); ++- bcf_hdr_append(args->hdr,"##INFO="); +++ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) +++ error_errno("[%s] Failed to add \"AC\" INFO header", __func__); +++ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) +++ error_errno("[%s] Failed to add \"AN\" INFO header", __func__); ++ } ++ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); ++- else bcf_hdr_sync(args->hdr); +++ else if (bcf_hdr_sync(args->hdr) < 0) +++ error_errno("[%s] Failed to update header", __func__); ++ ++ // setup sample data ++ if (args->sample_names) ++@@ -454,7 
+458,7 @@ ++ if (args->trim_alts) ++ { ++ int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); ++- if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); +++ if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1); ++ } ++ if (args->phased) { ++ int phased = bcf_all_phased(args->hdr, line); ++@@ -505,10 +509,10 @@ ++ fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); ++ fprintf(bcftools_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); ++ fprintf(bcftools_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); ++- fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); +++ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); ++ fprintf(bcftools_stderr, "\n"); ++ fprintf(bcftools_stderr, "Subset options:\n"); ++- fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); +++ fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); ++ fprintf(bcftools_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); ++ fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); ++ fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); ++@@ -696,7 +700,7 @@ ++ } ++ case 9 : args->n_threads = strtol(optarg, 0, 0); break; ++ case 8 : args->record_cmd_line = 0; break; ++- case '?': usage(args); +++ case '?': usage(args); break; ++ default: error("Unknown argument: %s\n", 
optarg); ++ } ++ } ++@@ -739,12 +743,14 @@ ++ } ++ ++ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); ++- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); +++ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); ++ ++ init_data(args); ++ bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); ++ if (args->print_header) ++- bcf_hdr_write(args->out, out_hdr); +++ { +++ if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); +++ } ++ else if ( args->output_type & FT_BCF ) ++ error("BCF output requires header, cannot proceed with -H\n"); ++ ++@@ -755,8 +761,7 @@ ++ { ++ bcf1_t *line = args->files->readers[0].buffer[0]; ++ if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); ++- if ( subset_vcf(args, line) ) ++- bcf_write1(args->out, out_hdr, line); +++ if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); ++ } ++ ret = args->files->errnum; ++ if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); ++--- python-pysam.orig/bcftools/version.c +++++ python-pysam/bcftools/version.c ++@@ -25,6 +25,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" ++ #include "version.h" ++@@ -44,6 +45,22 @@ ++ exit(-1); ++ } ++ +++void error_errno(const char *format, ...) 
+++{ +++ va_list ap; +++ int e = errno; +++ va_start(ap, format); +++ vfprintf(stderr, format, ap); +++ va_end(ap); +++ if (e) { +++ fprintf(stderr, ": %s\n", strerror(e)); +++ } else { +++ fprintf(stderr, "\n"); +++ } +++ exit(-1); +++} +++ +++ ++ const char *hts_bcf_wmode(int file_type) ++ { ++ if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF ++--- python-pysam.orig/bcftools/version.c.pysam.c +++++ python-pysam/bcftools/version.c.pysam.c ++@@ -27,6 +27,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include "bcftools.h" ++ #include "version.h" ++@@ -46,6 +47,22 @@ ++ exit(-1); ++ } ++ +++void error_errno(const char *format, ...) +++{ +++ va_list ap; +++ int e = errno; +++ va_start(ap, format); +++ vfprintf(bcftools_stderr, format, ap); +++ va_end(ap); +++ if (e) { +++ fprintf(bcftools_stderr, ": %s\n", strerror(e)); +++ } else { +++ fprintf(bcftools_stderr, "\n"); +++ } +++ exit(-1); +++} +++ +++ ++ const char *hts_bcf_wmode(int file_type) ++ { ++ if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF ++--- python-pysam.orig/bcftools/version.h +++++ python-pysam/bcftools/version.h ++@@ -1 +1 @@ ++-#define BCFTOOLS_VERSION "1.9" +++#define BCFTOOLS_VERSION "1.10" diff --cc debian/patches/clean_less index 0000000,0000000..b1ad6c7 new file mode 100644 --- /dev/null +++ b/debian/patches/clean_less @@@ -1,0 -1,0 +1,15 @@@ ++Author: Michael R. 
Crusoe ++Last-Update: 2020-01-23 14:31:06 +0100 ++Description: Do not clean *.bam files ++ ++--- a/tests/pysam_data/Makefile +++++ b/tests/pysam_data/Makefile ++@@ -100,7 +100,7 @@ ++ cp ex1.cram $@ ++ ++ clean: ++- rm -fr [a-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \ +++ rm -fr *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \ ++ all.stamp *~ calDepth *.dSYM pysam_*.sam \ ++ ex2.sam ex2.sam.gz ex1.sam ex1.fa.gz \ ++ with_md.sam.gz \ diff --cc debian/patches/hts1.10 index 0000000,0000000..6fbe3ef new file mode 100644 --- /dev/null +++ b/debian/patches/hts1.10 @@@ -1,0 -1,0 +1,104 @@@ ++Author: Michael R. Crusoe ++Description: sync with htslib, samtools, and bcftools 1.10 ++ ++- Remove symbols that were removed in libhts3 (hts_useek and hts_utell) ++- use devtools/import.py and the contents of the samtools & bcftools 1.10 ++Debian packages with their patches fully applied ++ ++--- python-pysam.orig/pysam/htslib_util.h +++++ python-pysam/pysam/htslib_util.h ++@@ -5,9 +5,6 @@ ++ #include "htslib/vcf.h" ++ #include "htslib/khash.h" ++ ++-int hts_useek(htsFile *fp, long uoffset, int where); ++-long hts_utell(htsFile *fp); ++- ++ int hts_set_verbosity(int verbosity); ++ int hts_get_verbosity(void); ++ ++--- python-pysam.orig/pysam/libchtslib.pxd +++++ python-pysam/pysam/libchtslib.pxd ++@@ -632,8 +632,6 @@ ++ int8_t HTS_FMT_CRAI ++ ++ BGZF *hts_get_bgzfp(htsFile *fp) ++- int hts_useek(htsFile *fp, long uoffset, int where) ++- long hts_utell(htsFile *fp) ++ ++ ctypedef struct hts_idx_t ++ ++--- python-pysam.orig/tests/00README.txt +++++ python-pysam/tests/00README.txt ++@@ -15,7 +15,7 @@ ++ To try samtools, you may run the following commands: ++ ++ samtools faidx ex1.fa # index the reference FASTA ++- samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM +++ samtools view -bt ex1.fa.fai -o ex1.bam ex1.sam.gz # SAM->BAM ++ samtools index ex1.bam # index BAM ++ samtools tview ex1.bam ex1.fa # view alignment ++ samtools pileup -cf ex1.fa ex1.bam # 
pileup and consensus ++--- python-pysam.orig/tests/pysam_data/Makefile +++++ python-pysam/tests/pysam_data/Makefile ++@@ -32,7 +32,7 @@ ++ samtools calmd --output-fmt BAM $^ > $@ ++ ++ #%.bam: %.sam ex1.fa.fai ++-# samtools import ex1.fa.fai $< $@ +++# samtools view -bt ex1.fa.fai -i $@ $< ++ ++ uncompressed.bam: ex2.sam ++ samtools view -buS $< > $@ ++@@ -53,7 +53,7 @@ ++ samtools faidx ex1.fa ++ ++ ex1.bam:ex1.sam.gz ex1.fa.fai ++- samtools import ex1.fa.fai ex1.sam.gz ex1.bam +++ samtools view -bt ex1.fa.fai -o ex1.bam ex1.sam.gz ++ ++ %.bam.bai:%.bam ++ samtools index $< ++@@ -73,7 +73,7 @@ ++ ++ example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam ++ touch tmp.list ++- samtools import tmp.list $< $@ +++ samtools view -bt tmp.list -o $@ $< ++ rm -f tmp.list ++ ++ example_bai.bam: ex1.bam ++--- python-pysam.orig/setup.py +++++ python-pysam/setup.py ++@@ -159,8 +159,7 @@ ++ package_list = ['pysam', ++ 'pysam.include', ++ 'pysam.include.samtools', ++- 'pysam.include.bcftools', ++- 'pysam.include.samtools.win32'] +++ 'pysam.include.bcftools'] ++ package_dirs = {'pysam': 'pysam', ++ 'pysam.include.samtools': 'samtools', ++ 'pysam.include.bcftools': 'bcftools'} ++--- python-pysam.orig/pysam/libchtslib.pyx +++++ python-pysam/pysam/libchtslib.pyx ++@@ -490,8 +490,7 @@ ++ with nogil: ++ ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) ++ elif self.htsfile.format.compression == no_compression: ++- with nogil: ++- ret = hts_useek(self.htsfile, offset, SEEK_SET) +++ ret = 0 if (hseek(self.htsfile.fp.hfile, offset, SEEK_SET) >= 0) else -1 ++ else: ++ raise NotImplementedError("seek not implemented in files compressed by method {}".format( ++ self.htsfile.format.compression)) ++@@ -509,8 +508,7 @@ ++ with nogil: ++ ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) ++ elif self.htsfile.format.compression == no_compression: ++- with nogil: ++- ret = hts_utell(self.htsfile) +++ ret = htell(self.htsfile.fp.hfile) ++ elif self.htsfile.format.format == cram: 
++ with nogil: ++ ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) diff --cc debian/patches/samtools_v1.10 index 0000000,0000000..14400c7 new file mode 100644 --- /dev/null +++ b/debian/patches/samtools_v1.10 @@@ -1,0 -1,0 +1,3371 @@@ ++Author: Michael R. Crusoe ++Description: support samtools 1.10 as it is more strict ++ ++--- python-pysam.orig/tests/pysam_data/example_empty_with_header.sam +++++ /dev/null ++@@ -1 +0,0 @@ ++-@HD VN:1.3 SO:coordinate ++--- python-pysam.orig/tests/pysam_data/rg_with_tab.sam +++++ /dev/null ++@@ -1,3273 +0,0 @@ ++-@SQ SN:chr1 LN:1575 ++-@SQ SN:chr2 LN:1584 ++-@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq ++-EAS56_57:6:190:289:82 69 chr1 100 0 * = 100 0 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; MF:i:192 ++-EAS56_57:6:190:289:82 137 chr1 100 73 35M = 100 0 AGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCAC <<<<<<;<<<<<<<<<<;<<;<<<<;8<6;9;;2; MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:141:80:875 99 chr1 110 99 35M = 265 190 AGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAA <<<<<<<<<<<<<<<<<<<<<<:<<8;<<8+7;-7 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:40:1128:1940 163 chr1 112 99 35M = 291 214 CCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACC <<<<<<<<<<<<<<<<<<<;<<5;;<<<9;;;;7: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:290:319:736 69 chr1 113 0 * = 113 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<:7:<.<<<<7<<.<.<<.9*<4<:<4%74 MF:i:192 ++-EAS51_62:5:290:319:736 137 chr1 113 73 35M = 113 0 CGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCT ==;=======7====6=;==:;;====66=::27: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_597:2:132:493:921 69 chr1 119 0 * = 119 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<<<<<<<<<<<<<<;<<<<77;0<;;6777 MF:i:192 ++-B7_597:2:132:493:921 
137 chr1 119 75 35M = 119 0 ACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTG <<<<<<<<<<<<<<<<;<<<<<<<<;<<<<;;88: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:7:283:799:560 163 chr1 121 66 35M = 283 197 GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGAC <<<<+<<<<8<<<+<<<<<;<<:07;8;7402447 MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:1:225:195:543 99 chr1 123 99 35M = 299 211 GGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTG <<<<<<<<<<<<<<<<<<<<<<<9<<;::388998 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:6:114:714:317 99 chr1 126 99 35M = 311 220 TGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<5;<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:1:70:147:84 163 chr1 128 73 35M = 285 192 CCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGA <<<<<<<<<<<<<<<<<<<<;<<<<<<<<<;(5<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:187:227:818 163 chr1 129 99 35M = 290 196 CAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<;<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:4:77:29:126 99 chr1 131 99 35M = 315 219 GCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCT <<<<<<<<<<3<<<<<<<;;;7<;<<449<-:977 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:4:327:795:103 99 chr1 133 99 35M = 302 204 ACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:3:139:117:262 69 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<7<<<<<<<<<<<<<<<<<<<;<;<<<<<37;3 MF:i:192 ++-EAS114_30:3:139:117:262 137 chr1 135 76 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<;<<<<<<<<<<<<<:<<<<<:<<8<<<<:<: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:5:29:817:854 73 chr1 135 77 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:5:29:817:854 133 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTTTATGTGAAC <<<<<<<<<<<<<<<1..;:;;;;1%407)07&7. 
MF:i:192 ++-EAS192_3:6:170:169:57 163 chr1 138 99 35M = 296 193 GGCTTGACCTCTGGTGACTGCCAGAGCTGCTGGCC <<<<<;<<<<<<;<<<<<<<<<<<<:<<<<<;;+% MF:i:18 Aq:i:45 NM:i:2 UQ:i:30 H0:i:0 H1:i:1 ++-B7_595:4:84:802:737 99 chr1 140 68 35M = 284 179 CTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAG <<<<<<<<<<;9<9<<<;<<;73;<<<<<37;1+. MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_4:7:78:583:670 163 chr1 142 99 35M = 316 209 TAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCT <<<<<<<<<<;;;<;;<<<:7;5;<5;;<2--8-; MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:90:435:691 99 chr1 147 99 35M = 318 206 TCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGT <<<<<<<<<<;<<<;<<<<:<<<;<81;<<1;784 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:3:13:122:187 163 chr1 153 99 35M = 343 225 GACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCCTT <<<<<<<;<;<<<;<<<<:;6<<<<;;;;:<<%%< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS221_1:6:69:735:1915 99 chr1 154 99 35M = 321 202 ACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTT <<<<<<<<;<<<<<<<<<;<<<<;<8<<<<;1:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:5:66:959:1311 163 chr1 159 95 35M = 336 212 CAGAGCTGCTGGCAAGCTAGAGGCCCATCTGGAGC ;;4;;;+;;;-01;;&-;;4;;&;;73)(&**274 MF:i:18 Aq:i:31 NM:i:2 UQ:i:12 H0:i:0 H1:i:1 ++-EAS56_57:6:325:759:288 99 chr1 163 99 35M = 341 213 GCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCT 8<;<<<<81<<<<<;<<;<<<;9<<<<1>><<<< MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:240:264:231 121 chr1 213 66 35M = 213 0 TGTAATGAAAACTATATTTATGCTATTCAGTTCTA 9;,;;62<9<)29<<<;96<<<;<<7<<<<<<;<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:240:264:231 181 chr1 213 0 * = 213 0 CAACAGATCAAGAAGGAGGGGCAATGGACGAGTTA %15+5022))0&<<)0)+7:4+&<0<<:0<<<7<< MF:i:192 ++-EAS1_93:7:14:426:613 99 chr1 214 99 35M = 379 200 GTAATGAAAACTATATTTATGCTATTCAGTTCTAA ======;=;==========;;==3=;==-=<;<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:2:173:995:93 163 chr1 215 99 35M = 382 202 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<<<<<<<<<<<<<7:<<<<;:<:<<<<:7 MF:i:18 Aq:i:73 NM:i:0 
UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:6:195:348:703 163 chr1 215 99 35M = 353 173 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<;<<<<<;:<<<<<<<<<<<<:<1:<:7< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:62:879:264 163 chr1 216 99 35M = 396 215 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:4:83:452:970 99 chr1 216 99 35M = 379 198 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT ==========================;======== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:2:64:1318:1711 99 chr1 218 99 35M = 389 206 TGAAAACTATATTTATGCTATTCAGTTCTAAATAT <<<<<<<<<<<<<<<<7<<<<<<<:<<<<<2<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:8:113:968:19 83 chr1 219 99 35M = 50 -204 GAAAACTATATTTATGCTATTCAGTTCTAAATATA 8;<;8;9<<<<<<<9<:<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:4:160:896:275 163 chr1 220 99 35M = 387 202 AAAACTATATTTATGCTATTCAGTTCTAAATATAG ============<====<==<====<==<==;=:6 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:6:181:191:418 163 chr1 221 99 36M = 387 202 AAACTATATTTATGCTATTCAGTTCTAAATATAGAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<988 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:242:354:637 99 chr1 222 99 36M = 417 231 AACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<6<;; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:1:122:77:789 163 chr1 223 99 35M = 396 208 ACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<:<4<<9<:7<<<:<<<7<<<<<<<<<<9<9<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:5:42:540:501 147 chr1 224 99 36M = 60 -200 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:6:155:12:674 83 chr1 224 99 36M = 52 -208 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<;<<<<;<<<<8<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:4:106:316:452 147 chr1 224 99 36M = 49 -211 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT 
:<<<<<;<<<<:<<:<<<<<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:5:89:525:113 163 chr1 227 78 40M = 397 210 TATTTATGCTATTCAGTTATAAATATAGAAATTGAAACAG <1<7<6;+0;7;7'<70;-<7<:<:<<5<<:9<5:7:%:7 MF:i:18 Aq:i:39 NM:i:1 UQ:i:12 H0:i:0 H1:i:1 ++-EAS54_65:3:321:311:983 147 chr1 228 99 35M = 51 -212 ATTTATGCTATTCAGTTCTAAATATAGAAATTGAA ;;4;;<7<<<<<<77<<<<<<<<<<17<<<<<<<< MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_65:8:76:493:708 147 chr1 229 44 35M = 73 -191 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA 5/)63.&1517(544(055(0454&7706566679 MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:2:125:628:79 163 chr1 229 99 35M = 400 205 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA ==================<6<====<<:<==7;:: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:5:78:583:499 83 chr1 229 74 35M = 37 -227 TTTACGCTATTCAGTACTAAATATAGAAATTGAAA &6&9774&<;67<44&-4<;<9<7<<<<<;<<<<< MF:i:18 Aq:i:37 NM:i:2 UQ:i:27 H0:i:0 H1:i:1 ++-EAS54_67:3:175:730:949 83 chr1 230 99 35M = 70 -195 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC <<<<;+<<<<7<;<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:84:275:1572 163 chr1 230 99 35M = 394 199 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC /6;;;4;;;;;;;;7;;4;.4;;;;;6;;;77077 MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:4:248:753:731 99 chr1 231 99 35M = 402 206 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA <<<<<<<<<<<8<<<<<<<<<<<<:<<<<&<:<.: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:9:1289:215 99 chr1 231 99 35M = 394 198 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA ;;;;;;9;;;67;;;;;99;9;;;;;;;;977747 MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:188:802:71 163 chr1 232 99 35M = 415 218 ATGCTATTCAGTTCTAAATATAGAAATTGAAACAG <<<<<<<<<;<<<<<9<<<:<<<:<<<<<<:<<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:7:252:171:323 83 chr1 234 99 35M = 43 -226 GCTATTCAGTTCTAAATATAGAAATTGAAACAGCT ;8<;<=3=6==:====;;======;========== MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS192_3:5:223:142:410 147 chr1 235 99 35M = 60 -210 CTATTCAGTTCTAAATATAGAAATTGAAACAGCTG 8;<<<;<<<<;<<<<<<;<;<<<<<<<<<<<<;<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:1:243:10:911 83 chr1 236 99 35M = 63 -208 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;<;;;<4;9:<<<;<<;<<<<<;;<<<<<<<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:6:5:730:1436 163 chr1 236 99 35M = 403 202 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;;;;;;;;;;;;;;;;;;8;;;;;8;;;;;67777 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:2:57:1672:1890 121 chr1 236 75 40M = 236 0 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTA :;;;9<8;;*<<<<<<:<<<<<<<<1:<<<<<<<<<<<7< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:2:57:1672:1890 181 chr1 236 0 * = 236 0 CCCCCCCCCCCCCCCCCAGCCACTGCGGCCCCCCCAGCCA -+)%)'-'+,,<066,))090+:&486083:5&&:<<5<0 MF:i:192 ++-EAS1_105:2:299:360:220 99 chr1 237 99 35M = 403 201 ATTCAGTTCTAAATATAGAAATTGAAACAGCTGTG <<<<<<<9<9<<<<<<<<<<<<<<<<<5<;<0<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:2:24:1037:84 163 chr1 238 99 35M = 415 212 TTCAGTTCTAAATATAGAAATTGAAACAGCTGTGT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<;:<57< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:3:86:823:683 163 chr1 240 99 35M = 408 203 CAGTTCTAAATATAGAAATTGAAACAGCTGTGTTT <<<<<<<<;<<<<<<<<<<<<<<<<<<<<;;9<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:130:568:978 99 chr1 246 88 35M = 434 223 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGAC 7<<;<<;<7<:;<7<<<<<<<<);4;+<7+3+%;< MF:i:18 Aq:i:24 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS114_45:4:73:1208:495 163 chr1 246 99 35M = 431 220 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGCC ;;;;;;;;;;;;;;;;;;;;;;;;5;;;;;37377 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:7:264:642:506 99 chr1 247 99 35M = 420 208 AAATATAGAAATTGAAACAGCTGTGTTTATTGTAT <<;<<<<<<;<<<;:;;:;;<<;<<<<;*+;*&.4 MF:i:18 Aq:i:56 NM:i:3 UQ:i:28 H0:i:1 H1:i:0 ++-EAS114_28:5:104:350:749 163 chr1 247 99 36M = 415 204 AAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <<8<<<<<<<<<<<;<<<<<<<<0;<<<9;<85;;; 
MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_61:6:227:259:597 147 chr1 248 99 35M = 61 -222 AATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <8<;2;9;<;;-92<;;;<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:61 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_32:7:113:809:364 99 chr1 250 99 35M = 413 198 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<7<;<;<<<4 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:218:877:489 83 chr1 250 86 35M = 80 -205 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG 9<<<8<<<;<9<<<<<<<<<<;<<<<<<<<<<<<< MF:i:18 Aq:i:10 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:7:20:979:96 83 chr1 254 99 35M = 79 -210 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA '9996;(:;-<;1<<<<=<<<<=<<<<<<<<<<<< MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:259:219:114 99 chr1 254 99 35M = 411 192 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA <<<<<<<<<<<<<<<;<<<<<<7<7<<<<<0<<9< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:6:13:1034:1144 99 chr1 256 99 35M = 429 208 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<<<<<<<<<<<<<<<<<<<<<<<3<<<;<<;<++ MF:i:18 Aq:i:69 NM:i:2 UQ:i:48 H0:i:1 H1:i:0 ++-EAS221_1:2:29:1486:672 147 chr1 256 99 35M = 79 -212 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<:<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<++ MF:i:18 Aq:i:29 NM:i:2 UQ:i:54 H0:i:0 H1:i:0 ++-EAS139_11:7:46:695:738 163 chr1 259 74 35M = 428 204 TGAAACAGCTGAGTTTAGCGCCTGTGTTCACATAG <;<<<<;<<),&4<3<<7&7<0;)).3;79;7<;0 MF:i:130 Aq:i:74 NM:i:3 UQ:i:18 H0:i:0 H1:i:0 ++-EAS139_11:8:26:1221:222 163 chr1 261 99 35M = 446 220 AAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:190:727:308 147 chr1 263 99 35M = 103 -195 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ;;<;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:3:284:261:124 83 chr1 263 99 35M = 79 -219 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ===27===.====&===========;;======== MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:141:80:875 147 chr1 265 99 35M = 110 -190 
AGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCA 6/<;84<;<;<<<<<<5<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:3:24:1135:563 163 chr1 266 99 40M = 446 220 GCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACC <<<<:<<<<:1:<<<<<<.<<<<<<<<;<;;;43+:30:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:2:90:986:1224 83 chr1 267 99 35M = 67 -235 CTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAAC <7*37;;;;;;;9<<;<7<<<<<<<<<<<;;<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:287:492:169 99 chr1 269 99 36M = 449 216 GTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAAC <<<7<<<<<<<<<<<<<<<<<<<<<<<8;;<;6<<; MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:1:48:9:409 99 chr1 271 75 18M5I12M = 464 228 GTTTAGTGCCTTTGTTCACATAGACCCCCTTGCAA <<<<<<<<<<<<<:<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:75 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS139_19:1:87:1222:878 163 chr1 272 10 40M = 435 203 TATAGGGCCTTTGTTCAAACCCCTTGCAACAACCTTGAGA &+6<6&<:<<9<1112<<;)9227>>>>>>>>>>>>>;<>>>>><<>>>;<+<>=>>+==>>==<==<=8=><:;8/;7<<<<<<<<;<:<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:240:603:890 83 chr1 740 99 36M = 590 -186 GCTCCCAAGAGGGAAAGCTTTCAACGCTTCTAGCCA ;+&+//&<<<<<<<<<<9<<<8<<<<9<<<<<<<<< MF:i:18 Aq:i:66 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-B7_591:7:129:956:115 163 chr1 740 99 36M = 927 223 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;877- MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:168:528:288 83 chr1 740 99 35M = 570 -205 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCC 8<%<31;<<;<;<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:8:275:851:240 147 chr1 743 99 35M = 561 -217 CCCCAGAGGGAAAGCTTTCAACGTTTCTAGCCATT 66614/&3616630666&66666&66666868666 MF:i:18 Aq:i:31 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS188_7:6:205:873:464 147 chr1 743 99 35M = 552 -226 CCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATT <-((+:+;289<--;<;-;<:;;<<<;;<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:6:37:610:260 163 chr1 745 99 35M = 913 203 
CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC <<<;<;<<7<<<<<<<<<<<<<<;6<963;;;3;1 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:7:93:945:176 147 chr1 745 99 35M = 582 -198 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC 6;;;8<<3<<8.<;6)<<<<<9<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:6:61:628:681 83 chr1 746 99 36M = 586 -196 CAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTT 95<<<<<<<<;<<<<;<<<:<<;;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:7:288:552:440 83 chr1 747 87 35M = 560 -222 AGAGGGAACGCTTTCAACTCTTCTAGCCATTTCTT 9<<%'%<<.2<<<<<<<<5:<<<<<<<<<<<<<<< MF:i:18 Aq:i:26 NM:i:2 UQ:i:33 H0:i:0 H1:i:0 ++-EAS56_53:2:170:265:818 163 chr1 748 10 35M = 920 207 GAGGGGAAGCTTTCAACGCTTCTAGCACTTTCTTT <<<<<(5/959<8.<9<8<<<2<&59&&:22:8+( MF:i:18 Aq:i:10 NM:i:3 UQ:i:17 H0:i:0 H1:i:0 ++-B7_595:2:251:121:479 83 chr1 750 99 35M = 572 -213 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<6'..663;&<<;<<9<<<9<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:67:1797:1931 147 chr1 750 99 35M = 562 -223 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:226:302:758 83 chr1 751 99 35M = 556 -230 GGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGG ;<<<<9;<<<<<<<<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:2:163:618:570 83 chr1 751 99 35M = 571 -215 GGAAAGCTGTCAACGCTTCTAGCCATTTCTTTTGG <9774<88&:8<:8<8:8<8<<<<<;88<88<<<< MF:i:18 Aq:i:41 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS1_97:3:73:292:429 99 chr1 752 99 35M = 920 203 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTTGC <<<<<<<<<<7<<;<<<<<<<2<<<5<<<<<:%)< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS1_108:3:82:356:253 99 chr1 752 99 35M = 927 210 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC ===================<========;===39= MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:6:62:386:959 147 chr1 752 99 35M = 594 -193 AAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC %;71131((<<6<92(+<1<<;<-3<8<<;<;;<< MF:i:18 Aq:i:57 NM:i:1 UQ:i:4 H0:i:1 
H1:i:0 ++-EAS51_62:3:263:74:407 83 chr1 754 99 35M = 574 -215 AAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCAT ;;88<::+;<)<5<<:<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_597:3:67:620:344 99 chr1 755 99 35M = 905 185 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT <<<<2<:2<<<<<<7<<<<:<<*<<<<<<***3<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:6:148:776:486 83 chr1 755 99 35M = 578 -212 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT ;:<<<;<<;<<<<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:3:150:933:810 83 chr1 755 99 35M = 593 -197 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT :89===:=:=;;==;==================== MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:4:102:467:897 99 chr1 756 97 35M = 940 219 GCTTTCAACGCTTCTAGCCATTTCTTTTGTCTTTT <<<<9<<<<9<2<<<&,/=====>=>=>>>=>>==>=>>>>>> MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:3:297:637:86 83 chr1 869 99 35M = 704 -200 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG <:75<;<;;<<<<<<;;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS54_65:3:290:558:349 147 chr1 869 99 35M = 719 -185 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG 2;2;;'5&;<<5<<;5/<<<<<7<<;+;<<+1<8< MF:i:18 Aq:i:59 NM:i:1 UQ:i:6 H0:i:1 H1:i:0 ++-EAS1_95:3:308:956:873 99 chr1 870 99 35M = 1068 233 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC <<<<<<<<<<<<<;<;<;1<<<<<.<9<;<<<<+; MF:i:18 Aq:i:31 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_78:7:147:64:416 147 chr1 870 99 35M = 701 -204 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC /;49;:6<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:4:30:812:345 163 chr1 871 99 36M = 1036 201 TCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTC <<<<<<<7<;<<7<;77;3<&0-;<5<;6<1'13<: MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:7:134:243:630 163 chr1 871 99 35M = 1052 216 TCATCTAGGGGAACAGGGAGGCGCACTAATGAGCT <<<:<<<<::1:818;;&::<>.; MF:i:18 Aq:i:35 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS54_81:2:31:98:804 147 chr1 982 99 35M = 805 -212 CTTTACTGTCATAACTATGAAGAGACTATTGCCAG 
====;========7===================== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:235:805:373 163 chr1 983 99 35M = 1146 198 TTTACTGTCATAACTATGAAGAGACTATTTCCAGA <<<<<<<<<<<<<<<<<<<<;<;<<<<<<;;<99; MF:i:18 Aq:i:74 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS114_28:5:11:868:62 99 chr1 983 99 36M = 1154 207 TTTACTGTCATAACTATGAAGAGACTATTGCCAGAT <<<<<<<<<<<<<<<<<<<<:<<<;<<<<(7:7039 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:2:280:512:316 163 chr1 984 99 35M = 1159 210 TTACTGTCATAACTATGAAGAGACTATTGCCAGCT ==<========6==4==6;;==:===;=2/:+8%6 MF:i:18 Aq:i:68 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS1_93:5:292:122:666 99 chr1 985 99 35M = 1159 209 TACTGTCATAACTATGAAGAGACTATTGTCAGATG <<<<<<6<<<<<<<<8;<<<<<<<<<<3&9+;;(; MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS56_53:1:23:403:981 99 chr1 985 99 35M = 1151 201 TACTGTCATAACTATGAAGAGACTATTGCCAGATG <8<<<;<<<<<<;<<<<<<8;<<<9<9,3;,6(91 MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:7:33:1566:588 99 chr1 985 76 35M = 1166 216 TACTGTCATAACTATGAAGAGCCTATTGCCAGATG <;.;;;;6;;;;6;;29;;;<+9;;;.3;;73797 MF:i:18 Aq:i:37 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 ++-EAS139_11:7:92:367:1495 83 chr1 987 99 35M = 820 -202 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA <8<88<<<<7<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS220_1:8:38:1576:1923 83 chr1 987 99 35M = 822 -200 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA 8;<98<<<<<<<;<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:190:481:295 163 chr1 990 99 35M = 1161 206 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<<<<<<<<<<<<<<<<<<<<<9<<<<<9<7<2:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_32:7:168:117:441 99 chr1 990 99 35M = 1151 196 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<3<<<<<<<<<<<<<<<<<<<+<<17;<;:<995 MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:239:796:221 163 chr1 992 99 35M = 1160 203 ATAACTATGAAGAGACTATTGCCAGCTGACCCCCC <<<7<<7<<7<<7<;<<<<<,;;,+'<+/+99%:' MF:i:18 Aq:i:37 NM:i:4 UQ:i:26 H0:i:0 H1:i:1 
++-EAS220_1:4:69:88:1154 147 chr1 992 99 35M = 805 -222 ATAACTATGAAGAGACTATTGCCAGATGAACCACA <<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:8:34:956:1309 99 chr1 994 99 35M = 1168 209 AACTATGAAGAGACTATTGCCAGATGAACCACACA <<<<<<7<<<<<<<<<<<6<<<<<<<<<<<:<8<8 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:5:229:717:121 99 chr1 995 99 35M = 1150 190 ACTATGAAGAGACTATTGCCAGATGAACCACACAC =================<)=<4<0=.<<<71;41& MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS219_1:1:67:191:668 99 chr1 995 99 35M = 1134 174 ACTATGAAGAGACTATTGCCAGATGAACCACACCT <<<<<<<<<<<<<<<<<6<<;<;<;<<<<<<6;%2 MF:i:18 Aq:i:74 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS51_64:3:309:303:278 163 chr1 996 99 35M = 1178 217 CTATGAAGAGACTATTGCCAGATGAACCACACATT <<<<<<<<<<<<<<<<+<<+<<7<<<<<5<<<;;; MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:60:1020:1259 99 chr1 996 99 35M = 1157 196 CTATGAAGAGACTATTGCCAGATGAACCACACATT <;<<<<;<<<<<<<<<;<<<<<<<8<<<<<:<:<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:89:1487:520 83 chr1 997 99 35M = 805 -227 TATGAAGAGACTATTGCCAGATGAACCACACATTA 4;;/<<<<<:<;<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:4:15:805:420 163 chr1 998 35 35M = 1164 201 ATGAAGAGACTATTCACATGTGAACCACACATTTA ;73;;;;67.;1<<+*.;*&<4947<&474&*9*( MF:i:130 Aq:i:35 NM:i:4 UQ:i:33 H0:i:0 H1:i:0 ++-EAS56_57:3:119:761:239 147 chr1 999 99 35M = 813 -221 TGAAGAGACTATTGCCAGATGAACCACACATTAAT ;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:142:457:584 99 chr1 999 99 35M = 1160 196 TGAAGAGACTATTTCCAGATGAACCACACATTAAT <<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:7:190:95:706 147 chr1 1078 99 35M = 920 -193 TTGTGTCCATGTACACACGCTGTCCTATGTACTTA 9;97437;<;;<<;<;<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:1:101:825:28 83 chr1 1079 99 35M = 879 -235 TGTGTCCATGTACACACGCTGTCCTATGTACTTAT 
0;0'0;<<<<<<8<;<<<<;;3<<;;<<<8<<<<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:188:460:1000 99 chr1 1080 99 35M = 1251 206 GTGTCCATGTACACACGCTGTCCTATGTACTTATC <<<<<<<<<<<<<<<<7<<;:4;44<;;:8;;9;; MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:3:268:523:511 99 chr1 1081 99 35M = 1241 195 TGTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<;<<<<6<:9<<3<44 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:6:54:263:585 99 chr1 1081 99 36M = 1254 209 TGTCCATGTACACACGCTGTCCTATGTACTTATCAT <<<<<<<<<<<<<<<<<<<:;<<;<:;::<<;;:;4 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:7:174:987:334 83 chr1 1082 99 35M = 908 -209 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ,;<;;<<<&<<<1<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:6:224:932:942 99 chr1 1082 99 34M = 1250 203 GTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<<<;<<<<7<<(;3, MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:12:1296:358 99 chr1 1082 96 35M = 1252 205 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ;;;6;7;7;;;;;7;9;;-*1;9;699/99/7477 MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:2:306:119:56 147 chr1 1083 99 35M = 919 -199 TCCATGTACACACGCTGTCCTATGTACTTATCATG ;;;;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:4:66:179:118 163 chr1 1084 99 35M = 1262 213 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<<<:<<;<<6<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:2:110:584:649 99 chr1 1084 99 35M = 1266 217 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<;<<<<<::<38 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:4:28:315:310 163 chr1 1085 99 35M = 1242 192 CATGTACACACGCTGTCCTATGTACTTATCATGAC <<<<<<<<<<<<<<<<<<<<<:<+.<<.<+7<*17 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:242:4:593 147 chr1 1086 99 35M = 905 -216 ATATACACACGCTGTCCTATGTACTTATCATGACT 1.%55877+8+88808887+7;7;18:8;;;.&;8 MF:i:18 Aq:i:53 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 
++-EAS1_93:1:131:946:353 163 chr1 1087 99 35M = 1249 197 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<;<<<<;;<<<<<<<;<:52;<2 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:4:4:1732:88 99 chr1 1087 99 35M = 1265 213 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<<<<<2<8;8<;<8;<2;2:<:< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:4:58:703:72 83 chr1 1088 99 35M = 905 -218 GTACACACGCTGTCCTATGTACTTATCATGACTCT 5&<<7;+95;7'6<<<<<.<<<<<;<<9<7<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:5:113:694:725 163 chr1 1088 99 35M = 1266 213 GTACACACGCTGTCCTATGTACTTATCATGACTCT <<<<<<<<<<<<9<<<<<:<<<<<<<<<<:;;<;; MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:5:278:848:765 147 chr1 1088 99 35M = 920 -203 GTACACACGCTGTCCTATGTACTTATCATGACTCT 7;;<;5<55<<;;<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:234:787:12 163 chr1 1092 97 35M = 1257 200 ACACGCTGGCCTATGTACTTATAATGACTCTATCC <;<<<9<<&+9;3;<993;<9<+94;9&41;08%9 MF:i:18 Aq:i:24 NM:i:2 UQ:i:15 H0:i:0 H1:i:0 ++-EAS218_1:4:15:856:340 147 chr1 1093 99 35M = 936 -192 CACGCTGTCCTATGTACTTATCATGACTCTATCCC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:2:258:266:101 163 chr1 1094 99 35M = 1285 226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA <<<<<<<<<<<<<<<<<<5<;,<-2<<<<;68<<6 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:177:552:234 147 chr1 1094 99 35M = 903 -226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA ::;:=;=99=====;;====;==========<=== MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:1:134:379:893 147 chr1 1095 99 35M = 927 -203 CGCTGTCCTATGTACTTATCATGACTCTATCCCAA 7137::;<<<<<<<;<<<<<<<<<<;<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:8:256:404:584 147 chr1 1096 99 35M = 928 -203 ACTGTCCTATGTACTTATCATGACTCTATCCCAAA &&326+23<3<<<+:<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:3:57:735:151 99 chr1 1121 94 35M = 1314 228 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC 
<<<<<<<<8<<8<:<<*<:<<<4<<<;,<<<<:<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:8:142:858:903 147 chr1 1121 99 35M = 943 -213 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<;<<<<9<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:7:247:522:670 83 chr1 1121 99 35M = 960 -196 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC ;;;9;:<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:3:75:732:442 99 chr1 1121 99 40M = 1293 212 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<;<<<<<9<<<;<<;<<<5<<;8<<<<<<<<;:9%% MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:7:183:645:699 99 chr1 1122 86 35M = 1281 194 TATCCCAAATTCCCAATTACGTCCTATCTTCTTCT <<9<9<<<<<<<<<;<<;<<*175;173<;;;<-/ MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:6:175:437:950 163 chr1 1126 99 35M = 1298 207 CCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:59 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:3:93:1002:845 83 chr1 1129 99 35M = 954 -210 AATTCCCAATTACGTCCTATCTTCTTCTTAGGGAA <<::;;;<<<<<<<<<<<<<<<<;<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:6:50:542:881 163 chr1 1132 99 35M = 1324 227 TCCCAATTACGTCCTATCTTCTTCTTAGGTAAGAA <<<<<4<09<<9<<2<<<<<<<<<<<2/.&2<%<7 MF:i:18 Aq:i:63 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_99:3:118:851:285 83 chr1 1133 99 35M = 953 -215 CCCAATTACGTCCTATCTTCTTCTTAGGGAAGAAC 3+7<<<;<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:67:191:668 147 chr1 1134 99 35M = 995 -174 CCAATTACGTCCTATCTTCTTCTTAGGGAAGAACA <<<<<7<<7<<<<<<<;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:166:203:416 83 chr1 1136 99 35M = 963 -208 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC <<<<<<<<::<<<<<<<<<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:2:15:1497:1530 99 chr1 1136 99 35M = 1314 213 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC 0<;;;9;;86<;;;<<&<<.<<;)3;7;654-471 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS56_65:8:206:563:262 83 chr1 1137 99 35M = 971 -201 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<7 MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:4:40:352:151 99 chr1 1137 99 35M = 1327 225 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<<<<<<<<<<<<;<<9<<<<:<<<<;<99<3< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:7:67:302:762 99 chr1 1138 99 36M = 1313 211 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<;;65;<-<;<:8<<<3 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:5:84:927:843 147 chr1 1138 99 35M = 938 -235 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTT 588;<:<<<<<<<6<<<<;<<<:/<<3<:;<*<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:5:147:479:41 163 chr1 1139 99 35M = 1322 218 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<::6<<;<<<;;9;;6 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:3:329:177:267 83 chr1 1139 99 35M = 962 -212 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:7:72:916:763 163 chr1 1142 99 35M = 1340 233 GTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGT ==7>==9>=7=>=>>=>> MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_65:4:91:267:655 147 chr1 1365 99 35M = 1204 -196 TGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGT ;,:;5:<6:<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:2:91:856:504 99 chr1 1366 99 35M = 1520 189 GTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTT <<<<<<<<<<<<<<7<<<<<<<7<<<&;<<<&&<& MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_108:2:170:326:433 99 chr1 1367 99 35M = 1535 203 TTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGG =====<=9===:=<:==2=======2:===9==/5 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:6:132:717:233 99 chr1 1368 99 35M = 1529 196 TGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGA <<<<<<<<<<<<;<<<<<<<<<<<7<<<<&-<4<1 MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:14:420:712 99 chr1 1368 99 40M = 1525 197 TGTTGGTTTTCTGTTTCTTTGTTTGATTTTTTTGAAGACA 
<<<<<<<<<<<<;<<<<<<<;<<<-;<<<&,<&*8111:6 MF:i:18 Aq:i:66 NM:i:3 UQ:i:21 H0:i:1 H1:i:0 ++-EAS114_39:4:43:1047:1626 163 chr1 1369 99 35M = 1523 189 GTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAA <<<<<<<<<<<<<<<<<<<:<<<:<<<<:+;-4:( MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:2:20:413:1334 147 chr1 1370 99 35M = 1215 -190 TTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAG 88878777;:;:1:;9;;;6;;;6;9;;;;;296; MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:154:669:853 83 chr1 1371 99 35M = 1193 -213 TGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGA <::<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:117:857:942 99 chr1 1372 99 35M = 1527 190 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC <<<<<<<<<<<<<<<<<<<<<<<<<:6<;;7;9<; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:145:144:796 147 chr1 1372 99 35M = 1181 -226 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC ;<<<;<<<<<<<<<;<<<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:6:88:162:587 147 chr1 1372 99 35M = 1189 -218 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC 386;;388-<8;<;68<<;;<;<6<<<8<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:73:108:1621 99 chr1 1373 99 35M = 1532 194 GTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACA <<<<<<<<71<<<<<<<<<+<<<<70:0<9<<61< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:127:153:861 147 chr1 1374 99 35M = 1202 -207 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT :;:6;9<<1;<<95<<<9<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:152:765:744 163 chr1 1374 99 35M = 1534 195 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT <<<<<<<<<<<<<<<<<<:<<<<<<<<&<7293<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:313:827:992 147 chr1 1379 99 35M = 1197 -217 TGTTTCTTTGTTTGATTTGGTGGAAGACATAATCC '187:1'<75<.*<<:5<..<<*<<917<<7<<17 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:7:268:263 121 chr1 1381 22 35M = 1381 0 TTGCGTTATTTGAGTTGGTGGAAGACATAATCCCA ',)*&2<$7+<<<'<-<7<<<<<<<7<<7><>;>+>>/;>>=>=>=:>><>=<<==;)<=8; MF:i:18 Aq:i:73 
NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:85:923:726 147 chr2 199 99 35M = 43 -191 GTAAAGTAACTGAACCTATGAGTCACAGGTATTCC <:<<<%3<<1<<86<<-<<<<<<<<<<<<6<<1<< MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:5:285:241:560 83 chr2 200 99 35M = 37 -198 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT :<<<<;<<,<<<<5<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:6:41:461:436 163 chr2 200 74 35M = 389 224 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT <<<<<<<<<<<<<<<<<<<:<<<<<<<<;<;;;:; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:6:25:949:33 99 chr2 201 99 35M = 383 217 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG =;===/8========*==&;6=&=&:=6&:=::67 MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:2:60:590:1760 99 chr2 201 99 35M = 376 210 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG <:<<<<<2<<<<:<::<<<::<<<<<6<<<<<<<6 MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:6:86:693:234 163 chr2 202 82 35M = 388 221 AAGTAACTGAACCTATGAGTCACAGGTATTCCTGA ;;;;;;;;;;;;;;;;9;;;;;;;;99;;&70777 MF:i:18 Aq:i:18 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:4:223:776 163 chr2 203 93 35M = 387 219 AGTAACTGAACCTATGAGTCACAGGTATTCCTGAG <<<<<<<<<<<<<<<<<<<<<<<<<9<<<:;<;2< MF:i:18 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:3:88:84:1558 99 chr2 203 95 35M = 394 226 AGTAACTGAACCTATGAGTCACAGGTATTCCTGTG <<;<<<<<<<<<<<<;;<<<<<<<::<<<<<<7&< MF:i:18 Aq:i:22 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-B7_597:2:168:829:88 163 chr2 205 99 35M = 369 199 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGA <<<<<<<<<<<<<<<<<<<<<<<6<<<<<<9;4;2 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:1:168:389:889 147 chr2 205 99 36M = 37 -204 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA ;<<;;56;==================8========8 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:5:81:685:141 99 chr2 207 85 34M = 382 210 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAA <<<<<<<<<<<<<<<<<<<<<;;<<;<<<',7,7 MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:4:26:1312:1400 99 chr2 207 99 40M = 385 218 
ACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<;<<<:<<:<;<:<<<;:;<<<<<<:<8<1;;:::88 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:4:127:725:381 83 chr2 209 99 34M = 39 -204 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAA +<<.<<;<;<<<3;<;<<<<<<6<8;<<<<<<<1 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:19:736:559 99 chr2 209 99 35M = 370 196 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:3:117:284:589 83 chr2 210 99 35M = 43 -202 GAACCTATGAGTCACAGGTATTCCTGAGGAAAAAG ==8==;==================;========== MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:5:120:596:847 163 chr2 211 83 35M = 410 234 AACCTATGAGTCACAGGTATTCCTGAGGAAAAAGA <<<<<<<<<<<<<;<<<9<<<<<<<;:<62;58;2 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-B7_610:5:51:904:391 163 chr2 212 97 35M = 401 224 ACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<<<<<<<<<<<<<<;<<<<<;:;<2<6;;;;; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:8:96:1314:1448 163 chr2 213 93 35M = 388 210 CCTATGAGTCACAGGTATTCCTGAGGAAAAATAAA <<<<<<<<<<<<<<<<<<<<<<<<5<4<<<<-<<< MF:i:18 Aq:i:18 NM:i:1 UQ:i:12 H0:i:1 H1:i:0 ++-EAS139_19:3:73:1158:535 163 chr2 213 99 40M = 377 204 CCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTG <<<<<<<<<<<<<<<<<<<<<<8<;;<<<<<9<<9::8:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:223:583:968 147 chr2 215 88 36M = 47 -204 TATGAGGCACAGGTATTCCTGAGGAAAAAGAAAAAG 1<';<<&%-:<<<<<:66%<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_97:3:160:173:889 163 chr2 215 99 35M = 379 199 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA <<<<<<<<<<<<<<<<<<<<;0<7<<;<<<;7<09 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:1:28:350:895 83 chr2 215 95 35M = 48 -202 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA :<;<<<:;<-<<<<<4;77<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:45:707:147 163 chr2 216 99 35M = 424 243 ATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<&<<<<:<<9<<<9<<<<75;;;< MF:i:18 Aq:i:51 NM:i:0 
UQ:i:0 H0:i:1 H1:i:0 ++-EAS220_1:8:18:1757:95 99 chr2 216 45 35M = 374 193 ATGAGTCGCAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<<<<<<<<1<:<<<<<<:<<<;:< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS51_66:6:310:747:415 163 chr2 217 99 35M = 387 205 TGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGT <<<<<<<<<<<<<<<<<<:<<<<<;<<<<<;<;<; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:114:938:216 147 chr2 218 99 36M = 63 -191 GAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGA <<<<7<6<<<<<<<6<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:1:179:629:513 163 chr2 220 99 35M = 409 224 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG <<<<<<<<<<<<<<<;<;<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:4:88:55:1187 99 chr2 220 66 35M = 391 206 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG ;;<;;;<<99<<;;<;;;;;:;49;:;;;;87898 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:119:38:945 99 chr2 221 99 35M = 428 242 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA <<<<<<<<<<<<<<<8<<<<<8<<<8<;<<7<:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:6:67:800:450 147 chr2 221 99 35M = 41 -215 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA 9-<9<;<<<<9;5<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:5:102:915:87 147 chr2 222 99 35M = 65 -192 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA ;<8<;;<<<<7;<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:1:113:367:659 163 chr2 222 72 35M = 390 203 CACAGGTATTCCTGAGGAAAAAGAAAAAGCGAGAA =9====8==========:=:=====9=:=&====5 MF:i:18 Aq:i:0 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS218_1:2:26:211:481 147 chr2 222 99 35M = 43 -214 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<:<<<<<<9:5<<<<<<<<<<<<<<:<:<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:90:1906:1528 83 chr2 222 99 35M = 41 -216 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<<<<<<<<<3:<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:13:100:876 163 chr2 223 73 36M = 397 210 ACAGGGATTCCTGAGGAAAAAGAAAAAGTGAGAAGT 
<8<<<*<2<7<<<6<<<<<<6<<8<<<<5<<<<4<9 MF:i:18 Aq:i:30 NM:i:1 UQ:i:9 H0:i:0 H1:i:1 ++-EAS56_63:5:117:570:971 163 chr2 223 99 35M = 413 225 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <<<<<<<<<<<<<;;;<<<<6<7;9;<:;<;<;;< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:8:50:1203:1094 83 chr2 223 99 35M = 46 -212 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <7<<<<<5:+63<<<<<<<<<<<<<<<<2<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:6:107:395:312 83 chr2 224 99 35M = 44 -215 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT ;<;;<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:29:833:612 83 chr2 224 99 35M = 58 -201 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <<;<<<;<::<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:158:943:467 83 chr2 225 99 35M = 57 -203 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT <:<<;;<:5<<<<<<<<<<<<<<<<<<;<<<;<<< MF:i:18 Aq:i:46 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:201:768:529 163 chr2 225 99 35M = 396 206 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT ==========================1=======; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:11:994:584 99 chr2 226 97 35M = 417 226 GGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTT <<<<<<<<<<<7<<<<<<<<<<<<<6<<<<<<3<6 MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:206:873:186 83 chr2 227 99 35M = 66 -196 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;<<;--7<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:4:38:28:122 83 chr2 227 99 35M = 46 -216 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;9;9;-1<<<<<<<<<<<<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:5:66:372:343 83 chr2 228 99 35M = 40 -223 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG ;<1;89<<<<<;<9<<<<9<<<;8<9<;<<<<<;8 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:3:277:144:848 83 chr2 228 99 35M = 64 -199 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG <<<)63<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:6:21:1601:1666 83 chr2 228 99 
40M = 56 -212 TATTACTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAA -;;3&1<<<<<<<<<<<<1<<<<<<<<<<<7<<<<<<<<=<<<<<<<<<<<<<< MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:23:268:529 153 chr2 329 71 35M * 0 0 TGAAAGAGGCTCAAAGAATGCCAGGAAGATACATT 7;<<<<<<57;-<<<<<<:<77<<<<<<<;<;<<< MF:i:32 Aq:i:28 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:2:315:219:7 153 chr2 330 69 35M * 0 0 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG 7==::<2=8<<<=====>888<=2=>==>,>,>>8 MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:4:63:5:870 83 chr2 330 75 35M = 148 -217 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG :<;<;<<<4:;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:5:243:557:560 163 chr2 331 75 36M = 499 204 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<9<<<<<<<<<<<<<<<<<<;<<89<<9<; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:2:270:430:269 163 chr2 331 99 36M = 519 224 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;7;: MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:6:284:442:747 89 chr2 331 75 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <;<<<<<:<;<<<<<<<<;<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:71:31:1973 89 chr2 331 76 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <<<<<7<:<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:2:30:466:652 147 chr2 332 98 35M = 163 -204 AAGAGGCTAAAAGAATGCCAGGAAGATACATTGCA <<<<<;3;&<<<<<<<============= MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:4 H1:i:13 ++-EAS114_39:3:88:84:1558 147 chr2 394 95 35M = 203 -226 ATCAGACTATCTAAAGTCAACATGAAGGAAAAAAA ;;<<;<<;<<5<<<<<<;<<:<<<;<<<<<<;<<< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:2 H1:i:3 ++-EAS56_59:2:201:768:529 83 chr2 396 99 35M = 225 -206 CAGACTATCTAAAGTCAACATGAAGGAAAAAAATT 3<:<9<<;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:13:100:876 83 chr2 397 73 36M = 223 -210 AGAATATATAAAGTCAACATGAAGGAAAAAAATTCT ;9<$<<<$<<<<<<<<75<<<<<<<9<9<<<<<<<< MF:i:18 Aq:i:30 NM:i:2 
UQ:i:6 H0:i:1 H1:i:1 ++-EAS139_11:4:26:137:1382 99 chr2 397 99 35M = 579 217 AGACTATCTAAAGTCAACATGAAGGAAAAAAATTC <<<<<<7<<<77<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:1:93:490:901 83 chr2 445 99 35M = 280 -200 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<;<<<;<<<;<<;<<;<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:96:489:453 99 chr2 445 99 35M = 625 215 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;;: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:6:46:13:880 147 chr2 445 99 35M = 290 -190 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA =&====8==========0================= MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:167:905:852 163 chr2 445 99 36M = 647 238 AGAAAAGCATACAGTCATCTATAAAGAAAATCCCAT <<<7<<<<<<<<<<<<<<:<:<<:::&.<:<66:3< MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS219_FC30151:3:13:674:1717 163 chr2 445 99 35M = 623 213 AGAAAAGCATGCAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<%<<<<<<<<<<<<<<<<<<<;:;;; MF:i:18 Aq:i:45 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 ++-EAS51_62:7:196:511:896 83 chr2 446 99 35M = 283 -198 GAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 8<<<<<;<<<:<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:52 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:1:154:118:488 163 chr2 447 99 35M = 624 212 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<<<<<<<<<<<<<<:7<<<<7<:;;:: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:44:153:969 83 chr2 447 95 35M = 245 -237 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<5<:7<72<51<7<*79<<<<<5<<<<<<<<<2< MF:i:18 Aq:i:36 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:4:215:246:640 99 chr2 447 99 36M = 624 213 AAAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<9<;<<<<<<<<<<9;<<<<<<3;<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:7:94:1655:1921 147 chr2 447 85 35M = 258 -224 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<;:===<==;<==<;================; MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:6:60:1037:1146 147 chr2 447 99 35M = 250 -232 
AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:1:23:536:229 99 chr2 448 99 35M = 614 201 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<:<8<:<<;<<<<<<7<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:130:865:838 163 chr2 448 99 35M = 649 236 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;:<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:239:1001:406 99 chr2 450 99 35M = 634 219 AGCATACAGTCATCTATAAAGGAAATCCCATCAGA <<<<<<7<<<<<<<<8<;<<<7<<<<36<<3<:33 MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:3:147:423:584 147 chr2 451 99 35M = 277 -209 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA 27<;<3<<<+<<;<<<;;-4<<<<<;<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:1:187:715:521 83 chr2 451 99 35M = 291 -195 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <7<:<9<<<<<<<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:3:172:196:746 99 chr2 451 99 35M = 620 204 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <<<<<<<<9<<<<9<<<<<<<<<;<<<<6<<<<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:3:267:821:860 83 chr2 451 99 34M = 296 -189 GCATACAGTCATCTATAAAGGAAATCCCATCAGA $&<<<.<:;6<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 ++-EAS56_61:7:7:682:201 83 chr2 452 99 35M = 288 -199 CATACAGTCATCTATAAAGGAAATCCCATCAGAAT 0:8;5<8<1:78<<<<<<<<<<<<:8<<2<<<<:< MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:3:82:13:897 163 chr2 453 99 35M = 606 188 ATACAGTCATCTATAAAGGAAATCCCAGCAGAATA <<<<;<<<<<<;<;<;5<51;<1<<<<%<<<<,58 MF:i:18 Aq:i:41 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 ++-EAS56_53:6:180:695:621 99 chr2 453 99 35M = 637 219 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;::<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:158:909:321 83 chr2 453 99 35M = 271 -217 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS114_26:2:237:497:165 99 chr2 454 99 35M = 619 200 TACAGTCATCTATAAAGGAAATCCCATCAGAATAA 8===<8===========37=<===7=;7=8===== MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:2:152:355:962 83 chr2 456 99 35M = 269 -222 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<.9.<;+;<;<<<<<<<<<<::<<:<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-EAS192_3:4:255:549:422 83 chr2 456 99 35M = 295 -196 AAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<;;+<;4;<<<<<<<<<<<;<;<<;<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:2 ++-EAS220_1:4:100:20:1199 163 chr2 456 99 35M = 614 193 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA 7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<4<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:71:408:741 163 chr2 457 99 35M = 637 215 AGTCATCTATAAAGGAAATCCCATCAGAATAACAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:5:285:395:450 147 chr2 458 99 35M = 269 -224 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT 8)3<8+;<)<<<<<<<<97:7<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:4:854:140 99 chr2 458 72 35M = 638 215 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<6<<<:<6<<<:36:<<<<3<<8:.6<38::4< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:6:227:657:95 147 chr2 458 99 35M = 280 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT ;3;<);<<<<<<<<<<<<18<<<<<<<<<<<<<<< MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:57:324:546 83 chr2 458 99 36M = 281 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAATG ;;5<;,<<<;;<<<<<<<97<<<<<<<<<<9<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:4:26:274:1078 83 chr2 458 99 40M = 282 -216 GTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCT 9:*:64<<;<<<<<<<<<;8;<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:107:636:642 163 chr2 458 99 35M = 630 207 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:7:226:869:36 147 chr2 461 99 35M = 273 -223 ATATATAAAGGAAATCCCATCAGAATAACAATGGG 
<0/)&<=,==4>4=>>= MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:8:82:1540:77 163 chr2 619 99 35M = 786 202 GAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGC <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:3:172:196:746 147 chr2 620 99 35M = 451 -204 AAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCT <<<;><<+<<<<:<<<<2<;<<<;<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:7:97:892:419 163 chr2 621 99 35M = 800 214 AATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:4:312:915:751 147 chr2 621 99 35M = 461 -195 AATAAAGTCAAGTCTTTCCTGACAAGCAAAAGCTA <:-<<<99:::);:7<4;8<<<<<<<;<2<+8<;< MF:i:18 Aq:i:41 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 ++-EAS1_93:4:325:352:67 163 chr2 622 99 35M = 794 207 ATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAA ==================<========<=<;-=== MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:4:83:731:540 99 chr2 623 99 35M = 804 216 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<;<<<<<<<<<<<<<:<7<*;&;<;;9 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:5:74:329:459 163 chr2 623 99 35M = 795 207 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<;<<;<<;9;599 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:13:674:1717 83 chr2 623 99 35M = 445 -213 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:1:141:415:738 69 chr2 624 0 * = 624 0 TTACCTAGTTGCTCTGTAGTCTCAATTAATTGTTT <<<<<<<<<<<<<<<<<<<<<<<;<<<<<;<8<<< MF:i:192 ++-EAS1_105:1:141:415:738 137 chr2 624 76 35M = 624 0 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<<<<<<<<<<<<<<<<<<<<:<<;<<;<<<<6: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:1:154:118:488 83 chr2 624 99 35M = 447 -212 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<;58<<95:<<;<;<<<;<<<;;<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:5:198:929:684 83 chr2 624 99 35M = 471 -188 
AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<;<<<<<:<<<<<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:4:215:246:640 147 chr2 624 99 36M = 447 -213 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<<,<<<96<<:<:<9<6<97<<<<<9<<<<9<<9< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:234:167:381 83 chr2 625 99 35M = 443 -217 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT <<;<;<<<<;<<<<7<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:96:489:453 147 chr2 625 99 35M = 445 -215 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<;;;<<<<5:<<:<<<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:3:79:879:15 99 chr2 626 99 35M = 790 199 AGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<2<;<<1< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:2:59:286:290 147 chr2 628 99 35M = 467 -196 TCAAGTCTTTCCTGACAAGCAAATGCTAAGATAAT 77<<<<7<<<97<<,7<<<;<<<;<9<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:5:263:511:936 99 chr2 629 99 35M = 801 207 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<<<<<<<<<<;<<<<<;<<:<:<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:3:181:582:435 147 chr2 629 99 35M = 471 -193 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<;<<<<<;<<4<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:107:636:642 83 chr2 630 99 35M = 458 -207 AAGTCTTTCCTGACAAGCAAATGCTAAGATAATTC <<<<<<<;<<<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:4:12:273:89 83 chr2 631 99 35M = 477 -189 AGTCTTTCCTGACAAGCAAATGCTAAGATAATTCA <:737<288<<<7<<<<<<<<<:9<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:239:1001:406 147 chr2 634 99 35M = 450 -219 CTTTCCTGACAAGCAAATGCTAAGATAATTCATCA 0':.71;;:9==9=;====;=;============= MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_4:7:96:899:106 147 chr2 636 99 35M = 462 -209 TTCCTGACAAGCAAATGCTAAGATAATTCATCATC ;;;;<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 ++-EAS54_65:6:67:56:806 147 chr2 637 99 35M = 464 -208 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA 844:8;7<88;8<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:71:408:741 83 chr2 637 99 35M = 457 -215 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA ;7;<;<0<<<<<<<<:;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:6:180:695:621 147 chr2 637 99 35M = 453 -219 TACTGAAAAGCAAATGCTAAGATAATTCATCATCA ;&377<&<<;7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:2 UQ:i:10 H0:i:1 H1:i:0 ++-EAS114_30:6:49:656:507 147 chr2 637 99 35M = 468 -204 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA %44;;<:<<;<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:4:854:140 147 chr2 638 72 35M = 458 -215 CCTGACAAGCAAATGCTAAGATAATTCATCATCAC :9':<;<<<;<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:85:1521:58 99 chr2 639 99 40M = 813 214 CTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<9<<<<<<<<<<<<<7<<<<<<<<<<<<;;:7: MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:2:57:1064:925 137 chr2 640 76 35M * 0 0 TGACAAGCAAATGCTAAGATAATTCATCATCACTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:3:323:196:855 163 chr2 642 99 35M = 809 202 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<7<<<<<<:<<<<<<<<<<<<<<<<<;7: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:117:33:262 163 chr2 642 99 35M = 814 207 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<; MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:1:59:742:549 99 chr2 642 99 35M = 816 209 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:55:562:403 163 chr2 643 99 36M = 825 218 CAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<;<<;<<<<<<<;<;: MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:7:97:743:602 163 chr2 644 99 35M = 821 211 
AAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:167:905:852 83 chr2 647 99 36M = 445 -238 CAAATGCTAAGATAATTCATCATCACTAAACCAGTC +<<<9;7;<<+<<<<<39<;9<;9<<7<<<<<<<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:130:865:838 83 chr2 649 99 35M = 448 -236 AATGCTAAGATAATTCATCATCACTAAACCAGTCC ;<:84<<<4<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:60:1420:660 163 chr2 649 99 35M = 808 194 AATGCTAAGATAATTCATCATCACTAAACCAGTCC <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<8<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:3:180:89:582 99 chr2 650 99 36M = 809 195 ATGCTAAGATAATTCATCATCACTAAACCAGTCCTA <<<<<<<<<7<<<<<<<<<7<<<:<<<:<<::77:< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:1:86:871:319 147 chr2 651 71 35M = 494 -192 TGCTAAGATAATTCATCATCACTAAACCAGTCCTA 7;+1;<:<<<<<<<<;<<;<<9<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:236:841:20 83 chr2 652 99 35M = 467 -220 GCTAAGATAATTCATCATCACTAAACCAGTCCTAT 7;<<<;<<<<;;<<<<<<<<<<<<<<<<<<<<;<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:2:133:8:379 83 chr2 653 99 35M = 470 -218 ATAAGATAATTCATCATCACTAAACCAGTCCTATA &=========='==7==0=2====28===00==== MF:i:18 Aq:i:70 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_105:8:96:720:940 83 chr2 654 99 35M = 467 -222 TAAGATAATTCATCATCACTAAACCAGTCCTATAA *<<<<;<<<9<<;,<;0<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:5:71:994:576 99 chr2 655 99 35M = 805 185 AAGATAATTCATCATCACTAAACCAGTCCTATAAG <<<<<<<<<<<<<<<<<<<<<<<;<<5<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:4:164:79:134 147 chr2 656 99 35M = 488 -203 AGATAATTCATCATCACTAAACCAGTCCTATAAGA <;<;<<<;<<;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:6:78:1029:512 83 chr2 656 99 40M = 500 -196 AGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGC ;;;;;<;;<<<.<<6;<<;<;8<<<<::<<<<<<<<;<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 ++-EAS1_93:1:214:784:690 147 chr2 657 99 35M = 472 -220 GATAATTCATCATCACTAAACCAGTCCTATAAGAA -<7<<7<:<<2<<<<;<<<<<;<<<<3<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-EAS220_1:4:6:1178:1105 99 chr2 657 93 35M = 830 208 GATAATTCATCATCACTAAACCAGTCCTATAAGAA <<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:7:171:196:287 83 chr2 658 99 35M = 485 -208 ATAATTCATCATCACTAAACCAGTCCTATAAGAAA <;;;98;<;&<;;<<<<<<<;<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:1:220:801:282 99 chr2 660 99 36M = 837 213 AATTCATCATCACTAAACCAGTCCTATAAGAAATGC <<<<<<<<<<<<<<<<<<<<<;<+<;<<<<<::<<: MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-EAS221_1:2:73:955:728 163 chr2 660 44 35M = 823 198 AATTCATCATCACTAAACCAGTCCTATAAGAAATG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:14 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-EAS1_105:1:3:903:957 147 chr2 661 99 35M = 516 -180 ATTCATCATCACTAAACCAGTCCTATAAGAAATGC <%12<&<<<;<:<<<<<<<<<7<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-EAS56_65:2:224:579:433 83 chr2 662 99 35M = 485 -212 TTCATCATCACTAAACCAGTCCTATAAGAAATGCT '<08/8<+<>===> MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:2:146:374:692 99 chr2 690 99 35M = 874 219 AAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAA <<<<<<<<<<<<<<<=>>>==>>===>==> MF:i:130 Aq:i:74 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS1_108:6:159:493:275 99 chr2 760 72 35M = 939 214 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA =====3============================= MF:i:130 Aq:i:72 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS139_11:1:81:1019:558 163 chr2 760 77 35M = 926 201 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA <<<<<<<<<<<6<<<<<<<<<<<<<<<<<<<<<7< MF:i:130 Aq:i:77 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS51_62:7:162:195:761 163 chr2 767 30 18M4I13M = 922 190 TCACAGGTTTTATAAAACAATTAATTGAGACTACA <<<<<<<<<<<<<<<<<<<<<<<<<<;<:<<<<;; MF:i:130 Aq:i:30 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-B7_597:3:115:646:430 147 chr2 768 45 17M4I14M = 582 -217 CACAGGTTTTATAAAACAATTAATTGAGACTACAG 
5;5<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:45 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS114_30:6:243:209:110 163 chr2 768 48 17M4I14M = 920 187 CACAGGTTTTATAAAACAATTAATTGAGACTACAG <<<<<;<;<<<;<<<<<<<<<<<;<:;<<:;;+85 MF:i:130 Aq:i:48 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS1_108:2:266:994:429 147 chr2 769 76 16M4I15M = 612 -188 ACAGGTTTTATAAAACAATTAATTGAGACTACAGA <<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:85:1521:58 147 chr2 813 99 40M = 639 -214 AAATTAACATTACAACAGGAACAAAACCTCATATATCAAT :::86<<:<<8<<<<;<<8<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:4:164:719:947 99 chr2 813 99 35M = 1005 227 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<;<<<<<<<<<;<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:50:257:341 163 chr2 813 99 35M = 971 193 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<7<6<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:117:33:262 83 chr2 814 99 35M = 642 -207 AATTAACATTACAACAGGAACAAAACCTCATATAT <<;;<<;<:8<7<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:8:90:706:1276 163 chr2 814 99 35M = 980 201 AATTAACATTACAACAGGAACAAAACCTCATATAT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<:<:< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:116:966:193 163 chr2 815 99 35M = 967 187 ATTAACATTACAACAGGAACAAAACCTCATATATC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:1:59:742:549 147 chr2 816 99 35M = 642 -209 TTAACATTACAACAGGAACAAAACCTCATATATCA -<<<3<<<<6<<6<<<<<6<<<<6<<<<<<<<<<< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:7:74:213:877 99 chr2 816 99 35M = 996 215 TTAACATTACAACAGGAACAAAACCTCATATATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-B7_610:3:85:219:371 163 chr2 817 99 35M = 967 185 TAACATTACAACAGGAACAAAACCTCATATATCAA <<<<<<<<<<<<<<<<<<<<<:<<<<<<<<<<;<; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:176:653:957 
163 chr2 819 82 35M = 982 198 ACATTACAACAGGAACAAAACCTCATATATCAATA ????????????<==>=>=>=>>>==>>>=>>> MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:5:57:366:844 83 chr2 877 99 40M = 708 -209 AAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGAT ;;;7:8&555<,;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:3:236:475:254 163 chr2 880 99 35M = 1051 206 TTCCCCCACTTAAGAGATATAGATTGGCAGAACAG <<<<<<<<<<<<<<<<<<<<<<<<<:::<:;>=>>>>==>=>>>==>=>=:=====;=:=6:::6 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:2:10:686:1024 163 chr2 947 99 35M = 1103 191 ACAAGAAACTCATTAATAAAGACATGAGTTCAGGT <:<<<<:<<<<<<<<<<:<:<<<<<<<<<<<5<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:5:53:61:31 163 chr2 949 99 35M = 1122 208 AAGAAACTCATTAATAAAGACATGAGTTCAGATAA <<<7;<7<<<;7<;;<7<7<7<;5<73<<<;>588>9<>7:<0<9; MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-B7_589:2:30:644:942 99 chr2 1045 83 35M = 1229 219 TATATCAGATAAAGCACACTTTAAATCAACAACAG <<<<<<<<<<<<<<<<<<<<<<<9<<<<<<9;<9< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 ++-B7_591:2:123:924:645 83 chr2 1045 84 36M = 861 -220 TATATCAGATAAAGCACACTTTAAATCAACAACAGT ;<<<<*<<<<<<<<8<<<<<><<<<<><<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS51_62:4:308:614:911 99 chr2 1319 90 35M = 1493 209 TGCGCTTGTACTTCTAAATCTATAACAAAATTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<8< MF:i:18 Aq:i:43 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS54_65:3:155:541:234 83 chr2 1319 99 35M = 1151 -203 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 78;<7<<<<<<<<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:175:289:351 147 chr2 1319 99 35M = 1144 -210 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 9;;:+<<<<<;<<:<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:7:283:186:707 83 chr2 1321 99 36M = 1154 -203 CGCTTGTACTTCTAAATCTATAACAAAATTAAAATT 889;<7;<7<<7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_105:3:308:66:538 147 chr2 1321 99 35M = 1138 -218 
CGCTTGTACTTCTAAATCTATAACAAAATTAAAAT 996999;<9;<:<<<<<:<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_108:5:11:555:330 163 chr2 1321 99 35M = 1492 206 CGCTTGTACTTCTAAATCTATAAAAAAATTAAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<4<;< MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:7:84:411:336 73 chr2 1322 75 35M * 0 0 GCTTGTACTTCTAAATCTATAAAAAAATTAAAATT <<<;<<<;<<<<<<<<<<<<:<<;<<<<<<;8<;< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:5:52:1278:1478 163 chr2 1322 47 35M = 1513 226 GCTTGTACTTCTAAATCTATAACAAAATTAAAATT <<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<9<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS56_53:3:101:809:776 147 chr2 1326 99 35M = 1160 -201 GTACTTCTAAATCTATAAAAAAATTAAAATTTAAC <<<-<;7;<<<<:;<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:3:221:881:916 147 chr2 1327 96 35M = 1168 -194 TAATTCTAAATCTAGAACAAAATTAAAATTTAACA 44%-4(5<;9/,:<68:1<:8<:<<84;<<<<<;< MF:i:18 Aq:i:24 NM:i:3 UQ:i:41 H0:i:0 H1:i:0 ++-EAS1_105:1:28:745:352 147 chr2 1329 99 35M = 1159 -205 CTTCTAAATCTATAACAAAATTAAAATTTAACAAA 4;;*;<<<;;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS114_45:2:23:1754:796 99 chr2 1329 99 35M = 1488 194 CTTCTAAATCTATAAAAAAATTAAAATTTAACAAA ;<<;<;<;<;<;<<;;;;;<<<<;;<<<<<97999 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:2:96:419:327 147 chr2 1331 99 35M = 1149 -217 TCTAAATCTATAACAAAATTAAAATTTAACAAAAG ;1<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_97:4:274:287:423 163 chr2 1332 75 35M = 1515 218 CTAAATCTATAAAAAAATTAAAATTTAACAAAAGT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:7:35:392:2042 83 chr2 1332 99 35M = 1168 -199 ATAAATCTATAAAAAAATTAAAATTTAACAAAAGT +<<<<4<>>>>;>>&>->9>9;4>->>>>,4>9>,<1> MF:i:18 Aq:i:27 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS139_19:2:82:154:1333 99 chr2 1349 77 40M = 1511 202 TTAAAATTTAACAAAAGTAAATAAAACACACAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;<;;:;: 
MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:1 H1:i:0 ++-EAS188_7:1:290:286:763 99 chr2 1349 75 35M = 1515 201 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<7<<<<<<<<<<<<<<<8<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:4:3:248:1491 73 chr2 1349 99 35M * 0 0 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:8:< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:3:6:1064:1805 99 chr2 1350 99 35M = 1502 187 TAAAATTTAACAAAAGTAAATAAAACACATAGCTA <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:6:137:811:130 83 chr2 1351 99 35M = 1175 -211 AAAATTTAACAAAAGTAAATAAAACACATAGCTAA <<<<<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:1:155:809:543 83 chr2 1352 99 35M = 1156 -231 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<+0<<<9<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:3:41:468:459 99 chr2 1352 75 35M = 1513 196 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<;;7 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:4:31:622:216 73 chr2 1354 99 35M * 0 0 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<96<7 MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:8:105:854:975 163 chr2 1354 71 35M = 1523 202 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<7:<;;;;5 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:26:749:174 147 chr2 1357 78 35M = 1183 -209 TAACAAAAGTAAATAAAACACATAGCTAAAACTAA (<<)<<<<6<<<<<<<<<<&:<3<<<6<<<)<:<< MF:i:18 Aq:i:11 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:21:553:57 147 chr2 1358 99 35M = 1197 -196 AACAAAAGTAAATAAAACACATAGCTAAAACTAAA <<+<<<<<<<<<;<<<<8<<<<<<8<<<<<;<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:2:128:629:484 83 chr2 1359 96 35M = 1185 -209 AAAAAAGTAAATAAAACACATAGCTAAAACTAAAA :(::<<<<<<<<<< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:82 H1:i:85 ++-B7_589:6:33:356:636 73 chr2 1520 0 35M * 0 0 TTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTT 
<<<<<<<8;<<<<<<<<<<<<<7<<<<<<<;;3&3 MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:14 H1:i:85 ++-EAS114_45:6:86:859:1779 137 chr2 1520 0 35M * 0 0 TTTTTTTCATTTCTCTTTTTTTTTTTTTTTTTTTT ;;;;;;;;;;;;;;;;;;;;;;;;;;;8;;)7699 MF:i:32 Aq:i:0 NM:i:1 UQ:i:26 H0:i:0 H1:i:15 ++-EAS54_71:8:105:854:975 83 chr2 1523 71 33M = 1354 -202 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG <<<<;<:<<;<&<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:85 H1:i:85 ++-EAS51_62:4:187:907:145 153 chr2 1524 28 35M * 0 0 TTTCTTCTCTCTCTTTTTTTTTTTTTTTATTGCAT <<<+;;,6<<<<6<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:28 NM:i:3 UQ:i:59 H0:i:0 H1:i:0 ++-EAS54_71:4:284:269:882 73 chr2 1524 0 34M * 0 0 TTTCTTTTCTCTTTTTTTTTTTTTTGTTTTTGCA <;<<<<<8<7<8;<<<;<7<<<<<;272;73&&) MF:i:32 Aq:i:0 NM:i:1 UQ:i:17 H0:i:0 H1:i:85 ++-EAS56_63:4:141:9:811 137 chr2 1524 10 35M * 0 0 TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT <<<;<<<<<<<;<;<:<<<;<<<<<<<<..));;. MF:i:32 Aq:i:0 NM:i:3 UQ:i:47 H0:i:2 H1:i:27 ++-EAS114_30:6:277:397:932 73 chr2 1524 0 35M * 0 0 TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT <<<<<<<<<<<<<<<<<<<<<<<<<<<<:8(,0%( MF:i:32 Aq:i:0 NM:i:3 UQ:i:42 H0:i:2 H1:i:85 ++-EAS139_11:7:50:1229:1313 83 chr2 1528 77 35M = 1376 -187 TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA <<<<,<&<7<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:11 H0:i:3 H1:i:7 ++-EAS54_65:3:320:20:250 147 chr2 1532 77 35M = 1367 -200 TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA +'''/<<<<7:;+<;::<<<;;<<<<<<<<<<<<< MF:i:18 Aq:i:6 NM:i:2 UQ:i:24 H0:i:1 H1:i:2 ++-EAS114_26:7:37:79:581 83 chr2 1533 68 35M = 1349 -219 TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA 3,,,===6===<===<;=====-============ MF:i:18 Aq:i:27 NM:i:2 UQ:i:23 H0:i:0 H1:i:1 ++--- python-pysam.orig/tests/pysam_data/example_user_header.sam +++++ /dev/null ++@@ -1,8 +0,0 @@ ++-@HD VN:1.0 ++-@SQ SN:chr1 LN:1575 ++-@SQ SN:chr2 LN:1584 ++-@x1 A:2 B:5 ++-@x2 A:4 B:5 ++-@x3 A:6 B:5 ++-read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 ++-read_28701_28881_323b 147 
chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 ++--- python-pysam.orig/tests/pysam_data/Makefile +++++ python-pysam/tests/pysam_data/Makefile ++@@ -14,7 +14,6 @@ ++ $(BAM) $(BAI) \ ++ $(CRAM) $(CRAI) \ ++ example_bai.bam \ ++- rg_with_tab.bam \ ++ ex2_truncated.bam \ ++ empty.bam empty.bam.bai \ ++ explicit_index.bam explicit_index.cram \ ++--- python-pysam.orig/pysam/alternatives.py.obsolete +++++ python-pysam/pysam/alternatives.py.obsolete ++@@ -12,7 +12,6 @@ ++ int bam_merge(int argc, char *argv[]) ++ int bam_index(int argc, char *argv[]) ++ int bam_sort(int argc, char *argv[]) ++- int bam_tview_main(int argc, char *argv[]) ++ int bam_mating(int argc, char *argv[]) ++ int bam_rmdup(int argc, char *argv[]) ++ int bam_rmdupse(int argc, char *argv[]) ++--- python-pysam.orig/tests/AlignmentFile_test.py +++++ python-pysam/tests/AlignmentFile_test.py ++@@ -1382,19 +1382,19 @@ ++ os.unlink(tmpfilename) ++ ++ ++-class TestDeNovoConstructionUserTags(TestDeNovoConstruction): ++- ++- '''test de novo construction with a header that contains lower-case tags.''' ++- ++- header = {'HD': {'VN': '1.0'}, ++- 'SQ': [{'LN': 1575, 'SN': 'chr1'}, ++- {'LN': 1584, 'SN': 'chr2'}], ++- 'x1': {'A': 2, 'B': 5}, ++- 'x3': {'A': 6, 'B': 5}, ++- 'x2': {'A': 4, 'B': 5}} ++- ++- bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") ++- samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") +++# class TestDeNovoConstructionUserTags(TestDeNovoConstruction): +++# +++# '''test de novo construction with a header that contains lower-case tags.''' +++# +++# header = {'HD': {'VN': '1.0'}, +++# 'SQ': [{'LN': 1575, 'SN': 'chr1'}, +++# {'LN': 1584, 'SN': 'chr2'}], +++# 'x1': {'A': 2, 'B': 5}, +++# 'x3': {'A': 6, 'B': 5}, +++# 'x2': {'A': 4, 'B': 5}} +++# +++# bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") +++# samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") ++ ++ ++ class 
TestEmptyHeader(unittest.TestCase): ++--- python-pysam.orig/tests/samtools_test.py +++++ python-pysam/tests/samtools_test.py ++@@ -78,7 +78,7 @@ ++ # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam", ++ "sort ex1.bam -o %(out)s_ex1.sort.bam", ++ "mpileup ex1.bam > %(out)s_ex1.pileup", ++- "depth ex1.bam > %(out)s_ex1.depth", +++ #"depth ex1.bam > %(out)s_ex1.depth", ++ # TODO: issues with file naming ++ # "faidx ex1.fa; %(out)s_ex1.fa.fai", ++ "index ex1.bam %(out)s_ex1.bam.fai", ++@@ -100,8 +100,8 @@ ++ "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam", ++ "targetcut ex1.bam > %(out)s_ex1.targetcut", ++ "phase ex1.bam > %(out)s_ex1.phase", ++- "import ex1.fa.fai ex1.sam.gz %(out)s_ex1.bam", ++- "bam2fq ex1.bam > %(out)s_ex1.bam2fq", +++ #"view -bt ex1.fa.fai -o %(out)s_ex1.bam ex1.sam.gz", +++ #"bam2fq ex1.bam > %(out)s_ex1.bam2fq", ++ # TODO: not the same ++ # "pad2unpad -T ex1.fa ex2.bam > %(out)s_ex2.unpad", ++ # TODO: command line option problem diff --cc debian/patches/samtools_v1.10_full index 0000000,0000000..dce902d new file mode 100644 --- /dev/null +++ b/debian/patches/samtools_v1.10_full @@@ -1,0 -1,0 +1,39678 @@@ ++Author: Michael R. Crusoe ++Description: sync with samtools 1.10 ++ ++use devtools/import.py and the contents of the samtools ++Debian package with its patches fully applied ++ ++--- python-pysam.orig/samtools/LICENSE +++++ python-pysam/samtools/LICENSE ++@@ -1,6 +1,6 @@ ++ The MIT/Expat License ++ ++-Copyright (C) 2008-2018 Genome Research Ltd. +++Copyright (C) 2008-2019 Genome Research Ltd. 
++ ++ Permission is hereby granted, free of charge, to any person obtaining a copy ++ of this software and associated documentation files (the "Software"), to deal ++--- python-pysam.orig/samtools/README +++++ python-pysam/samtools/README ++@@ -9,7 +9,7 @@ ++ The typical simple case of building Samtools using the HTSlib bundled within ++ this Samtools release tarball is done as follows: ++ ++- cd .../samtools-1.9 # Within the unpacked release directory +++ cd .../samtools-1.10 # Within the unpacked release directory ++ ./configure ++ make ++ ++@@ -21,7 +21,7 @@ ++ installation using the HTSlib bundled within this Samtools release tarball, ++ and building the various HTSlib utilities such as bgzip is done as follows: ++ ++- cd .../samtools-1.9 # Within the unpacked release directory +++ cd .../samtools-1.10 # Within the unpacked release directory ++ ./configure --prefix=/path/to/location ++ make all all-htslib ++ make install install-htslib ++@@ -48,7 +48,7 @@ ++ To build with plug-ins, you need to use the --enable-plugins configure option ++ as follows: ++ ++- cd .../samtools-1.9 # Within the unpacked release directory +++ cd .../samtools-1.10 # Within the unpacked release directory ++ ./configure --enable-plugins --prefix=/path/to/location ++ make all all-htslib ++ make install install-htslib ++@@ -66,8 +66,8 @@ ++ the source distribution instead of installing the package. In that case ++ you can use: ++ ++- cd .../samtools-1.9 # Within the unpacked release directory ++- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.9 +++ cd .../samtools-1.10 # Within the unpacked release directory +++ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.10 ++ make all all-htslib ++ ++ It is possible to override the built-in search path using the HTS_PATH ++--- python-pysam.orig/samtools/bam.c +++++ python-pysam/samtools/bam.c ++@@ -1,6 +1,6 @@ ++ /* bam.c -- BAM format. ++ ++- Copyright (C) 2008-2013, 2015 Genome Research Ltd. 
+++ Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -30,7 +30,6 @@ ++ #include ++ #include "bam.h" ++ #include "htslib/kstring.h" ++-#include "sam_header.h" ++ ++ char *bam_format1(const bam_header_t *header, const bam1_t *b) ++ { ++@@ -59,7 +58,7 @@ ++ char *s; ++ ++ if (b->core.tid < -1 || b->core.mtid < -1) return 0; ++- if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; +++ if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; ++ ++ if (b->data_len < b->core.l_qname) return 0; ++ s = memchr(bam1_qname(b), '\0', b->core.l_qname); ++@@ -77,9 +76,8 @@ ++ // FIXME: we should also check the LB tag associated with each alignment ++ const char *bam_get_library(bam_header_t *h, const bam1_t *b) ++ { ++- // Slow and inefficient. Rewrite once we get a proper header API. ++ const char *rg; ++- char *cp = h->text; +++ kstring_t lib = { 0, 0, NULL }; ++ rg = (char *)bam_aux_get(b, "RG"); ++ ++ if (!rg) ++@@ -87,50 +85,18 @@ ++ else ++ rg++; ++ ++- // Header is guaranteed to be nul terminated, so this is valid. 
++- while (*cp) { ++- char *ID, *LB; ++- char last = '\t'; ++- ++- // Find a @RG line ++- if (strncmp(cp, "@RG", 3) != 0) { ++- while (*cp && *cp != '\n') cp++; // skip line ++- if (*cp) cp++; ++- continue; ++- } ++- ++- // Find ID: and LB: keys ++- cp += 4; ++- ID = LB = NULL; ++- while (*cp && *cp != '\n') { ++- if (last == '\t') { ++- if (strncmp(cp, "LB:", 3) == 0) ++- LB = cp+3; ++- else if (strncmp(cp, "ID:", 3) == 0) ++- ID = cp+3; ++- } ++- last = *cp++; ++- } ++- ++- if (!ID || !LB) ++- continue; ++- ++- // Check it's the correct ID ++- if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') ++- continue; ++- ++- // Valid until next query ++- static char LB_text[1024]; ++- for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++) ++- ; ++- strncpy(LB_text, LB, MIN(cp-LB, 1023)); ++- LB_text[MIN(cp-LB, 1023)] = 0; +++ if (sam_hdr_find_tag_id(h, "RG", "ID", rg, "LB", &lib) < 0) +++ return NULL; ++ ++- // Return it; valid until the next query. ++- return LB_text; ++- } +++ static char LB_text[1024]; +++ int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; +++ +++ memcpy(LB_text, lib.s, len); +++ LB_text[len] = 0; +++ +++ free(lib.s); ++ ++- return NULL; +++ return LB_text; ++ } ++ ++ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) ++--- python-pysam.orig/samtools/bam.c.pysam.c +++++ python-pysam/samtools/bam.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam.c -- BAM format. ++ ++- Copyright (C) 2008-2013, 2015 Genome Research Ltd. +++ Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. 
++ ++ Author: Heng Li ++@@ -32,7 +32,6 @@ ++ #include ++ #include "bam.h" ++ #include "htslib/kstring.h" ++-#include "sam_header.h" ++ ++ char *bam_format1(const bam_header_t *header, const bam1_t *b) ++ { ++@@ -61,7 +60,7 @@ ++ char *s; ++ ++ if (b->core.tid < -1 || b->core.mtid < -1) return 0; ++- if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; +++ if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; ++ ++ if (b->data_len < b->core.l_qname) return 0; ++ s = memchr(bam1_qname(b), '\0', b->core.l_qname); ++@@ -79,9 +78,8 @@ ++ // FIXME: we should also check the LB tag associated with each alignment ++ const char *bam_get_library(bam_header_t *h, const bam1_t *b) ++ { ++- // Slow and inefficient. Rewrite once we get a proper header API. ++ const char *rg; ++- char *cp = h->text; +++ kstring_t lib = { 0, 0, NULL }; ++ rg = (char *)bam_aux_get(b, "RG"); ++ ++ if (!rg) ++@@ -89,50 +87,18 @@ ++ else ++ rg++; ++ ++- // Header is guaranteed to be nul terminated, so this is valid. 
++- while (*cp) { ++- char *ID, *LB; ++- char last = '\t'; ++- ++- // Find a @RG line ++- if (strncmp(cp, "@RG", 3) != 0) { ++- while (*cp && *cp != '\n') cp++; // skip line ++- if (*cp) cp++; ++- continue; ++- } ++- ++- // Find ID: and LB: keys ++- cp += 4; ++- ID = LB = NULL; ++- while (*cp && *cp != '\n') { ++- if (last == '\t') { ++- if (strncmp(cp, "LB:", 3) == 0) ++- LB = cp+3; ++- else if (strncmp(cp, "ID:", 3) == 0) ++- ID = cp+3; ++- } ++- last = *cp++; ++- } ++- ++- if (!ID || !LB) ++- continue; ++- ++- // Check it's the correct ID ++- if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') ++- continue; ++- ++- // Valid until next query ++- static char LB_text[1024]; ++- for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++) ++- ; ++- strncpy(LB_text, LB, MIN(cp-LB, 1023)); ++- LB_text[MIN(cp-LB, 1023)] = 0; +++ if (sam_hdr_find_tag_id(h, "RG", "ID", rg, "LB", &lib) < 0) +++ return NULL; ++ ++- // Return it; valid until the next query. ++- return LB_text; ++- } +++ static char LB_text[1024]; +++ int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; +++ +++ memcpy(LB_text, lib.s, len); +++ LB_text[len] = 0; +++ +++ free(lib.s); ++ ++- return NULL; +++ return LB_text; ++ } ++ ++ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) ++--- python-pysam.orig/samtools/bam.h +++++ python-pysam/samtools/bam.h ++@@ -1,6 +1,6 @@ ++ /* bam.h -- BAM API. ++ ++- Copyright (C) 2008-2014 Genome Research Ltd. +++ Copyright (C) 2008-2014, 2019 Genome Research Ltd. ++ Portions copyright (C) 2010-2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -38,7 +38,7 @@ ++ @copyright Genome Research Ltd. ++ */ ++ ++-#define BAM_VERSION "1.9" +++#define BAM_VERSION "1.10" ++ ++ #include ++ #include ++@@ -224,16 +224,6 @@ ++ // int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); ++ ++ /*! ++- @abstract Read header information from a TAB-delimited list file. 
++- @param fn_list file name for the list ++- @return a pointer to the header structure ++- ++- @discussion Each line in this file consists of chromosome name and ++- the length of chromosome. ++- */ ++- bam_header_t *sam_header_read2(const char *fn_list); ++- ++- /*! ++ @abstract Read header from a SAM file (if present) ++ @param fp SAM file handler ++ @return pointer to header struct; 0 if no @SQ lines available ++@@ -252,13 +242,13 @@ ++ @abstract Initialize a header structure. ++ @return the pointer to the header structure ++ */ ++- static inline bam_header_t *bam_header_init(void) { return bam_hdr_init(); } +++ static inline bam_header_t *bam_header_init(void) { return sam_hdr_init(); } ++ ++ /*! ++ @abstract Destroy a header structure. ++ @param header pointer to the header ++ */ ++- static inline void bam_header_destroy(bam_header_t *header) { bam_hdr_destroy(header); } +++ static inline void bam_header_destroy(bam_header_t *header) { sam_hdr_destroy(header); } ++ ++ /*! ++ @abstract Read a header structure from BAM. ++@@ -277,7 +267,7 @@ ++ @param header pointer to the header structure ++ @return always 0 currently ++ */ ++- static inline int bam_header_write(bamFile fp, const bam_header_t *header) { return bam_hdr_write(fp, header); } +++ static inline int bam_header_write(bamFile fp, bam_header_t *header) { return bam_hdr_write(fp, header); } ++ ++ /*! ++ @abstract Read an alignment from BAM. ++--- python-pysam.orig/samtools/bam2bcf.c +++++ python-pysam/samtools/bam2bcf.c ++@@ -1,7 +1,7 @@ ++ /* bam2bcf.c -- variant calling. ++ ++ Copyright (C) 2010-2012 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2015 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++--- python-pysam.orig/samtools/bam2bcf.c.pysam.c +++++ python-pysam/samtools/bam2bcf.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bam2bcf.c -- variant calling. ++ ++ Copyright (C) 2010-2012 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. 
+++ Copyright (C) 2012-2015 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++--- python-pysam.orig/samtools/bam2bcf.h +++++ python-pysam/samtools/bam2bcf.h ++@@ -1,7 +1,7 @@ ++ /* bam2bcf.h -- variant calling. ++ ++ Copyright (C) 2010-2012 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2014, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -99,7 +99,8 @@ ++ } bcf_callret1_t; ++ ++ typedef struct { ++- int tid, pos; +++ int tid; +++ hts_pos_t pos; ++ bcf_hdr_t *bcf_hdr; ++ int a[5]; // alleles: ref, alt, alt2, alt3 ++ float qsum[5]; // for the QS tag ++@@ -128,7 +129,7 @@ ++ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); ++ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, ++ const bcf_callaux_t *bca, const char *ref); ++- int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, +++ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, ++ const void *rghash); ++ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); ++ ++--- python-pysam.orig/samtools/bam2bcf_indel.c +++++ python-pysam/samtools/bam2bcf_indel.c ++@@ -1,7 +1,7 @@ ++ /* bam2bcf_indel.c -- indel caller. ++ ++ Copyright (C) 2010, 2011 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2014, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -87,9 +87,10 @@ ++ kh_destroy(rg, hash); ++ } ++ ++-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +++static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) ++ { ++- int k, x = c->pos, y = 0, last_y = 0; +++ int k, y = 0, last_y = 0; +++ hts_pos_t x = c->pos; ++ *_tpos = c->pos; ++ for (k = 0; k < c->n_cigar; ++k) { ++ int op = cigar[k] & BAM_CIGAR_MASK; ++@@ -124,9 +125,10 @@ ++ return q < qh? q : qh; ++ } ++ ++-static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +++static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) ++ { ++- int i, j, max = 0, max_i = pos, score = 0; +++ int j, max = 0, score = 0; +++ hts_pos_t i, max_i = pos; ++ l = abs(l); ++ for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { ++ if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; ++@@ -146,11 +148,12 @@ ++ - 8: estimated sequence quality .. (aux>>8)&0xff ++ - 8: indel quality .. aux&0xff ++ */ ++-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, +++int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, ++ const void *rghash) ++ { ++- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; +++ int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; ++ int N, K, l_run, ref_type, n_alt; +++ hts_pos_t i, j, left, right; ++ char *inscns = 0, *ref2, *query, **ref_sample; ++ khash_t(rg) *hash = (khash_t(rg)*)rghash; ++ if (ref == 0 || bca == 0) return -1; ++@@ -225,7 +228,7 @@ ++ free(aux); ++ // TODO revisit how/whether to control printing this warning ++ if (hts_verbose >= 2) ++- fprintf(stderr, "[%s] excessive INDEL alleles at position %d. 
Skip the position.\n", __func__, pos + 1); +++ fprintf(stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1); ++ return -1; ++ } ++ types = (int*)calloc(n_types, sizeof(int)); ++@@ -274,7 +277,7 @@ ++ bam1_t *b = p->b; ++ uint32_t *cigar = bam_get_cigar(b); ++ uint8_t *seq = bam_get_seq(b); ++- int x = b->core.pos, y = 0; +++ hts_pos_t x = b->core.pos, y = 0; ++ for (k = 0; k < b->core.n_cigar; ++k) { ++ int op = cigar[k]&0xf; ++ int j, l = cigar[k]>>4; ++@@ -382,7 +385,8 @@ ++ // align each read to ref2 ++ for (i = 0; i < n_plp[s]; ++i, ++K) { ++ bam_pileup1_t *p = plp[s] + i; ++- int qbeg, qend, tbeg, tend, sc, kk; +++ int qbeg, qend, sc, kk; +++ hts_pos_t tbeg, tend; ++ uint8_t *seq = bam_get_seq(p->b); ++ uint32_t *cigar = bam_get_cigar(p->b); ++ if (p->b->core.flag&4) continue; // unmapped reads ++--- python-pysam.orig/samtools/bam2bcf_indel.c.pysam.c +++++ python-pysam/samtools/bam2bcf_indel.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bam2bcf_indel.c -- indel caller. ++ ++ Copyright (C) 2010, 2011 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2014, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -89,9 +89,10 @@ ++ kh_destroy(rg, hash); ++ } ++ ++-static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +++static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) ++ { ++- int k, x = c->pos, y = 0, last_y = 0; +++ int k, y = 0, last_y = 0; +++ hts_pos_t x = c->pos; ++ *_tpos = c->pos; ++ for (k = 0; k < c->n_cigar; ++k) { ++ int op = cigar[k] & BAM_CIGAR_MASK; ++@@ -126,9 +127,10 @@ ++ return q < qh? 
q : qh; ++ } ++ ++-static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +++static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) ++ { ++- int i, j, max = 0, max_i = pos, score = 0; +++ int j, max = 0, score = 0; +++ hts_pos_t i, max_i = pos; ++ l = abs(l); ++ for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { ++ if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; ++@@ -148,11 +150,12 @@ ++ - 8: estimated sequence quality .. (aux>>8)&0xff ++ - 8: indel quality .. aux&0xff ++ */ ++-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, +++int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, ++ const void *rghash) ++ { ++- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; +++ int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; ++ int N, K, l_run, ref_type, n_alt; +++ hts_pos_t i, j, left, right; ++ char *inscns = 0, *ref2, *query, **ref_sample; ++ khash_t(rg) *hash = (khash_t(rg)*)rghash; ++ if (ref == 0 || bca == 0) return -1; ++@@ -227,7 +230,7 @@ ++ free(aux); ++ // TODO revisit how/whether to control printing this warning ++ if (hts_verbose >= 2) ++- fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); +++ fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". 
Skip the position.\n", __func__, pos + 1); ++ return -1; ++ } ++ types = (int*)calloc(n_types, sizeof(int)); ++@@ -276,7 +279,7 @@ ++ bam1_t *b = p->b; ++ uint32_t *cigar = bam_get_cigar(b); ++ uint8_t *seq = bam_get_seq(b); ++- int x = b->core.pos, y = 0; +++ hts_pos_t x = b->core.pos, y = 0; ++ for (k = 0; k < b->core.n_cigar; ++k) { ++ int op = cigar[k]&0xf; ++ int j, l = cigar[k]>>4; ++@@ -384,7 +387,8 @@ ++ // align each read to ref2 ++ for (i = 0; i < n_plp[s]; ++i, ++K) { ++ bam_pileup1_t *p = plp[s] + i; ++- int qbeg, qend, tbeg, tend, sc, kk; +++ int qbeg, qend, sc, kk; +++ hts_pos_t tbeg, tend; ++ uint8_t *seq = bam_get_seq(p->b); ++ uint32_t *cigar = bam_get_cigar(p->b); ++ if (p->b->core.flag&4) continue; // unmapped reads ++--- python-pysam.orig/samtools/bam2depth.c +++++ python-pysam/samtools/bam2depth.c ++@@ -1,7 +1,7 @@ ++ /* bam2depth.c -- depth subcommand. ++ ++ Copyright (C) 2011, 2012 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -39,20 +39,19 @@ ++ #include ++ #include "htslib/sam.h" ++ #include "samtools.h" +++#include "bedidx.h" ++ #include "sam_opts.h" ++ +++#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) +++ ++ typedef struct { // auxiliary data structure ++ samFile *fp; // the file handle ++- bam_hdr_t *hdr; // the file header +++ sam_hdr_t *hdr; // the file header ++ hts_itr_t *iter; // NULL if a region not specified ++ int min_mapQ, min_len; // mapQ filter; length filter +++ uint32_t flags; // read filtering flags ++ } aux_t; ++ ++-void *bed_read(const char *fn); // read a BED or position list file ++-void bed_destroy(void *_h); // destroy the BED data structure ++-int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps ++-int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end); ++- ++ // This function reads a BAM alignment from one BAM file. 
++ static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup ++ { ++@@ -62,7 +61,7 @@ ++ { ++ ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); ++ if ( ret<0 ) break; ++- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; +++ if ( b->core.flag & aux->flags) continue; ++ if ( (int)b->core.qual < aux->min_mapQ ) continue; ++ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; ++ break; ++@@ -79,15 +78,21 @@ ++ fprintf(stderr, " -a output all positions (including zero depth)\n"); ++ fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); ++ fprintf(stderr, " -b list of positions or regions\n"); +++ fprintf(stderr, " -X use customized index files\n"); ++ fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); +++ fprintf(stderr, " -H print a file header\n"); ++ fprintf(stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); ++ fprintf(stderr, " -d/-m maximum coverage depth [8000]. 
If 0, depth is set to the maximum\n" ++ " integer value, effectively removing any depth limit.\n"); // the htslib's default +++ fprintf(stderr, " -o FILE where to write output to [stdout]\n"); ++ fprintf(stderr, " -q base quality threshold [0]\n"); ++ fprintf(stderr, " -Q mapping quality threshold [0]\n"); ++ fprintf(stderr, " -r region\n"); +++ fprintf(stderr, " -g include reads that have any of the specified flags set [0]\n"); +++ fprintf(stderr, " -G filter out reads that have any of the specified flags set" +++ " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); ++ ++- sam_global_opt_help(stderr, "-.--.-"); +++ sam_global_opt_help(stderr, "-.--.--."); ++ ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); ++@@ -95,21 +100,27 @@ ++ fprintf(stderr, "omitted by default; see the -a option.\n"); ++ fprintf(stderr, "\n"); ++ ++- return 1; +++ return EXIT_FAILURE; ++ } ++ ++ int main_depth(int argc, char *argv[]) ++ { ++- int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; +++ int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; +++ hts_pos_t beg, end, pos, last_pos = -1; ++ int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; ++ const bam_pileup1_t **plp; ++ char *reg = 0; // specified region ++ void *bed = 0; // BED data structure ++ char *file_list = NULL, **fn = NULL; ++- bam_hdr_t *h = NULL; // BAM header of the 1st input +++ sam_hdr_t *h = NULL; // BAM header of the 1st input ++ aux_t **data; ++ bam_mplp_t mplp; ++- int last_pos = -1, last_tid = -1, ret; +++ int last_tid = -1, ret; +++ int print_header = 0; +++ char *output_file = NULL; +++ FILE *file_out = stdout; +++ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); +++ int tflags = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++@@ -118,19 +129,41 @@ ++ }; ++ ++ // parse the command line ++- while ((n = 
getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { +++ while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { ++ switch (n) { ++ case 'l': min_len = atoi(optarg); break; // minimum query length ++ case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header ++ case 'b': ++ bed = bed_read(optarg); // BED or position list file can be parsed now ++- if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } +++ if (!bed) { +++ print_error_errno("depth", "Could not read file \"%s\"", optarg); +++ return EXIT_FAILURE; +++ } ++ break; +++ case 'X': has_index_file = 1; break; ++ case 'q': baseQ = atoi(optarg); break; // base quality threshold ++ case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold ++ case 'f': file_list = optarg; break; ++ case 'a': all++; break; ++ case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth +++ case 'H': print_header = 1; break; +++ case 'o': output_file = optarg; break; +++ case 'g': +++ tflags = bam_str2flag(optarg); +++ if (tflags < 0 || tflags > BAM_FMAX) { +++ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); +++ return 1; +++ } +++ flags &= ~tflags; +++ break; +++ case 'G': +++ tflags = bam_str2flag(optarg); +++ if (tflags < 0 || tflags > BAM_FMAX) { +++ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); +++ return 1; +++ } +++ flags |= tflags; +++ break; ++ default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return usage(); ++@@ -139,18 +172,40 @@ ++ if (optind == argc && !file_list) ++ return usage(); ++ +++ /* output file provided by user */ +++ if (output_file != NULL && strcmp(output_file,"-")!=0) { +++ file_out = fopen( output_file, "w" ); +++ if (file_out == NULL) { +++ print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); +++ return EXIT_FAILURE; +++ } +++ } +++ +++ ++ 
// initialize the auxiliary data structures ++ if (file_list) ++ { ++- if ( read_file_list(file_list,&nfiles,&fn) ) return 1; +++ if (has_index_file) { +++ print_error("depth", "The -f option cannot be combined with -X"); +++ return 1; +++ } +++ if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; ++ n = nfiles; ++ argv = fn; ++ optind = 0; ++ } ++- else ++- n = argc - optind; // the number of BAMs on the command line +++ else if (has_index_file) { // Calculate # of input BAM files +++ if ((argc - optind) % 2 != 0) { +++ fprintf(stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); +++ return 1; +++ } +++ n = (argc - optind) / 2; +++ } else { +++ n = argc - optind; +++ } ++ data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input ++- reg_tid = 0; beg = 0; end = INT_MAX; // set the default region +++ reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region +++ ++ for (i = 0; i < n; ++i) { ++ int rf; ++ data[i] = calloc(1, sizeof(aux_t)); ++@@ -163,24 +218,32 @@ ++ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; ++ if (baseQ) rf |= SAM_QUAL; ++ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { ++- fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); ++- return 1; +++ print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); +++ status = EXIT_FAILURE; +++ goto depth_end; ++ } ++ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { ++- fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); ++- return 1; +++ print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); +++ status = EXIT_FAILURE; +++ goto depth_end; ++ } ++ data[i]->min_mapQ = mapQ; // set the mapQ filter ++ data[i]->min_len = min_len; // set the qlen filter ++ data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header ++ if (data[i]->hdr == NULL) { ++- fprintf(stderr, "Couldn't read header for \"%s\"\n", ++- argv[optind+i]); +++ 
print_error_errno("depth", "Couldn't read header for \"%s\"", +++ argv[optind+i]); ++ status = EXIT_FAILURE; ++ goto depth_end; ++ } ++ if (reg) { // if a region is specified ++- hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (has_index_file) { +++ idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index +++ } else { +++ idx = sam_index_load(data[i]->fp, argv[optind+i]); +++ } ++ if (idx == NULL) { ++ print_error("depth", "can't load index for \"%s\"", argv[optind+i]); ++ status = EXIT_FAILURE; ++@@ -194,8 +257,16 @@ ++ goto depth_end; ++ } ++ } +++ data[i]->flags = flags; ++ } ++- +++ if (print_header) { +++ fputs("#CHROM\tPOS", file_out); +++ for (i = 0; i < n; ++i) { +++ fputc('\t', file_out); +++ fputs(argv[optind+i], file_out); +++ } +++ fputc('\n', file_out); +++ } ++ h = data[0]->hdr; // easy access to the header of the 1st BAM ++ if (reg) { ++ beg = data[0]->iter->beg; // and to the parsed region coordinates ++@@ -211,21 +282,22 @@ ++ bam_mplp_set_maxcnt(mplp,INT_MAX); ++ n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM ++ plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) ++- while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position +++ while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position ++ if (pos < beg || pos >= end) continue; // out of range; skip ++- if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? +++ if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? ++ if (all) { ++ while (tid > last_tid) { ++ if (last_tid >= 0 && !reg) { ++ // Deal with remainder or entirety of last tid. 
++- while (++last_pos < h->target_len[last_tid]) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ // Horribly inefficient, but the bed API is an obfuscated black box. ++- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, last_tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- putchar('\t'), putchar('0'); ++- putchar('\n'); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ } ++ last_tid++; ++@@ -237,19 +309,21 @@ ++ // Deal with missing portion of current tid ++ while (++last_pos < pos) { ++ if (last_pos < beg) continue; // out of range; skip ++- if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- putchar('\t'), putchar('0'); ++- putchar('\n'); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ ++ last_tid = tid; ++ last_pos = pos; ++ } ++- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; ++- fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; +++ fputs(sam_hdr_tid2name(h, tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster ++ for (i = 0; i < n; ++i) { // base level filters have to go here ++ int j, m = 0; ++ for (j = 0; j < n_plp[i]; ++j) { ++@@ -258,9 +332,9 @@ ++ 
else if (p->qpos < p->b->core.l_qseq && ++ bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality ++ } ++- printf("\t%d", n_plp[i] - m); // this the depth to output +++ fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output ++ } ++- putchar('\n'); +++ fputc('\n', file_out); ++ } ++ if (ret < 0) status = EXIT_FAILURE; ++ free(n_plp); free(plp); ++@@ -268,19 +342,20 @@ ++ ++ if (all) { ++ // Handle terminating region ++- if (last_tid < 0 && reg && all > 1) { +++ if (last_tid < 0 && reg) { ++ last_tid = reg_tid; ++ last_pos = beg-1; ++ } ++- while (last_tid >= 0 && last_tid < h->n_targets) { ++- while (++last_pos < h->target_len[last_tid]) { +++ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ if (last_pos >= end) break; ++- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, last_tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- putchar('\t'), putchar('0'); ++- putchar('\n'); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ last_tid++; ++ last_pos = -1; ++@@ -290,8 +365,17 @@ ++ } ++ ++ depth_end: +++ if (fclose(file_out) != 0) { +++ if (status == EXIT_SUCCESS) { +++ print_error_errno("depth", "error on closing \"%s\"", +++ (output_file && strcmp(output_file, "-") != 0 +++ ? 
output_file : "stdout")); +++ status = EXIT_FAILURE; +++ } +++ } +++ ++ for (i = 0; i < n && data[i]; ++i) { ++- bam_hdr_destroy(data[i]->hdr); +++ sam_hdr_destroy(data[i]->hdr); ++ if (data[i]->fp) sam_close(data[i]->fp); ++ hts_itr_destroy(data[i]->iter); ++ free(data[i]); ++--- python-pysam.orig/samtools/bam2depth.c.pysam.c +++++ python-pysam/samtools/bam2depth.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bam2depth.c -- depth subcommand. ++ ++ Copyright (C) 2011, 2012 Broad Institute. ++- Copyright (C) 2012-2014 Genome Research Ltd. +++ Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -41,20 +41,19 @@ ++ #include ++ #include "htslib/sam.h" ++ #include "samtools.h" +++#include "bedidx.h" ++ #include "sam_opts.h" ++ +++#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) +++ ++ typedef struct { // auxiliary data structure ++ samFile *fp; // the file handle ++- bam_hdr_t *hdr; // the file header +++ sam_hdr_t *hdr; // the file header ++ hts_itr_t *iter; // NULL if a region not specified ++ int min_mapQ, min_len; // mapQ filter; length filter +++ uint32_t flags; // read filtering flags ++ } aux_t; ++ ++-void *bed_read(const char *fn); // read a BED or position list file ++-void bed_destroy(void *_h); // destroy the BED data structure ++-int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps ++-int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end); ++- ++ // This function reads a BAM alignment from one BAM file. ++ static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup ++ { ++@@ -64,7 +63,7 @@ ++ { ++ ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); ++ if ( ret<0 ) break; ++- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; +++ if ( b->core.flag & aux->flags) continue; ++ if ( (int)b->core.qual < aux->min_mapQ ) continue; ++ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; ++ break; ++@@ -81,15 +80,21 @@ ++ fprintf(samtools_stderr, " -a output all positions (including zero depth)\n"); ++ fprintf(samtools_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); ++ fprintf(samtools_stderr, " -b list of positions or regions\n"); +++ fprintf(samtools_stderr, " -X use customized index files\n"); ++ fprintf(samtools_stderr, " -f list of input BAM filenames, one per line [null]\n"); +++ fprintf(samtools_stderr, " -H print a file header\n"); ++ fprintf(samtools_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); ++ fprintf(samtools_stderr, " -d/-m maximum coverage depth [8000]. 
If 0, depth is set to the maximum\n" ++ " integer value, effectively removing any depth limit.\n"); // the htslib's default +++ fprintf(samtools_stderr, " -o FILE where to write output to [samtools_stdout]\n"); ++ fprintf(samtools_stderr, " -q base quality threshold [0]\n"); ++ fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); ++ fprintf(samtools_stderr, " -r region\n"); +++ fprintf(samtools_stderr, " -g include reads that have any of the specified flags set [0]\n"); +++ fprintf(samtools_stderr, " -G filter out reads that have any of the specified flags set" +++ " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); ++ ++- sam_global_opt_help(samtools_stderr, "-.--.-"); +++ sam_global_opt_help(samtools_stderr, "-.--.--."); ++ ++ fprintf(samtools_stderr, "\n"); ++ fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); ++@@ -97,21 +102,27 @@ ++ fprintf(samtools_stderr, "omitted by default; see the -a option.\n"); ++ fprintf(samtools_stderr, "\n"); ++ ++- return 1; +++ return EXIT_FAILURE; ++ } ++ ++ int main_depth(int argc, char *argv[]) ++ { ++- int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; +++ int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; +++ hts_pos_t beg, end, pos, last_pos = -1; ++ int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; ++ const bam_pileup1_t **plp; ++ char *reg = 0; // specified region ++ void *bed = 0; // BED data structure ++ char *file_list = NULL, **fn = NULL; ++- bam_hdr_t *h = NULL; // BAM header of the 1st input +++ sam_hdr_t *h = NULL; // BAM header of the 1st input ++ aux_t **data; ++ bam_mplp_t mplp; ++- int last_pos = -1, last_tid = -1, ret; +++ int last_tid = -1, ret; +++ int print_header = 0; +++ char *output_file = NULL; +++ FILE *file_out = samtools_stdout; +++ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); +++ int tflags = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; 
++ static const struct option lopts[] = { ++@@ -120,19 +131,41 @@ ++ }; ++ ++ // parse the command line ++- while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { +++ while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { ++ switch (n) { ++ case 'l': min_len = atoi(optarg); break; // minimum query length ++ case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header ++ case 'b': ++ bed = bed_read(optarg); // BED or position list file can be parsed now ++- if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } +++ if (!bed) { +++ print_error_errno("depth", "Could not read file \"%s\"", optarg); +++ return EXIT_FAILURE; +++ } ++ break; +++ case 'X': has_index_file = 1; break; ++ case 'q': baseQ = atoi(optarg); break; // base quality threshold ++ case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold ++ case 'f': file_list = optarg; break; ++ case 'a': all++; break; ++ case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth +++ case 'H': print_header = 1; break; +++ case 'o': output_file = optarg; break; +++ case 'g': +++ tflags = bam_str2flag(optarg); +++ if (tflags < 0 || tflags > BAM_FMAX) { +++ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); +++ return 1; +++ } +++ flags &= ~tflags; +++ break; +++ case 'G': +++ tflags = bam_str2flag(optarg); +++ if (tflags < 0 || tflags > BAM_FMAX) { +++ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); +++ return 1; +++ } +++ flags |= tflags; +++ break; ++ default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return usage(); ++@@ -141,18 +174,40 @@ ++ if (optind == argc && !file_list) ++ return usage(); ++ +++ /* output file provided by user */ +++ if (output_file != NULL && strcmp(output_file,"-")!=0) { +++ file_out = fopen( output_file, "w" ); +++ if (file_out == NULL) { +++ 
print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); +++ return EXIT_FAILURE; +++ } +++ } +++ +++ ++ // initialize the auxiliary data structures ++ if (file_list) ++ { ++- if ( read_file_list(file_list,&nfiles,&fn) ) return 1; +++ if (has_index_file) { +++ print_error("depth", "The -f option cannot be combined with -X"); +++ return 1; +++ } +++ if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; ++ n = nfiles; ++ argv = fn; ++ optind = 0; ++ } ++- else ++- n = argc - optind; // the number of BAMs on the command line +++ else if (has_index_file) { // Calculate # of input BAM files +++ if ((argc - optind) % 2 != 0) { +++ fprintf(samtools_stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); +++ return 1; +++ } +++ n = (argc - optind) / 2; +++ } else { +++ n = argc - optind; +++ } ++ data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input ++- reg_tid = 0; beg = 0; end = INT_MAX; // set the default region +++ reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region +++ ++ for (i = 0; i < n; ++i) { ++ int rf; ++ data[i] = calloc(1, sizeof(aux_t)); ++@@ -165,24 +220,32 @@ ++ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; ++ if (baseQ) rf |= SAM_QUAL; ++ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { ++- fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); ++- return 1; +++ print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); +++ status = EXIT_FAILURE; +++ goto depth_end; ++ } ++ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { ++- fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); ++- return 1; +++ print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); +++ status = EXIT_FAILURE; +++ goto depth_end; ++ } ++ data[i]->min_mapQ = mapQ; // set the mapQ filter ++ data[i]->min_len = min_len; // set the qlen filter ++ data[i]->hdr = sam_hdr_read(data[i]->fp); // 
read the BAM header ++ if (data[i]->hdr == NULL) { ++- fprintf(samtools_stderr, "Couldn't read header for \"%s\"\n", ++- argv[optind+i]); +++ print_error_errno("depth", "Couldn't read header for \"%s\"", +++ argv[optind+i]); ++ status = EXIT_FAILURE; ++ goto depth_end; ++ } ++ if (reg) { // if a region is specified ++- hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (has_index_file) { +++ idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index +++ } else { +++ idx = sam_index_load(data[i]->fp, argv[optind+i]); +++ } ++ if (idx == NULL) { ++ print_error("depth", "can't load index for \"%s\"", argv[optind+i]); ++ status = EXIT_FAILURE; ++@@ -196,8 +259,16 @@ ++ goto depth_end; ++ } ++ } +++ data[i]->flags = flags; ++ } ++- +++ if (print_header) { +++ fputs("#CHROM\tPOS", file_out); +++ for (i = 0; i < n; ++i) { +++ fputc('\t', file_out); +++ fputs(argv[optind+i], file_out); +++ } +++ fputc('\n', file_out); +++ } ++ h = data[0]->hdr; // easy access to the header of the 1st BAM ++ if (reg) { ++ beg = data[0]->iter->beg; // and to the parsed region coordinates ++@@ -213,21 +284,22 @@ ++ bam_mplp_set_maxcnt(mplp,INT_MAX); ++ n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM ++ plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) ++- while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position +++ while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position ++ if (pos < beg || pos >= end) continue; // out of range; skip ++- if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? +++ if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? 
++ if (all) { ++ while (tid > last_tid) { ++ if (last_tid >= 0 && !reg) { ++ // Deal with remainder or entirety of last tid. ++- while (++last_pos < h->target_len[last_tid]) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ // Horribly inefficient, but the bed API is an obfuscated black box. ++- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, last_tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); ++- fputc('\n', samtools_stdout); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ } ++ last_tid++; ++@@ -239,19 +311,21 @@ ++ // Deal with missing portion of current tid ++ while (++last_pos < pos) { ++ if (last_pos < beg) continue; // out of range; skip ++- if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); ++- fputc('\n', samtools_stdout); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ ++ last_tid = tid; ++ last_pos = pos; ++ } ++- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; ++- fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", pos+1); // a customized fprintf(samtools_stdout, ) would be faster +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, 
tid), pos, pos + 1) == 0) continue; +++ fputs(sam_hdr_tid2name(h, tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized fprintf(samtools_stdout, ) would be faster ++ for (i = 0; i < n; ++i) { // base level filters have to go here ++ int j, m = 0; ++ for (j = 0; j < n_plp[i]; ++j) { ++@@ -260,9 +334,9 @@ ++ else if (p->qpos < p->b->core.l_qseq && ++ bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality ++ } ++- fprintf(samtools_stdout, "\t%d", n_plp[i] - m); // this the depth to output +++ fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output ++ } ++- fputc('\n', samtools_stdout); +++ fputc('\n', file_out); ++ } ++ if (ret < 0) status = EXIT_FAILURE; ++ free(n_plp); free(plp); ++@@ -270,19 +344,20 @@ ++ ++ if (all) { ++ // Handle terminating region ++- if (last_tid < 0 && reg && all > 1) { +++ if (last_tid < 0 && reg) { ++ last_tid = reg_tid; ++ last_pos = beg-1; ++ } ++- while (last_tid >= 0 && last_tid < h->n_targets) { ++- while (++last_pos < h->target_len[last_tid]) { +++ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ if (last_pos >= end) break; ++- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); +++ fputs(sam_hdr_tid2name(h, last_tid), file_out); +++ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); ++ for (i = 0; i < n; i++) ++- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); ++- fputc('\n', samtools_stdout); +++ fputc('\t', file_out), fputc('0', file_out); +++ fputc('\n', file_out); ++ } ++ last_tid++; ++ last_pos = -1; ++@@ -292,8 +367,17 @@ ++ } ++ ++ depth_end: +++ if (fclose(file_out) != 0) { +++ if (status == EXIT_SUCCESS) { +++ print_error_errno("depth", "error on closing \"%s\"", +++ 
(output_file && strcmp(output_file, "-") != 0 +++ ? output_file : "samtools_stdout")); +++ status = EXIT_FAILURE; +++ } +++ } +++ ++ for (i = 0; i < n && data[i]; ++i) { ++- bam_hdr_destroy(data[i]->hdr); +++ sam_hdr_destroy(data[i]->hdr); ++ if (data[i]->fp) sam_close(data[i]->fp); ++ hts_itr_destroy(data[i]->iter); ++ free(data[i]); ++--- python-pysam.orig/samtools/bam_addrprg.c +++++ python-pysam/samtools/bam_addrprg.c ++@@ -1,6 +1,6 @@ ++ /* bam_addrprg.c -- samtools command to add or replace readgroups. ++ ++- Copyright (c) 2013, 2015, 2016 Genome Research Limited. +++ Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited. ++ ++ Author: Martin O. Pollard ++ ++@@ -47,6 +47,7 @@ ++ char* output_name; ++ char* rg_id; ++ char* rg_line; +++ int no_pg; ++ rg_mode mode; ++ sam_global_args ga; ++ htsThreadPool p; ++@@ -58,9 +59,9 @@ ++ ++ struct state { ++ samFile* input_file; ++- bam_hdr_t* input_header; +++ sam_hdr_t* input_header; ++ samFile* output_file; ++- bam_hdr_t* output_header; +++ sam_hdr_t* output_header; ++ char* rg_id; ++ void (*mode_func)(const state_t*, bam1_t*); ++ }; ++@@ -71,6 +72,7 @@ ++ free(opts->rg_id); ++ free(opts->output_name); ++ free(opts->input_name); +++ free(opts->rg_line); ++ if (opts->p.pool) hts_tpool_destroy(opts->p.pool); ++ sam_global_args_free(&opts->ga); ++ free(opts); ++@@ -81,9 +83,9 @@ ++ if (!state) return; ++ free(state->rg_id); ++ if (state->output_file) sam_close(state->output_file); ++- bam_hdr_destroy(state->output_header); +++ sam_hdr_destroy(state->output_header); ++ if (state->input_file) sam_close(state->input_file); ++- bam_hdr_destroy(state->input_header); +++ sam_hdr_destroy(state->input_header); ++ free(state); ++ } ++ ++@@ -147,20 +149,6 @@ ++ return ns; ++ } ++ ++-// These are to be replaced by samtools header parser ++-// Extracts the first @RG line from a string. 
++-static char* get_rg_line(const char* text, size_t* last) ++-{ ++- const char* rg = text; ++- if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) { ++- if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) { ++- return NULL; ++- } ++- rg++;//skip initial \n ++- } ++- // duplicate the line for return ++- return dup_substring(rg, strchr(rg, '\n'), last); ++-} ++ ++ // Given a @RG line return the id ++ static char* get_rg_id(const char *line) ++@@ -172,44 +160,6 @@ ++ return dup_substring(id, strchr(id, '\t'), NULL); ++ } ++ ++-// Confirms the existance of an RG line with a given ID in a bam header ++-static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) ++-{ ++- assert( hdr != NULL && rgid != NULL ); ++- ++- const char *ptr = hdr->text; ++- bool found = false; ++- while (ptr != NULL && *ptr != '\0' && found == false ) { ++- size_t end = 0; ++- char* line = get_rg_line(ptr, &end); ++- if (line == NULL) break; // No more @RG ++- char* id; ++- if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) { ++- found = true; ++- } ++- free(id); ++- free(line); ++- ptr += end; ++- } ++- return found; ++-} ++- ++-static char* get_first_rgid( const bam_hdr_t *hdr ) ++-{ ++- assert( hdr != NULL ); ++- const char *ptr = hdr->text; ++- char* found = NULL; ++- while (ptr != NULL && *ptr != '\0' && found == NULL ) { ++- size_t end = 0; ++- char* line = get_rg_line(ptr, &end); ++- if ( line ) { ++- found = get_rg_id(line); ++- } else break; ++- free(line); ++- ptr += end; ++- } ++- return found; ++-} ++ ++ static void usage(FILE *fp) ++ { ++@@ -221,8 +171,9 @@ ++ " -o FILE Where to write output to [stdout]\n" ++ " -r STRING @RG line text\n" ++ " -R STRING ID of @RG line in existing header to use\n" +++ " --no-PG Do not add a PG line\n" ++ ); ++- sam_global_opt_help(fp, "..O..@"); +++ sam_global_opt_help(fp, "..O..@.."); ++ } ++ ++ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) ++@@ -242,6 +193,7 @@ ++ sam_global_args_init(&retval->ga); ++ static 
const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ kstring_t rg_line = {0,0,NULL}; ++@@ -280,6 +232,9 @@ ++ usage(stdout); ++ free(retval); ++ return true; +++ case 1: +++ retval->no_pg = 1; +++ break; ++ case '?': ++ usage(stderr); ++ free(retval); ++@@ -316,6 +271,7 @@ ++ cleanup_opts(retval); ++ return false; ++ } +++ free(retval->rg_line); ++ retval->rg_line = tmp; ++ } ++ retval->input_name = strdup(argv[optind+0]); ++@@ -375,7 +331,7 @@ ++ } ++ retval->input_header = sam_hdr_read(retval->input_file); ++ ++- retval->output_header = bam_hdr_dup(retval->input_header); +++ retval->output_header = sam_hdr_dup(retval->input_header); ++ if (opts->output_name) // File format auto-detection ++ sam_open_mode(output_mode + 1, opts->output_name, NULL); ++ retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); ++@@ -393,34 +349,39 @@ ++ if (opts->rg_line) { ++ // Append new RG line to header. ++ // Check does not already exist ++- if ( confirm_rg(retval->output_header, opts->rg_id) ) { +++ kstring_t hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { ++ fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); +++ free(hdr_line.s); ++ return false; ++ } ++- retval->rg_id = strdup(opts->rg_id); ++- size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; ++- char* new_header = malloc(new_len); ++- if (!new_header) { ++- fprintf(stderr, "[init] Out of memory whilst writing new header.\n"); +++ if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { +++ fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); +++ return false; +++ } +++ if (opts->mode == overwrite_all && +++ -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) { +++ fprintf(stderr, "[init] Error removing the old RG lines from the output header.\n"); ++ return false; ++ } ++- sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); ++- free(retval->output_header->text); ++- retval->output_header->text = new_header; ++- retval->output_header->l_text = (int)new_len - 1; +++ retval->rg_id = strdup(opts->rg_id); ++ } else { ++ if (opts->rg_id) { ++ // Confirm what has been supplied exists ++- if ( !confirm_rg(retval->output_header, opts->rg_id) ) { +++ kstring_t hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) { ++ fprintf(stderr, "RG ID supplied does not exist in header. 
Supply full @RG line with -r instead?\n"); ++ return false; ++ } ++ retval->rg_id = strdup(opts->rg_id); +++ free(hdr_line.s); ++ } else { ++- if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { +++ kstring_t rg_id = { 0, 0, NULL }; +++ if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { ++ fprintf(stderr, "No RG specified on command line or in existing header.\n"); ++ return false; ++ } +++ retval->rg_id = ks_release(&rg_id); ++ } ++ } ++ ++@@ -436,12 +397,24 @@ ++ return true; ++ } ++ ++-static bool readgroupise(state_t* state) +++static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list) ++ { +++ if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ return false; +++ ++ if (sam_hdr_write(state->output_file, state->output_header) != 0) { ++ print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__); ++ return false; ++ } +++ char *idx_fn = NULL; +++ if (opts->ga.write_index) { +++ if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header))) +++ return false; +++ } ++ ++ bam1_t* file_read = bam_init1(); ++ int ret; ++@@ -451,14 +424,25 @@ ++ if (sam_write1(state->output_file, state->output_header, file_read) < 0) { ++ print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__); ++ bam_destroy1(file_read); +++ free(idx_fn); ++ return false; ++ } ++ } ++ bam_destroy1(file_read); ++ if (ret != -1) { ++ print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__); +++ free(idx_fn); ++ return false; ++ } else { +++ +++ if (opts->ga.write_index) { +++ if (sam_idx_save(state->output_file) < 0) { +++ print_error_errno("addreplacerg", "[%s] Writing index failed", __func__); +++ free(idx_fn); +++ return false; +++ } +++ } +++ free(idx_fn); ++ return true; ++ } ++ } ++@@ -467,20 
+451,25 @@ ++ { ++ parsed_opts_t* opts = NULL; ++ state_t* state = NULL; +++ char *arg_list = stringify_argv(argc+1, argv-1); +++ if (!arg_list) +++ return EXIT_FAILURE; ++ ++ if (!parse_args(argc, argv, &opts)) goto error; ++- if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed ++- if (!opts || !init(opts, &state)) goto error; ++- ++- if (!readgroupise(state)) goto error; +++ if (opts) { // Not an error but user doesn't want us to proceed +++ if (!init(opts, &state) || !readgroupise(opts, state, arg_list)) +++ goto error; +++ } ++ ++ cleanup_state(state); ++ cleanup_opts(opts); +++ free(arg_list); ++ ++ return EXIT_SUCCESS; ++ error: ++ cleanup_state(state); ++ cleanup_opts(opts); +++ free(arg_list); ++ ++ return EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/bam_addrprg.c.pysam.c +++++ python-pysam/samtools/bam_addrprg.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_addrprg.c -- samtools command to add or replace readgroups. ++ ++- Copyright (c) 2013, 2015, 2016 Genome Research Limited. +++ Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited. ++ ++ Author: Martin O. 
Pollard ++ ++@@ -49,6 +49,7 @@ ++ char* output_name; ++ char* rg_id; ++ char* rg_line; +++ int no_pg; ++ rg_mode mode; ++ sam_global_args ga; ++ htsThreadPool p; ++@@ -60,9 +61,9 @@ ++ ++ struct state { ++ samFile* input_file; ++- bam_hdr_t* input_header; +++ sam_hdr_t* input_header; ++ samFile* output_file; ++- bam_hdr_t* output_header; +++ sam_hdr_t* output_header; ++ char* rg_id; ++ void (*mode_func)(const state_t*, bam1_t*); ++ }; ++@@ -73,6 +74,7 @@ ++ free(opts->rg_id); ++ free(opts->output_name); ++ free(opts->input_name); +++ free(opts->rg_line); ++ if (opts->p.pool) hts_tpool_destroy(opts->p.pool); ++ sam_global_args_free(&opts->ga); ++ free(opts); ++@@ -83,9 +85,9 @@ ++ if (!state) return; ++ free(state->rg_id); ++ if (state->output_file) sam_close(state->output_file); ++- bam_hdr_destroy(state->output_header); +++ sam_hdr_destroy(state->output_header); ++ if (state->input_file) sam_close(state->input_file); ++- bam_hdr_destroy(state->input_header); +++ sam_hdr_destroy(state->input_header); ++ free(state); ++ } ++ ++@@ -149,20 +151,6 @@ ++ return ns; ++ } ++ ++-// These are to be replaced by samtools header parser ++-// Extracts the first @RG line from a string. 
++-static char* get_rg_line(const char* text, size_t* last) ++-{ ++- const char* rg = text; ++- if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) { ++- if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) { ++- return NULL; ++- } ++- rg++;//skip initial \n ++- } ++- // duplicate the line for return ++- return dup_substring(rg, strchr(rg, '\n'), last); ++-} ++ ++ // Given a @RG line return the id ++ static char* get_rg_id(const char *line) ++@@ -174,44 +162,6 @@ ++ return dup_substring(id, strchr(id, '\t'), NULL); ++ } ++ ++-// Confirms the existance of an RG line with a given ID in a bam header ++-static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) ++-{ ++- assert( hdr != NULL && rgid != NULL ); ++- ++- const char *ptr = hdr->text; ++- bool found = false; ++- while (ptr != NULL && *ptr != '\0' && found == false ) { ++- size_t end = 0; ++- char* line = get_rg_line(ptr, &end); ++- if (line == NULL) break; // No more @RG ++- char* id; ++- if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) { ++- found = true; ++- } ++- free(id); ++- free(line); ++- ptr += end; ++- } ++- return found; ++-} ++- ++-static char* get_first_rgid( const bam_hdr_t *hdr ) ++-{ ++- assert( hdr != NULL ); ++- const char *ptr = hdr->text; ++- char* found = NULL; ++- while (ptr != NULL && *ptr != '\0' && found == NULL ) { ++- size_t end = 0; ++- char* line = get_rg_line(ptr, &end); ++- if ( line ) { ++- found = get_rg_id(line); ++- } else break; ++- free(line); ++- ptr += end; ++- } ++- return found; ++-} ++ ++ static void usage(FILE *fp) ++ { ++@@ -223,8 +173,9 @@ ++ " -o FILE Where to write output to [samtools_stdout]\n" ++ " -r STRING @RG line text\n" ++ " -R STRING ID of @RG line in existing header to use\n" +++ " --no-PG Do not add a PG line\n" ++ ); ++- sam_global_opt_help(fp, "..O..@"); +++ sam_global_opt_help(fp, "..O..@.."); ++ } ++ ++ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) ++@@ -244,6 +195,7 @@ ++ sam_global_args_init(&retval->ga); ++ 
static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ kstring_t rg_line = {0,0,NULL}; ++@@ -282,6 +234,9 @@ ++ usage(samtools_stdout); ++ free(retval); ++ return true; +++ case 1: +++ retval->no_pg = 1; +++ break; ++ case '?': ++ usage(samtools_stderr); ++ free(retval); ++@@ -318,6 +273,7 @@ ++ cleanup_opts(retval); ++ return false; ++ } +++ free(retval->rg_line); ++ retval->rg_line = tmp; ++ } ++ retval->input_name = strdup(argv[optind+0]); ++@@ -377,7 +333,7 @@ ++ } ++ retval->input_header = sam_hdr_read(retval->input_file); ++ ++- retval->output_header = bam_hdr_dup(retval->input_header); +++ retval->output_header = sam_hdr_dup(retval->input_header); ++ if (opts->output_name) // File format auto-detection ++ sam_open_mode(output_mode + 1, opts->output_name, NULL); ++ retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); ++@@ -395,34 +351,39 @@ ++ if (opts->rg_line) { ++ // Append new RG line to header. ++ // Check does not already exist ++- if ( confirm_rg(retval->output_header, opts->rg_id) ) { +++ kstring_t hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { ++ fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); +++ free(hdr_line.s); ++ return false; ++ } ++- retval->rg_id = strdup(opts->rg_id); ++- size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; ++- char* new_header = malloc(new_len); ++- if (!new_header) { ++- fprintf(samtools_stderr, "[init] Out of memory whilst writing new header.\n"); +++ if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { +++ fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); +++ return false; +++ } +++ if (opts->mode == overwrite_all && +++ -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) { +++ fprintf(samtools_stderr, "[init] Error removing the old RG lines from the output header.\n"); ++ return false; ++ } ++- sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); ++- free(retval->output_header->text); ++- retval->output_header->text = new_header; ++- retval->output_header->l_text = (int)new_len - 1; +++ retval->rg_id = strdup(opts->rg_id); ++ } else { ++ if (opts->rg_id) { ++ // Confirm what has been supplied exists ++- if ( !confirm_rg(retval->output_header, opts->rg_id) ) { +++ kstring_t hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) { ++ fprintf(samtools_stderr, "RG ID supplied does not exist in header. 
Supply full @RG line with -r instead?\n"); ++ return false; ++ } ++ retval->rg_id = strdup(opts->rg_id); +++ free(hdr_line.s); ++ } else { ++- if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { +++ kstring_t rg_id = { 0, 0, NULL }; +++ if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { ++ fprintf(samtools_stderr, "No RG specified on command line or in existing header.\n"); ++ return false; ++ } +++ retval->rg_id = ks_release(&rg_id); ++ } ++ } ++ ++@@ -438,12 +399,24 @@ ++ return true; ++ } ++ ++-static bool readgroupise(state_t* state) +++static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list) ++ { +++ if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ return false; +++ ++ if (sam_hdr_write(state->output_file, state->output_header) != 0) { ++ print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__); ++ return false; ++ } +++ char *idx_fn = NULL; +++ if (opts->ga.write_index) { +++ if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header))) +++ return false; +++ } ++ ++ bam1_t* file_read = bam_init1(); ++ int ret; ++@@ -453,14 +426,25 @@ ++ if (sam_write1(state->output_file, state->output_header, file_read) < 0) { ++ print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__); ++ bam_destroy1(file_read); +++ free(idx_fn); ++ return false; ++ } ++ } ++ bam_destroy1(file_read); ++ if (ret != -1) { ++ print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__); +++ free(idx_fn); ++ return false; ++ } else { +++ +++ if (opts->ga.write_index) { +++ if (sam_idx_save(state->output_file) < 0) { +++ print_error_errno("addreplacerg", "[%s] Writing index failed", __func__); +++ free(idx_fn); +++ return false; +++ } +++ } +++ free(idx_fn); ++ return true; ++ } ++ } ++@@ 
-469,20 +453,25 @@ ++ { ++ parsed_opts_t* opts = NULL; ++ state_t* state = NULL; +++ char *arg_list = stringify_argv(argc+1, argv-1); +++ if (!arg_list) +++ return EXIT_FAILURE; ++ ++ if (!parse_args(argc, argv, &opts)) goto error; ++- if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed ++- if (!opts || !init(opts, &state)) goto error; ++- ++- if (!readgroupise(state)) goto error; +++ if (opts) { // Not an error but user doesn't want us to proceed +++ if (!init(opts, &state) || !readgroupise(opts, state, arg_list)) +++ goto error; +++ } ++ ++ cleanup_state(state); ++ cleanup_opts(opts); +++ free(arg_list); ++ ++ return EXIT_SUCCESS; ++ error: ++ cleanup_state(state); ++ cleanup_opts(opts); +++ free(arg_list); ++ ++ return EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/bam_aux.c +++++ python-pysam/samtools/bam_aux.c ++@@ -1,6 +1,6 @@ ++ /* bam_aux.c -- remaining aux field handling. ++ ++- Copyright (C) 2008-2010, 2013 Genome Research Ltd. +++ Copyright (C) 2008-2010, 2013, 2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. ++ ++ Author: Heng Li ++@@ -61,21 +61,15 @@ ++ return 0; ++ } ++ +++// Only here due to libbam.a being used by some applications. ++ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) ++ { ++- const char *name_lim = hts_parse_reg(str, beg, end); ++- if (name_lim) { ++- char *name = malloc(name_lim - str + 1); ++- memcpy(name, str, name_lim - str); ++- name[name_lim - str] = '\0'; ++- *ref_id = bam_name2id(header, name); ++- free(name); ++- } ++- else { ++- // not parsable as a region, but possibly a sequence named "foo:a" ++- *ref_id = bam_name2id(header, str); ++- *beg = 0; *end = INT_MAX; ++- } ++- if (*ref_id == -1) return -1; ++- return *beg <= *end? 0 : -1; +++ hts_pos_t beg64, end64; +++ int r; +++ r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 
0 : -1; +++ if (beg64 > INT_MAX || end64 > INT_MAX) +++ return -1; +++ *beg = beg64; +++ *end = end64; +++ return r; ++ } ++--- python-pysam.orig/samtools/bam_aux.c.pysam.c +++++ python-pysam/samtools/bam_aux.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_aux.c -- remaining aux field handling. ++ ++- Copyright (C) 2008-2010, 2013 Genome Research Ltd. +++ Copyright (C) 2008-2010, 2013, 2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. ++ ++ Author: Heng Li ++@@ -63,21 +63,15 @@ ++ return 0; ++ } ++ +++// Only here due to libbam.a being used by some applications. ++ int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) ++ { ++- const char *name_lim = hts_parse_reg(str, beg, end); ++- if (name_lim) { ++- char *name = malloc(name_lim - str + 1); ++- memcpy(name, str, name_lim - str); ++- name[name_lim - str] = '\0'; ++- *ref_id = bam_name2id(header, name); ++- free(name); ++- } ++- else { ++- // not parsable as a region, but possibly a sequence named "foo:a" ++- *ref_id = bam_name2id(header, str); ++- *beg = 0; *end = INT_MAX; ++- } ++- if (*ref_id == -1) return -1; ++- return *beg <= *end? 0 : -1; +++ hts_pos_t beg64, end64; +++ int r; +++ r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 0 : -1; +++ if (beg64 > INT_MAX || end64 > INT_MAX) +++ return -1; +++ *beg = beg64; +++ *end = end64; +++ return r; ++ } ++--- python-pysam.orig/samtools/bam_cat.c +++++ python-pysam/samtools/bam_cat.c ++@@ -1,6 +1,6 @@ ++ /* bam_cat.c -- efficiently concatenates bam files. ++ ++- Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. +++ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. ++ Modified SAMtools work copyright (C) 2010 Illumina, Inc. 
++ ++ Permission is hereby granted, free of charge, to any person obtaining a copy ++@@ -45,162 +45,43 @@ ++ #include "htslib/bgzf.h" ++ #include "htslib/sam.h" ++ #include "htslib/cram.h" ++-#include "htslib/khash.h" +++#include "htslib/kstring.h" ++ #include "samtools.h" ++- ++-KHASH_MAP_INIT_STR(s2i, int) ++- ++-// Bi-directional lookup. ++-// We can go from name to ID or ID to name. ++-typedef struct khash_s2i { ++- khash_t(s2i) *h; ++- int n_id, a_id; ++- const char **id; // map Nth entry back to key ++- const char **line; ++-} khash_s2i; ++- ++-static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) { ++- // loosly based on khash_str2int_inc ++- khint_t k; ++- int n; ++- ++- if ( !hash ) return -1; ++- // inefficient, but works ++- char *my_str = strdup(str); ++- k = kh_put(s2i, hash->h, my_str, added); ++- if (*added == 0) { ++- free(my_str); ++- return kh_val(hash->h, k); ++- } ++- n = hash->n_id++; ++- kh_val(hash->h, k) = n; ++- if (hash->a_id <= n) { ++- const char **id; ++- hash->a_id = (n+1)*2; ++- if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id)))) ++- return -1; ++- hash->id = id; ++- if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line)))) ++- return -1; ++- hash->line = id; ++- } ++- hash->id[n] = my_str; // reverse map ++- if (line) ++- hash->line[n] = line; ++- ++- return n; ++-} ++- ++-khash_s2i *hash_s2i_create(void) { ++- khash_s2i *h = calloc(1, sizeof(*h)); ++- if (!h) ++- return NULL; ++- ++- h->h = kh_init(s2i); ++- if (!h->h) { ++- free(h); ++- return NULL; ++- } ++- return h; ++-} ++- ++-static void hash_s2i_free(khash_s2i *hash) { ++- // based on khash_str2int_destroy_free ++- khint_t k; ++- if (!hash) return; ++- if (hash->h) { ++- for (k = 0; k < kh_end(hash->h); ++k) ++- if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k)); ++- kh_destroy(s2i, hash->h); ++- } ++- if (hash->id) ++- free(hash->id); ++- if (hash->line) ++- free(hash->line); ++- ++- free(hash); ++-} ++- ++-static 
khash_s2i *hash_rg(const bam_hdr_t *h) { ++- khash_s2i *rg2id = hash_s2i_create(); ++- char *cp, *line; ++- int j, l; ++- ++- if (!h) ++- return rg2id; ++- ++- if (!rg2id) ++- return NULL; ++- ++- cp = h->text; ++- ++- for (l = 0; l+3 < h->l_text; l++) { ++- line = &cp[l]; ++- if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) { ++- while (l < h->l_text && cp[l] != '\n') ++- l++; ++- continue; ++- } ++- ++- // Found an @RG line; add to hash ++- while (cp[l] != '\n') { ++- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') ++- l++; ++- if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D') ++- break; ++- } ++- if (cp[l] == '\n') ++- continue; ++- l = (j = l+4); ++- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') ++- l++; ++- ++- // To do: save id and keep realloc as needed, as hash_s2i_inc strdups. ++- char *id = malloc(l-j+1); ++- strncpy(id, &cp[j], l-j); ++- id[l-j] = 0; ++- ++- int added; ++- hash_s2i_inc(rg2id, id, line, &added); ++- free(id); ++- ++- while (l < h->l_text && cp[l] != '\n') ++- l++; ++- } ++- ++- return rg2id; ++-} +++#include "sam_opts.h" ++ ++ /* ++ * Check the files are consistent and capable of being concatenated. ++- * Also fills out the rg2id read-group hash and the version numbers ++- * and produces a new bam_hdr_t structure with merged RG lines. ++- * Note it is only a simple merge, as we lack the niceties of a proper ++- * header API. +++ * Also fills out the version numbers and produces a new sam_hdr_t +++ * structure with merged RG lines. +++ * Note it is only a simple merge. ++ * ++ * Returns updated header on success; ++ * NULL on failure. 
++ */ ++-static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h, ++- khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) { +++static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h, +++ int *vers_maj_p, int *vers_min_p) { ++ int i, vers_maj = -1, vers_min = -1; ++- bam_hdr_t *new_h = NULL; +++ sam_hdr_t *new_h = NULL, *old_h = NULL; +++ samFile *in = NULL; +++ kstring_t ks = KS_INITIALIZE; ++ ++ if (h) { ++- new_h = bam_hdr_dup(h); ++- *rg2id = hash_rg(new_h); +++ new_h = sam_hdr_dup(h); +++ if (!new_h) { +++ fprintf(stderr, "[%s] ERROR: header duplication failed.\n", +++ __func__); +++ goto fail; +++ } ++ } ++ ++ for (i = 0; i < nfn; ++i) { ++- samFile *in; ++ cram_fd *in_c; ++- khint_t ki; ++- int new_rg = -1; +++ int ki; ++ ++ in = sam_open(fn[i], "rc"); ++ if (in == 0) { ++ print_error_errno("cat", "fail to open file '%s'", fn[i]); ++- return NULL; +++ goto fail; ++ } ++ in_c = in->fp.cram; ++ ++@@ -210,55 +91,81 @@ ++ (vers_min != -1 && vers_min != vmin)) { ++ fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n", ++ __func__); ++- return NULL; +++ goto fail; ++ } ++ vers_maj = vmaj; ++ vers_min = vmin; ++ ++- bam_hdr_t *old = sam_hdr_read(in); ++- khash_s2i *rg2id_in = hash_rg(old); +++ old_h = sam_hdr_read(in); +++ if (!old_h) { +++ fprintf(stderr, "[%s] ERROR: header reading for file '%s' filed.\n", +++ __func__, fn[i]); +++ goto fail; +++ } ++ ++ if (!new_h) { ++- new_h = bam_hdr_dup(old); ++- *rg2id = hash_rg(new_h); +++ new_h = sam_hdr_dup(old_h); +++ if (!new_h) { +++ fprintf(stderr, "[%s] ERROR: header duplication for file '%s' failed.\n", +++ __func__, fn[i]); +++ goto fail; +++ } +++ sam_hdr_destroy(old_h); +++ sam_close(in); +++ continue; ++ } ++ ++- // Add any existing @RG entries to our global @RG hash. 
++- for (ki = 0; ki < rg2id_in->n_id; ki++) { ++- int added; ++- ++- new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); ++- //fprintf(stderr, "RG %s: #%d -> #%d\n", ++- // rg2id_in->id[ki], ki, new_rg); ++- ++- if (added) { ++- // Also add to new_h ++- const char *line = rg2id_in->line[ki]; ++- const char *line_end = line; ++- while (*line && *line_end++ != '\n') ++- ; ++- new_h->l_text += line_end - line; ++- new_h->text = realloc(new_h->text, new_h->l_text+1); ++- strncat(&new_h->text[new_h->l_text - (line_end - line)], ++- line, line_end - line); +++ int old_count = sam_hdr_count_lines(old_h, "RG"); +++ for (ki = 0; ki < old_count; ki++) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); +++ if (old_name) { +++ int new_i = sam_hdr_line_index(new_h, "RG", old_name); +++ if (-1 == new_i) { // line does not exist in the new header +++ if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) || +++ !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) { +++ fprintf(stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n", +++ __func__, old_name, fn[i]); +++ goto fail; +++ } +++ ks_free(&ks); +++ } +++ } else { +++ fprintf(stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n", +++ __func__, ki, fn[i]); +++ goto fail; ++ } +++ } ++ ++- if (new_rg != ki && rg2id_in->n_id > 1) { ++- fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", ++- __func__); ++- return NULL; +++ if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { +++ for (ki = 0; ki < old_count; ki++) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); +++ const char *new_name = sam_hdr_line_name(new_h, "RG", ki); +++ if (!old_name || !new_name || strcmp(old_name, new_name)) { +++ fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", +++ __func__); +++ goto fail; +++ } ++ } ++ } ++ ++- hash_s2i_free(rg2id_in); ++- bam_hdr_destroy(old); +++ sam_hdr_destroy(old_h); ++ 
sam_close(in); ++ } ++ +++ ks_free(&ks); +++ ++ *vers_maj_p = vers_maj; ++ *vers_min_p = vers_min; ++ ++ return new_h; +++ +++fail: +++ ks_free(&ks); +++ if (old_h) sam_hdr_destroy(old_h); +++ if (new_h) sam_hdr_destroy(new_h); +++ if (in) sam_close(in); +++ +++ return NULL; ++ } ++ ++ ++@@ -289,22 +196,21 @@ ++ * huffman code. In this situation we can change the meta-data in the ++ * compression header to renumber an RG value.. ++ */ ++-int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) +++int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) ++ { ++ samFile *out; ++ cram_fd *out_c; ++ int i, vers_maj, vers_min; ++- khash_s2i *rg2id = NULL; ++- bam_hdr_t *new_h = NULL; +++ sam_hdr_t *new_h = NULL; ++ ++ /* Check consistent versioning and compatible headers */ ++- if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) +++ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min))) ++ return -1; ++ ++ /* Open the file with cram_vers */ ++ char vers[100]; ++ sprintf(vers, "%d.%d", vers_maj, vers_min); ++- out = sam_open(outcram, "wc"); +++ out = sam_open_format(outcram, "wc", &ga->out); ++ if (out == 0) { ++ print_error_errno("cat", "fail to open output file '%s'", outcram); ++ return -1; ++@@ -313,7 +219,13 @@ ++ cram_set_option(out_c, CRAM_OPT_VERSION, vers); ++ //fprintf(stderr, "Creating cram vers %s\n", vers); ++ ++- cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? +++ if (!no_pg && sam_hdr_add_pg(new_h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) +++ return -1; +++ ++ if (sam_hdr_write(out, new_h) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ return -1; ++@@ -323,7 +235,7 @@ ++ samFile *in; ++ cram_fd *in_c; ++ cram_container *c; ++- bam_hdr_t *old; +++ sam_hdr_t *old_h; ++ int new_rg = -1; ++ ++ in = sam_open(fn[i], "rc"); ++@@ -333,20 +245,29 @@ ++ } ++ in_c = in->fp.cram; ++ ++- old = sam_hdr_read(in); ++- khash_s2i *rg2id_in = hash_rg(old); +++ old_h = sam_hdr_read(in); +++ if (!old_h) { +++ print_error("cat", "fail to read the header of file '%s'", fn[i]); +++ return -1; +++ } ++ ++ // Compute RG mapping if suitable for changing. ++- if (rg2id_in->n_id == 1) { ++- int _; ++- new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); +++ if (sam_hdr_count_lines(old_h, "RG") == 1) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", 0); +++ if (old_name) { +++ new_rg = sam_hdr_line_index(new_h, "RG", old_name); +++ if (new_rg < 0) { +++ print_error("cat", "fail to find @RG line '%s' in the new header", old_name); +++ return -1; +++ } +++ } else { +++ print_error("cat", "fail to find @RG line in file '%s'", fn[i]); +++ return -1; +++ } ++ } else { ++ new_rg = 0; ++ } ++ ++- hash_s2i_free(rg2id_in); ++- ++- ++ // Copy contains and blocks within them ++ while ((c = cram_read_container(in_c))) { ++ cram_block *blk; ++@@ -400,13 +321,11 @@ ++ cram_free_container(c); ++ } ++ ++- bam_hdr_destroy(old); +++ sam_hdr_destroy(old_h); ++ sam_close(in); ++ } ++ sam_close(out); ++- ++- hash_s2i_free(rg2id); ++- bam_hdr_destroy(new_h); +++ sam_hdr_destroy(new_h); ++ ++ return 0; ++ } ++@@ -419,7 +338,7 @@ ++ ++ #define BGZF_EMPTY_BLOCK_SIZE 28 ++ ++-int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) +++int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) ++ { ++ BGZF *fp, *in = NULL; ++ uint8_t *buf = NULL; ++@@ -433,6 +352,13 @@ ++ return -1; ++ } ++ if (h) { +++ if (!no_pg && sam_hdr_add_pg(h, 
"samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (bam_hdr_write(fp, h) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ goto fail; ++@@ -445,7 +371,7 @@ ++ goto fail; ++ } ++ for(i = 0; i < nfn; ++i){ ++- bam_hdr_t *old; +++ sam_hdr_t *old; ++ int len,j; ++ ++ in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); ++@@ -462,6 +388,13 @@ ++ goto fail; ++ } ++ if (h == 0 && i == 0) { +++ if (!no_pg && sam_hdr_add_pg(old, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (bam_hdr_write(fp, old) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ goto fail; ++@@ -507,7 +440,7 @@ ++ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; ++ } ++ } ++- bam_hdr_destroy(old); +++ sam_hdr_destroy(old); ++ bgzf_close(in); ++ in = NULL; ++ } ++@@ -530,14 +463,25 @@ ++ ++ int main_cat(int argc, char *argv[]) ++ { ++- bam_hdr_t *h = 0; +++ sam_hdr_t *h = 0; ++ char *outfn = 0; ++ char **infns = NULL; // files to concatenate ++ int infns_size = 0; ++- int c, ret = 0; +++ int c, ret = 0, no_pg = 0; ++ samFile *in; +++ sam_global_args ga; +++ +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'), +++ {"no-PG", no_argument, NULL, 1}, +++ { NULL, 0, NULL, 0 } +++ }; +++ +++ char *arg_list = NULL; ++ ++- while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { +++ sam_global_args_init(&ga); +++ +++ while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'h': { ++ samFile *fph = sam_open(optarg, "r"); ++@@ -573,9 +517,19 @@ ++ } ++ break; ++ } +++ case 1: +++ no_pg = 1; +++ break; +++ default: +++ if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ } ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("cat", "failed to create arg_list"); 
+++ return 1; +++ } +++ ++ // Append files specified in argv to the list. ++ int nargv_fns = argc - optind; ++ if (nargv_fns > 0) { ++@@ -592,6 +546,8 @@ ++ fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); ++ fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); ++ fprintf(stderr, " -o FILE output BAM/CRAM\n"); +++ fprintf(stderr, " --no-PG do not add a PG line\n"); +++ sam_global_opt_help(stderr, "--..-@-."); ++ return 1; ++ } ++ ++@@ -604,13 +560,13 @@ ++ switch (hts_get_format(in)->format) { ++ case bam: ++ sam_close(in); ++- if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) +++ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) ++ ret = 1; ++ break; ++ ++ case cram: ++ sam_close(in); ++- if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) +++ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) ++ ret = 1; ++ break; ++ ++@@ -629,9 +585,9 @@ ++ ++ free(outfn); ++ free(infns); ++- +++ free(arg_list); ++ if (h) ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ ++ return ret; ++ } ++--- python-pysam.orig/samtools/bam_cat.c.pysam.c +++++ python-pysam/samtools/bam_cat.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_cat.c -- efficiently concatenates bam files. ++ ++- Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. +++ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. ++ Modified SAMtools work copyright (C) 2010 Illumina, Inc. ++ ++ Permission is hereby granted, free of charge, to any person obtaining a copy ++@@ -47,162 +47,43 @@ ++ #include "htslib/bgzf.h" ++ #include "htslib/sam.h" ++ #include "htslib/cram.h" ++-#include "htslib/khash.h" +++#include "htslib/kstring.h" ++ #include "samtools.h" ++- ++-KHASH_MAP_INIT_STR(s2i, int) ++- ++-// Bi-directional lookup. ++-// We can go from name to ID or ID to name. 
++-typedef struct khash_s2i { ++- khash_t(s2i) *h; ++- int n_id, a_id; ++- const char **id; // map Nth entry back to key ++- const char **line; ++-} khash_s2i; ++- ++-static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) { ++- // loosly based on khash_str2int_inc ++- khint_t k; ++- int n; ++- ++- if ( !hash ) return -1; ++- // inefficient, but works ++- char *my_str = strdup(str); ++- k = kh_put(s2i, hash->h, my_str, added); ++- if (*added == 0) { ++- free(my_str); ++- return kh_val(hash->h, k); ++- } ++- n = hash->n_id++; ++- kh_val(hash->h, k) = n; ++- if (hash->a_id <= n) { ++- const char **id; ++- hash->a_id = (n+1)*2; ++- if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id)))) ++- return -1; ++- hash->id = id; ++- if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line)))) ++- return -1; ++- hash->line = id; ++- } ++- hash->id[n] = my_str; // reverse map ++- if (line) ++- hash->line[n] = line; ++- ++- return n; ++-} ++- ++-khash_s2i *hash_s2i_create(void) { ++- khash_s2i *h = calloc(1, sizeof(*h)); ++- if (!h) ++- return NULL; ++- ++- h->h = kh_init(s2i); ++- if (!h->h) { ++- free(h); ++- return NULL; ++- } ++- return h; ++-} ++- ++-static void hash_s2i_free(khash_s2i *hash) { ++- // based on khash_str2int_destroy_free ++- khint_t k; ++- if (!hash) return; ++- if (hash->h) { ++- for (k = 0; k < kh_end(hash->h); ++k) ++- if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k)); ++- kh_destroy(s2i, hash->h); ++- } ++- if (hash->id) ++- free(hash->id); ++- if (hash->line) ++- free(hash->line); ++- ++- free(hash); ++-} ++- ++-static khash_s2i *hash_rg(const bam_hdr_t *h) { ++- khash_s2i *rg2id = hash_s2i_create(); ++- char *cp, *line; ++- int j, l; ++- ++- if (!h) ++- return rg2id; ++- ++- if (!rg2id) ++- return NULL; ++- ++- cp = h->text; ++- ++- for (l = 0; l+3 < h->l_text; l++) { ++- line = &cp[l]; ++- if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) { ++- while (l < h->l_text && cp[l] != '\n') ++- l++; ++- 
continue; ++- } ++- ++- // Found an @RG line; add to hash ++- while (cp[l] != '\n') { ++- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') ++- l++; ++- if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D') ++- break; ++- } ++- if (cp[l] == '\n') ++- continue; ++- l = (j = l+4); ++- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') ++- l++; ++- ++- // To do: save id and keep realloc as needed, as hash_s2i_inc strdups. ++- char *id = malloc(l-j+1); ++- strncpy(id, &cp[j], l-j); ++- id[l-j] = 0; ++- ++- int added; ++- hash_s2i_inc(rg2id, id, line, &added); ++- free(id); ++- ++- while (l < h->l_text && cp[l] != '\n') ++- l++; ++- } ++- ++- return rg2id; ++-} +++#include "sam_opts.h" ++ ++ /* ++ * Check the files are consistent and capable of being concatenated. ++- * Also fills out the rg2id read-group hash and the version numbers ++- * and produces a new bam_hdr_t structure with merged RG lines. ++- * Note it is only a simple merge, as we lack the niceties of a proper ++- * header API. +++ * Also fills out the version numbers and produces a new sam_hdr_t +++ * structure with merged RG lines. +++ * Note it is only a simple merge. ++ * ++ * Returns updated header on success; ++ * NULL on failure. 
++ */ ++-static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h, ++- khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) { +++static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h, +++ int *vers_maj_p, int *vers_min_p) { ++ int i, vers_maj = -1, vers_min = -1; ++- bam_hdr_t *new_h = NULL; +++ sam_hdr_t *new_h = NULL, *old_h = NULL; +++ samFile *in = NULL; +++ kstring_t ks = KS_INITIALIZE; ++ ++ if (h) { ++- new_h = bam_hdr_dup(h); ++- *rg2id = hash_rg(new_h); +++ new_h = sam_hdr_dup(h); +++ if (!new_h) { +++ fprintf(samtools_stderr, "[%s] ERROR: header duplication failed.\n", +++ __func__); +++ goto fail; +++ } ++ } ++ ++ for (i = 0; i < nfn; ++i) { ++- samFile *in; ++ cram_fd *in_c; ++- khint_t ki; ++- int new_rg = -1; +++ int ki; ++ ++ in = sam_open(fn[i], "rc"); ++ if (in == 0) { ++ print_error_errno("cat", "fail to open file '%s'", fn[i]); ++- return NULL; +++ goto fail; ++ } ++ in_c = in->fp.cram; ++ ++@@ -212,55 +93,81 @@ ++ (vers_min != -1 && vers_min != vmin)) { ++ fprintf(samtools_stderr, "[%s] ERROR: input files have differing version numbers.\n", ++ __func__); ++- return NULL; +++ goto fail; ++ } ++ vers_maj = vmaj; ++ vers_min = vmin; ++ ++- bam_hdr_t *old = sam_hdr_read(in); ++- khash_s2i *rg2id_in = hash_rg(old); +++ old_h = sam_hdr_read(in); +++ if (!old_h) { +++ fprintf(samtools_stderr, "[%s] ERROR: header reading for file '%s' filed.\n", +++ __func__, fn[i]); +++ goto fail; +++ } ++ ++ if (!new_h) { ++- new_h = bam_hdr_dup(old); ++- *rg2id = hash_rg(new_h); +++ new_h = sam_hdr_dup(old_h); +++ if (!new_h) { +++ fprintf(samtools_stderr, "[%s] ERROR: header duplication for file '%s' failed.\n", +++ __func__, fn[i]); +++ goto fail; +++ } +++ sam_hdr_destroy(old_h); +++ sam_close(in); +++ continue; ++ } ++ ++- // Add any existing @RG entries to our global @RG hash. 
++- for (ki = 0; ki < rg2id_in->n_id; ki++) { ++- int added; ++- ++- new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); ++- //fprintf(samtools_stderr, "RG %s: #%d -> #%d\n", ++- // rg2id_in->id[ki], ki, new_rg); ++- ++- if (added) { ++- // Also add to new_h ++- const char *line = rg2id_in->line[ki]; ++- const char *line_end = line; ++- while (*line && *line_end++ != '\n') ++- ; ++- new_h->l_text += line_end - line; ++- new_h->text = realloc(new_h->text, new_h->l_text+1); ++- strncat(&new_h->text[new_h->l_text - (line_end - line)], ++- line, line_end - line); +++ int old_count = sam_hdr_count_lines(old_h, "RG"); +++ for (ki = 0; ki < old_count; ki++) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); +++ if (old_name) { +++ int new_i = sam_hdr_line_index(new_h, "RG", old_name); +++ if (-1 == new_i) { // line does not exist in the new header +++ if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) || +++ !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) { +++ fprintf(samtools_stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n", +++ __func__, old_name, fn[i]); +++ goto fail; +++ } +++ ks_free(&ks); +++ } +++ } else { +++ fprintf(samtools_stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n", +++ __func__, ki, fn[i]); +++ goto fail; ++ } +++ } ++ ++- if (new_rg != ki && rg2id_in->n_id > 1) { ++- fprintf(samtools_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", ++- __func__); ++- return NULL; +++ if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { +++ for (ki = 0; ki < old_count; ki++) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); +++ const char *new_name = sam_hdr_line_name(new_h, "RG", ki); +++ if (!old_name || !new_name || strcmp(old_name, new_name)) { +++ fprintf(samtools_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", +++ __func__); +++ goto fail; +++ } ++ } ++ } ++ ++- hash_s2i_free(rg2id_in); ++- 
bam_hdr_destroy(old); +++ sam_hdr_destroy(old_h); ++ sam_close(in); ++ } ++ +++ ks_free(&ks); +++ ++ *vers_maj_p = vers_maj; ++ *vers_min_p = vers_min; ++ ++ return new_h; +++ +++fail: +++ ks_free(&ks); +++ if (old_h) sam_hdr_destroy(old_h); +++ if (new_h) sam_hdr_destroy(new_h); +++ if (in) sam_close(in); +++ +++ return NULL; ++ } ++ ++ ++@@ -291,22 +198,21 @@ ++ * huffman code. In this situation we can change the meta-data in the ++ * compression header to renumber an RG value.. ++ */ ++-int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) +++int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) ++ { ++ samFile *out; ++ cram_fd *out_c; ++ int i, vers_maj, vers_min; ++- khash_s2i *rg2id = NULL; ++- bam_hdr_t *new_h = NULL; +++ sam_hdr_t *new_h = NULL; ++ ++ /* Check consistent versioning and compatible headers */ ++- if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) +++ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min))) ++ return -1; ++ ++ /* Open the file with cram_vers */ ++ char vers[100]; ++ sprintf(vers, "%d.%d", vers_maj, vers_min); ++- out = sam_open(outcram, "wc"); +++ out = sam_open_format(outcram, "wc", &ga->out); ++ if (out == 0) { ++ print_error_errno("cat", "fail to open output file '%s'", outcram); ++ return -1; ++@@ -315,7 +221,13 @@ ++ cram_set_option(out_c, CRAM_OPT_VERSION, vers); ++ //fprintf(samtools_stderr, "Creating cram vers %s\n", vers); ++ ++- cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? +++ if (!no_pg && sam_hdr_add_pg(new_h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) +++ return -1; +++ ++ if (sam_hdr_write(out, new_h) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ return -1; ++@@ -325,7 +237,7 @@ ++ samFile *in; ++ cram_fd *in_c; ++ cram_container *c; ++- bam_hdr_t *old; +++ sam_hdr_t *old_h; ++ int new_rg = -1; ++ ++ in = sam_open(fn[i], "rc"); ++@@ -335,20 +247,29 @@ ++ } ++ in_c = in->fp.cram; ++ ++- old = sam_hdr_read(in); ++- khash_s2i *rg2id_in = hash_rg(old); +++ old_h = sam_hdr_read(in); +++ if (!old_h) { +++ print_error("cat", "fail to read the header of file '%s'", fn[i]); +++ return -1; +++ } ++ ++ // Compute RG mapping if suitable for changing. ++- if (rg2id_in->n_id == 1) { ++- int _; ++- new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); +++ if (sam_hdr_count_lines(old_h, "RG") == 1) { +++ const char *old_name = sam_hdr_line_name(old_h, "RG", 0); +++ if (old_name) { +++ new_rg = sam_hdr_line_index(new_h, "RG", old_name); +++ if (new_rg < 0) { +++ print_error("cat", "fail to find @RG line '%s' in the new header", old_name); +++ return -1; +++ } +++ } else { +++ print_error("cat", "fail to find @RG line in file '%s'", fn[i]); +++ return -1; +++ } ++ } else { ++ new_rg = 0; ++ } ++ ++- hash_s2i_free(rg2id_in); ++- ++- ++ // Copy contains and blocks within them ++ while ((c = cram_read_container(in_c))) { ++ cram_block *blk; ++@@ -402,13 +323,11 @@ ++ cram_free_container(c); ++ } ++ ++- bam_hdr_destroy(old); +++ sam_hdr_destroy(old_h); ++ sam_close(in); ++ } ++ sam_close(out); ++- ++- hash_s2i_free(rg2id); ++- bam_hdr_destroy(new_h); +++ sam_hdr_destroy(new_h); ++ ++ return 0; ++ } ++@@ -421,7 +340,7 @@ ++ ++ #define BGZF_EMPTY_BLOCK_SIZE 28 ++ ++-int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) +++int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) ++ { ++ BGZF *fp, *in = NULL; ++ uint8_t *buf = NULL; ++@@ -435,6 +354,13 @@ ++ return -1; ++ } ++ if (h) { +++ if (!no_pg && sam_hdr_add_pg(h, 
"samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (bam_hdr_write(fp, h) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ goto fail; ++@@ -447,7 +373,7 @@ ++ goto fail; ++ } ++ for(i = 0; i < nfn; ++i){ ++- bam_hdr_t *old; +++ sam_hdr_t *old; ++ int len,j; ++ ++ in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); ++@@ -464,6 +390,13 @@ ++ goto fail; ++ } ++ if (h == 0 && i == 0) { +++ if (!no_pg && sam_hdr_add_pg(old, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (bam_hdr_write(fp, old) < 0) { ++ print_error_errno("cat", "Couldn't write header"); ++ goto fail; ++@@ -509,7 +442,7 @@ ++ if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; ++ } ++ } ++- bam_hdr_destroy(old); +++ sam_hdr_destroy(old); ++ bgzf_close(in); ++ in = NULL; ++ } ++@@ -532,14 +465,25 @@ ++ ++ int main_cat(int argc, char *argv[]) ++ { ++- bam_hdr_t *h = 0; +++ sam_hdr_t *h = 0; ++ char *outfn = 0; ++ char **infns = NULL; // files to concatenate ++ int infns_size = 0; ++- int c, ret = 0; +++ int c, ret = 0, no_pg = 0; ++ samFile *in; +++ sam_global_args ga; +++ +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'), +++ {"no-PG", no_argument, NULL, 1}, +++ { NULL, 0, NULL, 0 } +++ }; +++ +++ char *arg_list = NULL; ++ ++- while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { +++ sam_global_args_init(&ga); +++ +++ while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'h': { ++ samFile *fph = sam_open(optarg, "r"); ++@@ -575,9 +519,19 @@ ++ } ++ break; ++ } +++ case 1: +++ no_pg = 1; +++ break; +++ default: +++ if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ } ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("cat", "failed to create arg_list"); 
+++ return 1; +++ } +++ ++ // Append files specified in argv to the list. ++ int nargv_fns = argc - optind; ++ if (nargv_fns > 0) { ++@@ -594,6 +548,8 @@ ++ fprintf(samtools_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); ++ fprintf(samtools_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); ++ fprintf(samtools_stderr, " -o FILE output BAM/CRAM\n"); +++ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); +++ sam_global_opt_help(samtools_stderr, "--..-@-."); ++ return 1; ++ } ++ ++@@ -606,13 +562,13 @@ ++ switch (hts_get_format(in)->format) { ++ case bam: ++ sam_close(in); ++- if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) +++ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) ++ ret = 1; ++ break; ++ ++ case cram: ++ sam_close(in); ++- if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) +++ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) ++ ret = 1; ++ break; ++ ++@@ -631,9 +587,9 @@ ++ ++ free(outfn); ++ free(infns); ++- +++ free(arg_list); ++ if (h) ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ ++ return ret; ++ } ++--- /dev/null +++++ python-pysam/samtools/bam_fastq.c ++@@ -0,0 +1,1037 @@ +++/* bam_fastq.c -- FASTA and FASTQ file generation +++ +++ Copyright (C) 2009-2017, 2019 Genome Research Ltd. +++ Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
+++ +++ Author: Heng Li +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notices and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++#include "htslib/sam.h" +++#include "htslib/klist.h" +++#include "htslib/kstring.h" +++#include "htslib/bgzf.h" +++#include "htslib/thread_pool.h" +++#include "samtools.h" +++#include "sam_opts.h" +++ +++#define taglist_free(p) +++KLIST_INIT(ktaglist, char*, taglist_free) +++ +++#define DEFAULT_BARCODE_TAG "BC" +++#define DEFAULT_QUALITY_TAG "QT" +++#define INDEX_SEPARATOR "+" +++ +++int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; +++static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; +++ +++static void bam2fq_usage(FILE *to, const char *command) +++{ +++ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; +++ fprintf(to, +++"Usage: samtools %s [options...] 
\n", command); +++ fprintf(to, +++"\n" +++"Description:\n" +++"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" +++"\n" +++"Options:\n" +++" -0 FILE write reads designated READ_OTHER to FILE\n" +++" -1 FILE write reads designated READ1 to FILE\n" +++" -2 FILE write reads designated READ2 to FILE\n" +++" -o FILE write reads designated READ1 or READ2 to FILE\n" +++" note: if a singleton file is specified with -s, only\n" +++" paired reads will be written to the -1 and -2 files.\n" +++" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +++" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +++" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +++" -n don't append /1 and /2 to the read name\n" +++" -N always append /1 and /2 to the read name\n"); +++ if (fq) fprintf(to, +++" -O output quality in the OQ tag if present\n"); +++ fprintf(to, +++" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" +++" -t copy RG, BC and QT tags to the %s header line\n", +++ fq ? "FASTQ" : "FASTA"); +++ fprintf(to, +++" -T TAGLIST copy arbitrary tags to the %s header line\n", +++ fq ? 
"FASTQ" : "FASTA"); +++ if (fq) fprintf(to, +++" -v INT default quality score if not given in file [1]\n" +++" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" +++" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" +++" --i1 FILE write first index reads to FILE\n" +++" --i2 FILE write second index reads to FILE\n" +++" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" +++" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" +++" --index-format STR How to parse barcode and quality tags\n\n"); +++ sam_global_opt_help(to, "-.--.@-."); +++ fprintf(to, +++"\n" +++"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" +++"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" +++"\n" +++"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" +++"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" +++"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" +++"or both unset.\n" +++"Run 'samtools flags' for more information on flag codes and meanings.\n"); +++ fprintf(to, +++"\n" +++"The index-format string describes how to parse the barcode and quality tags, for example:\n" +++" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" +++" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" +++"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" +++"'read until the separator or end of tag', for example:\n" +++" n*i* ignore the left part of the tag until the separator, then use the second part\n" +++" of the tag as index 1\n"); +++ fprintf(to, +++"\n" +++"Examples:\n" +++" To get just the paired reads in separate files, use:\n" +++" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" +++"\n To get all 
non-supplementary/secondary reads in a single file, redirect the output:\n" +++" samtools %s in.bam > all_reads.%s\n", +++ command, fq ? "fq" : "fa", fq ? "fq" : "fa", +++ command, fq ? "fq" : "fa"); +++} +++ +++typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; +++typedef enum { FASTA, FASTQ } fastfile; +++typedef struct bam2fq_opts { +++ char *fnse; +++ char *fnr[3]; +++ char *fn_input; // pointer to input filename in argv do not free +++ bool has12, has12always, use_oq, copy_tags, illumina_tag; +++ int flag_on, flag_off, flag_alloff; +++ sam_global_args ga; +++ fastfile filetype; +++ int def_qual; +++ char *barcode_tag; +++ char *quality_tag; +++ char *index_file[2]; +++ char *index_format; +++ char *extra_tags; +++ char compression_level; +++} bam2fq_opts_t; +++ +++typedef struct bam2fq_state { +++ samFile *fp; +++ BGZF *fpse; +++ BGZF *fpr[3]; +++ BGZF *fpi[2]; +++ BGZF *hstdout; +++ sam_hdr_t *h; +++ bool has12, use_oq, copy_tags, illumina_tag; +++ int flag_on, flag_off, flag_alloff; +++ fastfile filetype; +++ int def_qual; +++ klist_t(ktaglist) *taglist; +++ char *index_sequence; +++ char compression_level; +++ htsThreadPool p; +++} bam2fq_state_t; +++ +++/* +++ * Get and decode the read from a BAM record. +++ * +++ * TODO: htslib really needs an interface for this. Consider this or perhaps +++ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str +++ * functions as string formatted equivalents to bam_get_{seq,qual}? +++ */ +++ +++/* +++ * Reverse a string in place. +++ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
+++ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik +++ */ +++static char *reverse(char *str) +++{ +++ int i = strlen(str)-1,j=0; +++ char ch; +++ while (i>j) { +++ ch = str[i]; +++ str[i]= str[j]; +++ str[j] = ch; +++ i--; +++ j++; +++ } +++ return str; +++} +++ +++/* return the read, reverse complemented if necessary */ +++static char *get_read(const bam1_t *rec) +++{ +++ int len = rec->core.l_qseq + 1; +++ char *read = calloc(1, len); +++ char *seq = (char *)bam_get_seq(rec); +++ int n; +++ +++ if (!read) return NULL; +++ +++ for (n=0; n < rec->core.l_qseq; n++) { +++ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; +++ else read[n] = seq_nt16_str[bam_seqi(seq,n)]; +++ } +++ if (rec->core.flag & BAM_FREVERSE) reverse(read); +++ return read; +++} +++ +++/* +++ * get and decode the quality from a BAM record +++ */ +++static int get_quality(const bam1_t *rec, char **qual_out) +++{ +++ char *quality = calloc(1, rec->core.l_qseq + 1); +++ char *q = (char *)bam_get_qual(rec); +++ int n; +++ +++ if (!quality) return -1; +++ +++ if (*q == '\xff') { +++ free(quality); +++ *qual_out = NULL; +++ return 0; +++ } +++ +++ for (n=0; n < rec->core.l_qseq; n++) { +++ quality[n] = q[n]+33; +++ } +++ if (rec->core.flag & BAM_FREVERSE) reverse(quality); +++ *qual_out = quality; +++ return 0; +++} +++ +++// +++// End of htslib complaints +++// +++ +++ +++static readpart which_readpart(const bam1_t *b) +++{ +++ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { +++ return READ_1; +++ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { +++ return READ_2; +++ } else { +++ return READ_UNKNOWN; +++ } +++} +++ +++/* +++ * parse the length part from the index-format string +++ */ +++static int getLength(char **s) +++{ +++ int n = 0; +++ while (**s) { +++ if (**s == '*') { n=-1; (*s)++; break; } +++ if ( !isdigit(**s)) break; +++ n = n*10 + ((**s)-'0'); +++ (*s)++; +++ } +++ return n; 
+++} +++ +++static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) +++{ +++ uint8_t *s = bam_aux_get(rec, tag); +++ if (s) { +++ char aux_type = *s; +++ switch (aux_type) { +++ case 'C': +++ case 'S': aux_type = 'I'; break; +++ case 'c': +++ case 's': aux_type = 'i'; break; +++ case 'd': aux_type = 'f'; break; +++ } +++ +++ // Ensure space. Need 6 chars + length of tag. Max length of +++ // i is 16, A is 21, B currently 26, Z is unknown, so +++ // have to check that one later. +++ if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; +++ +++ kputc('\t', linebuf); +++ kputsn(tag, 2, linebuf); +++ kputc(':', linebuf); +++ kputc(aux_type=='I'? 'i': aux_type, linebuf); +++ kputc(':', linebuf); +++ switch (aux_type) { +++ case 'H': +++ case 'Z': +++ if (kputs(bam_aux2Z(s), linebuf) < 0) return false; +++ break; +++ case 'i': kputw(bam_aux2i(s), linebuf); break; +++ case 'I': kputuw(bam_aux2i(s), linebuf); break; +++ case 'A': kputc(bam_aux2A(s), linebuf); break; +++ case 'f': kputd(bam_aux2f(s), linebuf); break; +++ case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; +++ default: kputs("*** Unknown aux type ***", linebuf); return false; +++ } +++ } +++ return true; +++} +++ +++static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) +++{ +++ if (!index_sequence) return 0; +++ +++ kstring_t new = {0,0,NULL}; +++ if (linebuf->s) { +++ char *s = strchr(linebuf->s, '\n'); +++ if (s) { +++ if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) +++ return -1; +++ *s = 0; +++ kputs(linebuf->s, &new); +++ kputc(' ', &new); +++ readpart readpart = which_readpart(rec); +++ if (readpart == READ_1) kputc('1', &new); +++ else if (readpart == READ_2) kputc('2', &new); +++ else kputc('0', &new); +++ +++ kputc(':', &new); +++ if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); +++ else kputc('N', &new); +++ +++ kputs(":0:", &new); +++ kputs(index_sequence, &new); +++ kputc('\n', 
&new); +++ kputs(s+1, &new); +++ free(ks_release(linebuf)); +++ linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; +++ } +++ } +++ return 0; +++} +++ +++static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) +++{ +++ int i; +++ +++ linebuf->l = 0; +++ // Write read name +++ if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; +++ if (kputs(bam_get_qname(rec), linebuf) < 0) return false; +++ // Add the /1 /2 if requested +++ if (state->has12) { +++ readpart readpart = which_readpart(rec); +++ if (readpart == READ_1) { +++ if (kputs("/1", linebuf) < 0) return false; +++ } else if (readpart == READ_2) { +++ if (kputs("/2", linebuf) < 0) return false; +++ } +++ } +++ if (state->copy_tags) { +++ for (i = 0; copied_tags[i]; ++i) { +++ if (!copy_tag(copied_tags[i], rec, linebuf)) { +++ fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); +++ return false; +++ } +++ } +++ } +++ +++ if (state->taglist->size) { +++ kliter_t(ktaglist) *p; +++ for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { +++ if (!copy_tag(kl_val(p), rec, linebuf)) { +++ fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); +++ return false; +++ } +++ } +++ } +++ +++ if (kputc('\n', linebuf) < 0) return false; +++ if (kputs(seq, linebuf) < 0) return false; +++ if (kputc('\n', linebuf) < 0) return false; +++ +++ if (state->filetype == FASTQ) { +++ // Write quality +++ if (kputs("+\n", linebuf) < 0) return false; +++ if (qual && *qual) { +++ if (kputs(qual, linebuf) < 0) return false; +++ } else { +++ int len = strlen(seq); +++ if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; +++ for (i = 0; i < len; ++i) { +++ kputc(33 + state->def_qual, linebuf); +++ } +++ } +++ if (kputc('\n', linebuf) < 0) return false; +++ } +++ return true; +++} +++ +++/* +++ * Create FASTQ lines from the barcode tag using the index-format +++ */ +++static bool 
tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) +++{ +++ uint8_t *p; +++ char *ifmt = opts->index_format; +++ char *tag = NULL; +++ char *qual = NULL; +++ char *sub_tag = NULL; +++ char *sub_qual = NULL; +++ size_t tag_len; +++ int file_number = 0; +++ kstring_t linebuf = { 0, 0, NULL }; // Buffer +++ +++ if (!ifmt) return true; +++ +++ // read barcode tag +++ p = bam_aux_get(rec,opts->barcode_tag); +++ if (p) tag = bam_aux2Z(p); +++ +++ if (!tag) return true; // there is no tag +++ +++ tag_len = strlen(tag); +++ sub_tag = calloc(1, tag_len + 1); +++ if (!sub_tag) goto fail; +++ sub_qual = calloc(1, tag_len + 1); +++ if (!sub_qual) goto fail; +++ +++ // read quality tag +++ p = bam_aux_get(rec, opts->quality_tag); +++ if (p) qual = bam_aux2Z(p); +++ +++ // Parse the index-format string +++ while (*ifmt) { +++ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly +++ char action = *ifmt; // should be 'i' or 'n' +++ ifmt++; // skip over action +++ int index_len = getLength(&ifmt); +++ int n = 0; +++ +++ if (index_len < 0) { +++ // read until separator +++ while (isalpha(*tag)) { +++ sub_tag[n] = *tag++; +++ if (qual) sub_qual[n] = *qual++; +++ n++; +++ } +++ if (*tag) { // skip separator +++ tag++; +++ if (qual) qual++; +++ } +++ } else { +++ // read index_len characters +++ while (index_len-- && *tag) { +++ sub_tag[n] = *tag++; +++ if (qual) sub_qual[n] = *qual++; +++ n++; +++ } +++ } +++ sub_tag[n] = '\0'; +++ sub_qual[n] = '\0'; +++ +++ if (action=='i' && *sub_tag) { +++ if (state->index_sequence) { +++ char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); +++ if (!new_index_sequence) goto fail; +++ state->index_sequence = new_index_sequence; +++ strcat(state->index_sequence, INDEX_SEPARATOR); +++ strcat(state->index_sequence, sub_tag); +++ } else { +++ state->index_sequence = strdup(sub_tag); // we're going to need this later... 
+++ } +++ if (!state->index_sequence) goto fail; +++ if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; +++ if (state->illumina_tag) { +++ if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { +++ goto fail; +++ } +++ } +++ if (state->fpi[file_number]) { +++ if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) +++ goto fail; +++ } +++ } +++ +++ } +++ +++ free(sub_qual); free(sub_tag); +++ free(linebuf.s); +++ return true; +++ +++ fail: +++ perror(__func__); +++ free(sub_qual); free(sub_tag); +++ free(linebuf.s); +++ return false; +++} +++ +++// Transform a bam1_t record into a string with the FASTQ representation of it +++// @returns false for error, true for success +++static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +++{ +++ int32_t qlen = b->core.l_qseq; +++ assert(qlen >= 0); +++ const uint8_t *oq = NULL; +++ char *qual = NULL; +++ +++ char *seq = get_read(b); +++ if (!seq) return false; +++ +++ if (state->use_oq) oq = bam_aux_get(b, "OQ"); +++ if (oq && *oq=='Z') { +++ qual = strdup(bam_aux2Z(oq)); +++ if (!qual) goto fail; +++ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented +++ reverse(qual); +++ } +++ } else { +++ if (get_quality(b, &qual) < 0) goto fail; +++ } +++ +++ if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; +++ +++ free(qual); +++ free(seq); +++ return true; +++ +++ fail: +++ free(seq); +++ free(qual); +++ return false; +++} +++ +++static void free_opts(bam2fq_opts_t *opts) +++{ +++ free(opts->barcode_tag); +++ free(opts->quality_tag); +++ free(opts->index_format); +++ free(opts->extra_tags); +++ free(opts); +++} +++ +++// return true if valid +++static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) +++{ +++ // Parse args +++ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); +++ opts->has12 = true; +++ opts->has12always = false; +++ opts->filetype = FASTQ; +++ opts->def_qual = 1; +++ opts->barcode_tag = 
NULL; +++ opts->quality_tag = NULL; +++ opts->index_format = NULL; +++ opts->index_file[0] = NULL; +++ opts->index_file[1] = NULL; +++ opts->extra_tags = NULL; +++ opts->compression_level = 1; +++ opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; +++ int flag_off_set = 0; +++ +++ int c; +++ sam_global_args_init(&opts->ga); +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), +++ {"i1", required_argument, NULL, 1}, +++ {"I1", required_argument, NULL, 1}, +++ {"i2", required_argument, NULL, 2}, +++ {"I2", required_argument, NULL, 2}, +++ {"if", required_argument, NULL, 3}, +++ {"IF", required_argument, NULL, 3}, +++ {"index-format", required_argument, NULL, 3}, +++ {"barcode-tag", required_argument, NULL, 'b'}, +++ {"quality-tag", required_argument, NULL, 'q'}, +++ { NULL, 0, NULL, 0 } +++ }; +++ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { +++ switch (c) { +++ case 'b': opts->barcode_tag = strdup(optarg); break; +++ case 'q': opts->quality_tag = strdup(optarg); break; +++ case 1 : opts->index_file[0] = optarg; break; +++ case 2 : opts->index_file[1] = optarg; break; +++ case 3 : opts->index_format = strdup(optarg); break; +++ case '0': opts->fnr[0] = optarg; break; +++ case '1': opts->fnr[1] = optarg; break; +++ case '2': opts->fnr[2] = optarg; break; +++ case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; +++ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; +++ case 'F': +++ if (!flag_off_set) { +++ flag_off_set = 1; +++ opts->flag_off = 0; +++ } +++ opts->flag_off |= strtol(optarg, 0, 0); break; +++ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; +++ case 'n': opts->has12 = false; break; +++ case 'N': opts->has12always = true; break; +++ case 'O': opts->use_oq = true; break; +++ case 's': opts->fnse = optarg; break; +++ case 't': opts->copy_tags = true; break; +++ case 'i': opts->illumina_tag = true; break; +++ case 'c': opts->compression_level 
= atoi(optarg); break; +++ case 'T': opts->extra_tags = strdup(optarg); break; +++ case 'v': opts->def_qual = atoi(optarg); break; +++ case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; +++ default: +++ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { +++ bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; +++ } +++ break; +++ } +++ } +++ +++ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; +++ if (opts->has12always) opts->has12 = true; +++ +++ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); +++ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); +++ +++ int nIndex = 0; +++ if (opts->index_format) { +++ char *s; +++ for (s = opts->index_format; *s; s++) { +++ if (*s == 'i') nIndex++; +++ } +++ } +++ if (nIndex>2) { +++ fprintf(stderr,"Invalid index format: more than 2 indexes\n"); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->index_file[1] && !opts->index_file[0]) { +++ fprintf(stderr, "Index one specified, but index two not given\n"); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->illumina_tag && !nIndex) { +++ fprintf(stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n"); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (nIndex==0 && opts->index_file[0]) { +++ fprintf(stderr, "index_format not specified, but index file given\n"); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->def_qual < 0 || 93 < opts->def_qual) { +++ fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ const char* type_str = argv[0]; +++ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { +++ opts->filetype = 
FASTQ; +++ } else if (strcasecmp("fasta", type_str) == 0) { +++ opts->filetype = FASTA; +++ } else { +++ print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (argc == optind && isatty(STDIN_FILENO)) { +++ bam2fq_usage(stdout, argv[0]); +++ free_opts(opts); +++ return true; +++ } +++ +++ if (argc - optind > 1) { +++ fprintf(stderr, "Too many arguments.\n"); +++ bam2fq_usage(stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ opts->fn_input = argc > optind ? argv[optind] : "-"; +++ *opts_out = opts; +++ return true; +++} +++ +++static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) +++{ +++ char mode[4] = "w"; +++ size_t len = strlen(filename); +++ +++ mode[2] = 0; mode[3] = 0; +++ if (len > 3 && strstr(filename + (len - 3),".gz")) { +++ mode[1] = 'g'; mode[2] = c+'0'; +++ } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) +++ || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { +++ mode[1] = c+'0'; +++ } else { +++ mode[1] = 'u'; +++ } +++ +++ BGZF *fp = bgzf_open(filename,mode); +++ if (!fp) +++ return fp; +++ if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { +++ bgzf_close(fp); +++ return NULL; +++ } +++ return fp; +++} +++ +++static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) +++{ +++ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); +++ state->flag_on = opts->flag_on; +++ state->flag_off = opts->flag_off; +++ state->flag_alloff = opts->flag_alloff; +++ state->has12 = opts->has12; +++ state->use_oq = opts->use_oq; +++ state->illumina_tag = opts->illumina_tag; +++ state->copy_tags = opts->copy_tags; +++ state->filetype = opts->filetype; +++ state->def_qual = opts->def_qual; +++ state->index_sequence = NULL; +++ state->hstdout = NULL; +++ state->compression_level = opts->compression_level; +++ +++ state->taglist = 
kl_init(ktaglist); +++ if (opts->extra_tags) { +++ char *save_p; +++ char *s = strtok_r(opts->extra_tags, ",", &save_p); +++ while (s) { +++ if (strlen(s) != 2) { +++ fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s); +++ free(state); +++ return false; +++ } +++ char **et = kl_pushp(ktaglist, state->taglist); +++ *et = s; +++ s = strtok_r(NULL, ",", &save_p); +++ } +++ } +++ +++ state->fp = sam_open(opts->fn_input, "r"); +++ if (state->fp == NULL) { +++ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); +++ free(state); +++ return false; +++ } +++ +++ state->p.pool = NULL; +++ if (opts->ga.nthreads > 0) { +++ if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) { +++ fprintf(stderr, "Failed to create thread pool\n"); +++ free(state); +++ return false; +++ } +++ state->p.qsize = opts->ga.nthreads*2; +++ hts_set_thread_pool(state->fp, &state->p); +++ } +++ +++ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; +++ if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; +++ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { +++ fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); +++ free(state); +++ return false; +++ } +++ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { +++ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); +++ free(state); +++ return false; +++ } +++ if (opts->fnse) { +++ state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); +++ if (state->fpse == NULL) { +++ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); +++ free(state); +++ return false; +++ } +++ } +++ +++ if (opts->ga.reference) { +++ if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { +++ print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); +++ free(state); +++ return false; +++ } +++ } +++ +++ int i, j; +++ for (i = 0; i < 3; ++i) { +++ if (opts->fnr[i]) { +++ for (j = 0; j < i; j++) +++ 
if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) +++ break; +++ if (j == i) { +++ state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); +++ if (state->fpr[i] == NULL) { +++ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", +++ i, opts->fnr[i]); +++ free(state); +++ return false; +++ } +++ } else { +++ state->fpr[i] = state->fpr[j]; +++ } +++ } else { +++ if (!state->hstdout) { +++ state->hstdout = bgzf_dopen(fileno(stdout), "wu"); +++ if (!state->hstdout) { +++ print_error_errno("bam2fq", "Cannot open STDOUT"); +++ free(state); +++ return false; +++ } +++ } +++ state->fpr[i] = state->hstdout; +++ } +++ } +++ for (i = 0; i < 2; i++) { +++ state->fpi[i] = NULL; +++ if (opts->index_file[i]) { +++ for (j = 0; j < 3; j++) +++ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0) +++ break; +++ for (j -= 3; j >= 0 && j < i; j++) +++ if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) +++ break; +++ if (i == j) { +++ state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); +++ if (state->fpi[i] == NULL) { +++ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", +++ i+1, opts->index_file[i]); +++ free(state); +++ return false; +++ } +++ } else if (j < 0) { +++ state->fpi[i] = state->fpr[j+3]; +++ } else { +++ state->fpi[i] = state->fpi[j]; +++ } +++ } +++ } +++ +++ state->h = sam_hdr_read(state->fp); +++ if (state->h == NULL) { +++ fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input); +++ free(state); +++ return false; +++ } +++ +++ *state_out = state; +++ return true; +++} +++ +++static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) +++{ +++ bool valid = true; +++ sam_hdr_destroy(state->h); +++ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); +++ if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", 
opts->fnse); valid = false; } +++ int i, j; +++ for (i = 0; i < 3; ++i) { +++ if (state->fpr[i] != state->hstdout) { +++ for (j = 0; j < i; j++) +++ if (state->fpr[i] == state->fpr[j]) +++ break; +++ if (j == i && bgzf_close(state->fpr[i])) { +++ print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); +++ valid = false; +++ } +++ } +++ } +++ if (state->hstdout) { +++ if (bgzf_close(state->hstdout)) { +++ print_error_errno("bam2fq", "Error closing STDOUT"); +++ valid = false; +++ } +++ } +++ for (i = 0; i < 2; i++) { +++ for (j = 0; j < 3; j++) +++ if (state->fpi[i] == state->fpr[j]) +++ break; +++ for (j -= 3; j >= 0 && j < i; j++) +++ if (state->fpi[i] == state->fpi[j]) +++ break; +++ if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { +++ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); +++ valid = false; +++ } +++ } +++ kl_destroy(ktaglist,state->taglist); +++ free(state->index_sequence); +++ if (state->p.pool) +++ hts_tpool_destroy(state->p.pool); +++ free(state); +++ return valid; +++} +++ +++static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) +++{ +++ return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags +++ || (b->core.flag&(state->flag_off)) != 0 +++ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); +++ +++} +++ +++static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) +++{ +++ int n; +++ bam1_t *records[3] = {NULL, NULL, NULL}; +++ char *current_qname = NULL; +++ int64_t n_reads = 0, n_singletons = 0; // Statistics +++ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; +++ int score[3]; +++ int at_eof; +++ bool valid = true; +++ bam1_t* b = NULL; +++ +++ while (true) { +++ if (!b) +++ b = bam_init1(); +++ if (b == NULL) { +++ perror("[bam2fq_mainloop] Malloc error for bam record buffer."); +++ valid = false; +++ break; +++ } +++ int res = 
sam_read1(state->fp, state->h, b); +++ if (res < -1) { +++ fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); +++ valid = false; +++ break; +++ } +++ at_eof = res < 0; +++ +++ if (!at_eof && filter_it_out(b, state)) +++ continue; +++ if (!at_eof) ++n_reads; +++ +++ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { +++ if (current_qname) { +++ if (state->illumina_tag) { +++ for (n=0; valid && n<3; n++) { +++ if (!records[n]) continue; +++ if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; +++ } +++ if (!valid) break; +++ } +++ free(state->index_sequence); state->index_sequence = NULL; +++ if (score[1] > 0 && score[2] > 0) { +++ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] +++ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } else if (score[1] > 0 || score[2] > 0) { +++ if (state->fpse) { +++ // print whichever one exists to fpse +++ if (score[1] > 0) { +++ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ } else { +++ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } +++ ++n_singletons; +++ } else { +++ if (score[1] > 0) { +++ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ } else { +++ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } +++ } +++ } +++ if (score[0]) { // TODO: check this +++ // print linebuf[0] to fpr[0] +++ if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } +++ } +++ } +++ +++ +++ free(current_qname); current_qname = NULL; +++ score[0] = score[1] = score[2] = 0; +++ for (n=0; n < 3; n++) { +++ bam_destroy1(records[n]); records[n]=NULL; +++ } +++ +++ if (at_eof) { break; } +++ +++ current_qname = 
strdup(bam_get_qname(b)); +++ if (!current_qname) { valid = false; break; } +++ } +++ +++ // Prefer a copy of the read that has base qualities +++ int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; +++ readpart rp = which_readpart(b); +++ if (b_score > score[rp]) { +++ if (!tags2fq(b, state, opts)) { valid = false; break; } +++ if (records[rp]) bam_destroy1(records[rp]); +++ records[rp] = b; +++ score[rp] = b_score; +++ b = NULL; +++ if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { +++ fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); +++ valid = false; break; +++ } +++ } +++ } +++ if (!valid) +++ { +++ perror("[bam2fq_mainloop] Error writing to FASTx files."); +++ } +++ bam_destroy1(b); +++ for (n=0; n < 3; n++) { +++ bam_destroy1(records[n]); +++ } +++ free(current_qname); +++ free(linebuf[0].s); +++ free(linebuf[1].s); +++ free(linebuf[2].s); +++ fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); +++ fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); +++ +++ return valid; +++} +++ +++int main_bam2fq(int argc, char *argv[]) +++{ +++ int status = EXIT_SUCCESS; +++ bam2fq_opts_t* opts = NULL; +++ bam2fq_state_t* state = NULL; +++ +++ bool valid = parse_opts(argc, argv, &opts); +++ if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE; +++ +++ if (!init_state(opts, &state)) return EXIT_FAILURE; +++ +++ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; +++ +++ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; +++ sam_global_args_free(&opts->ga); +++ free_opts(opts); +++ +++ return status; +++} ++--- /dev/null +++++ python-pysam/samtools/bam_fastq.c.pysam.c ++@@ -0,0 +1,1039 @@ +++#include "samtools.pysam.h" +++ +++/* bam_fastq.c -- FASTA and FASTQ file generation +++ +++ Copyright (C) 2009-2017, 2019 Genome Research Ltd. +++ Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
+++ +++ Author: Heng Li +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notices and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++#include +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++#include "htslib/sam.h" +++#include "htslib/klist.h" +++#include "htslib/kstring.h" +++#include "htslib/bgzf.h" +++#include "htslib/thread_pool.h" +++#include "samtools.h" +++#include "sam_opts.h" +++ +++#define taglist_free(p) +++KLIST_INIT(ktaglist, char*, taglist_free) +++ +++#define DEFAULT_BARCODE_TAG "BC" +++#define DEFAULT_QUALITY_TAG "QT" +++#define INDEX_SEPARATOR "+" +++ +++int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; +++static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; +++ +++static void bam2fq_usage(FILE *to, const char *command) +++{ +++ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; +++ fprintf(to, +++"Usage: samtools %s [options...] 
\n", command); +++ fprintf(to, +++"\n" +++"Description:\n" +++"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" +++"\n" +++"Options:\n" +++" -0 FILE write reads designated READ_OTHER to FILE\n" +++" -1 FILE write reads designated READ1 to FILE\n" +++" -2 FILE write reads designated READ2 to FILE\n" +++" -o FILE write reads designated READ1 or READ2 to FILE\n" +++" note: if a singleton file is specified with -s, only\n" +++" paired reads will be written to the -1 and -2 files.\n" +++" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +++" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +++" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +++" -n don't append /1 and /2 to the read name\n" +++" -N always append /1 and /2 to the read name\n"); +++ if (fq) fprintf(to, +++" -O output quality in the OQ tag if present\n"); +++ fprintf(to, +++" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" +++" -t copy RG, BC and QT tags to the %s header line\n", +++ fq ? "FASTQ" : "FASTA"); +++ fprintf(to, +++" -T TAGLIST copy arbitrary tags to the %s header line\n", +++ fq ? 
"FASTQ" : "FASTA"); +++ if (fq) fprintf(to, +++" -v INT default quality score if not given in file [1]\n" +++" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" +++" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" +++" --i1 FILE write first index reads to FILE\n" +++" --i2 FILE write second index reads to FILE\n" +++" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" +++" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" +++" --index-format STR How to parse barcode and quality tags\n\n"); +++ sam_global_opt_help(to, "-.--.@-."); +++ fprintf(to, +++"\n" +++"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" +++"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" +++"\n" +++"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" +++"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" +++"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" +++"or both unset.\n" +++"Run 'samtools flags' for more information on flag codes and meanings.\n"); +++ fprintf(to, +++"\n" +++"The index-format string describes how to parse the barcode and quality tags, for example:\n" +++" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" +++" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" +++"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" +++"'read until the separator or end of tag', for example:\n" +++" n*i* ignore the left part of the tag until the separator, then use the second part\n" +++" of the tag as index 1\n"); +++ fprintf(to, +++"\n" +++"Examples:\n" +++" To get just the paired reads in separate files, use:\n" +++" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" +++"\n To get all 
non-supplementary/secondary reads in a single file, redirect the output:\n" +++" samtools %s in.bam > all_reads.%s\n", +++ command, fq ? "fq" : "fa", fq ? "fq" : "fa", +++ command, fq ? "fq" : "fa"); +++} +++ +++typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; +++typedef enum { FASTA, FASTQ } fastfile; +++typedef struct bam2fq_opts { +++ char *fnse; +++ char *fnr[3]; +++ char *fn_input; // pointer to input filename in argv do not free +++ bool has12, has12always, use_oq, copy_tags, illumina_tag; +++ int flag_on, flag_off, flag_alloff; +++ sam_global_args ga; +++ fastfile filetype; +++ int def_qual; +++ char *barcode_tag; +++ char *quality_tag; +++ char *index_file[2]; +++ char *index_format; +++ char *extra_tags; +++ char compression_level; +++} bam2fq_opts_t; +++ +++typedef struct bam2fq_state { +++ samFile *fp; +++ BGZF *fpse; +++ BGZF *fpr[3]; +++ BGZF *fpi[2]; +++ BGZF *hsamtools_stdout; +++ sam_hdr_t *h; +++ bool has12, use_oq, copy_tags, illumina_tag; +++ int flag_on, flag_off, flag_alloff; +++ fastfile filetype; +++ int def_qual; +++ klist_t(ktaglist) *taglist; +++ char *index_sequence; +++ char compression_level; +++ htsThreadPool p; +++} bam2fq_state_t; +++ +++/* +++ * Get and decode the read from a BAM record. +++ * +++ * TODO: htslib really needs an interface for this. Consider this or perhaps +++ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str +++ * functions as string formatted equivalents to bam_get_{seq,qual}? +++ */ +++ +++/* +++ * Reverse a string in place. +++ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
+++ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik +++ */ +++static char *reverse(char *str) +++{ +++ int i = strlen(str)-1,j=0; +++ char ch; +++ while (i>j) { +++ ch = str[i]; +++ str[i]= str[j]; +++ str[j] = ch; +++ i--; +++ j++; +++ } +++ return str; +++} +++ +++/* return the read, reverse complemented if necessary */ +++static char *get_read(const bam1_t *rec) +++{ +++ int len = rec->core.l_qseq + 1; +++ char *read = calloc(1, len); +++ char *seq = (char *)bam_get_seq(rec); +++ int n; +++ +++ if (!read) return NULL; +++ +++ for (n=0; n < rec->core.l_qseq; n++) { +++ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; +++ else read[n] = seq_nt16_str[bam_seqi(seq,n)]; +++ } +++ if (rec->core.flag & BAM_FREVERSE) reverse(read); +++ return read; +++} +++ +++/* +++ * get and decode the quality from a BAM record +++ */ +++static int get_quality(const bam1_t *rec, char **qual_out) +++{ +++ char *quality = calloc(1, rec->core.l_qseq + 1); +++ char *q = (char *)bam_get_qual(rec); +++ int n; +++ +++ if (!quality) return -1; +++ +++ if (*q == '\xff') { +++ free(quality); +++ *qual_out = NULL; +++ return 0; +++ } +++ +++ for (n=0; n < rec->core.l_qseq; n++) { +++ quality[n] = q[n]+33; +++ } +++ if (rec->core.flag & BAM_FREVERSE) reverse(quality); +++ *qual_out = quality; +++ return 0; +++} +++ +++// +++// End of htslib complaints +++// +++ +++ +++static readpart which_readpart(const bam1_t *b) +++{ +++ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { +++ return READ_1; +++ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { +++ return READ_2; +++ } else { +++ return READ_UNKNOWN; +++ } +++} +++ +++/* +++ * parse the length part from the index-format string +++ */ +++static int getLength(char **s) +++{ +++ int n = 0; +++ while (**s) { +++ if (**s == '*') { n=-1; (*s)++; break; } +++ if ( !isdigit(**s)) break; +++ n = n*10 + ((**s)-'0'); +++ (*s)++; +++ } +++ return n; 
+++} +++ +++static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) +++{ +++ uint8_t *s = bam_aux_get(rec, tag); +++ if (s) { +++ char aux_type = *s; +++ switch (aux_type) { +++ case 'C': +++ case 'S': aux_type = 'I'; break; +++ case 'c': +++ case 's': aux_type = 'i'; break; +++ case 'd': aux_type = 'f'; break; +++ } +++ +++ // Ensure space. Need 6 chars + length of tag. Max length of +++ // i is 16, A is 21, B currently 26, Z is unknown, so +++ // have to check that one later. +++ if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; +++ +++ kputc('\t', linebuf); +++ kputsn(tag, 2, linebuf); +++ kputc(':', linebuf); +++ kputc(aux_type=='I'? 'i': aux_type, linebuf); +++ kputc(':', linebuf); +++ switch (aux_type) { +++ case 'H': +++ case 'Z': +++ if (kputs(bam_aux2Z(s), linebuf) < 0) return false; +++ break; +++ case 'i': kputw(bam_aux2i(s), linebuf); break; +++ case 'I': kputuw(bam_aux2i(s), linebuf); break; +++ case 'A': kputc(bam_aux2A(s), linebuf); break; +++ case 'f': kputd(bam_aux2f(s), linebuf); break; +++ case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; +++ default: kputs("*** Unknown aux type ***", linebuf); return false; +++ } +++ } +++ return true; +++} +++ +++static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) +++{ +++ if (!index_sequence) return 0; +++ +++ kstring_t new = {0,0,NULL}; +++ if (linebuf->s) { +++ char *s = strchr(linebuf->s, '\n'); +++ if (s) { +++ if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) +++ return -1; +++ *s = 0; +++ kputs(linebuf->s, &new); +++ kputc(' ', &new); +++ readpart readpart = which_readpart(rec); +++ if (readpart == READ_1) kputc('1', &new); +++ else if (readpart == READ_2) kputc('2', &new); +++ else kputc('0', &new); +++ +++ kputc(':', &new); +++ if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); +++ else kputc('N', &new); +++ +++ kputs(":0:", &new); +++ kputs(index_sequence, &new); +++ kputc('\n', 
&new); +++ kputs(s+1, &new); +++ free(ks_release(linebuf)); +++ linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; +++ } +++ } +++ return 0; +++} +++ +++static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) +++{ +++ int i; +++ +++ linebuf->l = 0; +++ // Write read name +++ if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; +++ if (kputs(bam_get_qname(rec), linebuf) < 0) return false; +++ // Add the /1 /2 if requested +++ if (state->has12) { +++ readpart readpart = which_readpart(rec); +++ if (readpart == READ_1) { +++ if (kputs("/1", linebuf) < 0) return false; +++ } else if (readpart == READ_2) { +++ if (kputs("/2", linebuf) < 0) return false; +++ } +++ } +++ if (state->copy_tags) { +++ for (i = 0; copied_tags[i]; ++i) { +++ if (!copy_tag(copied_tags[i], rec, linebuf)) { +++ fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); +++ return false; +++ } +++ } +++ } +++ +++ if (state->taglist->size) { +++ kliter_t(ktaglist) *p; +++ for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { +++ if (!copy_tag(kl_val(p), rec, linebuf)) { +++ fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); +++ return false; +++ } +++ } +++ } +++ +++ if (kputc('\n', linebuf) < 0) return false; +++ if (kputs(seq, linebuf) < 0) return false; +++ if (kputc('\n', linebuf) < 0) return false; +++ +++ if (state->filetype == FASTQ) { +++ // Write quality +++ if (kputs("+\n", linebuf) < 0) return false; +++ if (qual && *qual) { +++ if (kputs(qual, linebuf) < 0) return false; +++ } else { +++ int len = strlen(seq); +++ if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; +++ for (i = 0; i < len; ++i) { +++ kputc(33 + state->def_qual, linebuf); +++ } +++ } +++ if (kputc('\n', linebuf) < 0) return false; +++ } +++ return true; +++} +++ +++/* +++ * Create FASTQ lines from the barcode tag using the index-format +++ */ 
+++static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) +++{ +++ uint8_t *p; +++ char *ifmt = opts->index_format; +++ char *tag = NULL; +++ char *qual = NULL; +++ char *sub_tag = NULL; +++ char *sub_qual = NULL; +++ size_t tag_len; +++ int file_number = 0; +++ kstring_t linebuf = { 0, 0, NULL }; // Buffer +++ +++ if (!ifmt) return true; +++ +++ // read barcode tag +++ p = bam_aux_get(rec,opts->barcode_tag); +++ if (p) tag = bam_aux2Z(p); +++ +++ if (!tag) return true; // there is no tag +++ +++ tag_len = strlen(tag); +++ sub_tag = calloc(1, tag_len + 1); +++ if (!sub_tag) goto fail; +++ sub_qual = calloc(1, tag_len + 1); +++ if (!sub_qual) goto fail; +++ +++ // read quality tag +++ p = bam_aux_get(rec, opts->quality_tag); +++ if (p) qual = bam_aux2Z(p); +++ +++ // Parse the index-format string +++ while (*ifmt) { +++ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly +++ char action = *ifmt; // should be 'i' or 'n' +++ ifmt++; // skip over action +++ int index_len = getLength(&ifmt); +++ int n = 0; +++ +++ if (index_len < 0) { +++ // read until separator +++ while (isalpha(*tag)) { +++ sub_tag[n] = *tag++; +++ if (qual) sub_qual[n] = *qual++; +++ n++; +++ } +++ if (*tag) { // skip separator +++ tag++; +++ if (qual) qual++; +++ } +++ } else { +++ // read index_len characters +++ while (index_len-- && *tag) { +++ sub_tag[n] = *tag++; +++ if (qual) sub_qual[n] = *qual++; +++ n++; +++ } +++ } +++ sub_tag[n] = '\0'; +++ sub_qual[n] = '\0'; +++ +++ if (action=='i' && *sub_tag) { +++ if (state->index_sequence) { +++ char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); +++ if (!new_index_sequence) goto fail; +++ state->index_sequence = new_index_sequence; +++ strcat(state->index_sequence, INDEX_SEPARATOR); +++ strcat(state->index_sequence, sub_tag); +++ } else { +++ state->index_sequence = strdup(sub_tag); // we're going to need this later... 
+++ } +++ if (!state->index_sequence) goto fail; +++ if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; +++ if (state->illumina_tag) { +++ if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { +++ goto fail; +++ } +++ } +++ if (state->fpi[file_number]) { +++ if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) +++ goto fail; +++ } +++ } +++ +++ } +++ +++ free(sub_qual); free(sub_tag); +++ free(linebuf.s); +++ return true; +++ +++ fail: +++ perror(__func__); +++ free(sub_qual); free(sub_tag); +++ free(linebuf.s); +++ return false; +++} +++ +++// Transform a bam1_t record into a string with the FASTQ representation of it +++// @returns false for error, true for success +++static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) +++{ +++ int32_t qlen = b->core.l_qseq; +++ assert(qlen >= 0); +++ const uint8_t *oq = NULL; +++ char *qual = NULL; +++ +++ char *seq = get_read(b); +++ if (!seq) return false; +++ +++ if (state->use_oq) oq = bam_aux_get(b, "OQ"); +++ if (oq && *oq=='Z') { +++ qual = strdup(bam_aux2Z(oq)); +++ if (!qual) goto fail; +++ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented +++ reverse(qual); +++ } +++ } else { +++ if (get_quality(b, &qual) < 0) goto fail; +++ } +++ +++ if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; +++ +++ free(qual); +++ free(seq); +++ return true; +++ +++ fail: +++ free(seq); +++ free(qual); +++ return false; +++} +++ +++static void free_opts(bam2fq_opts_t *opts) +++{ +++ free(opts->barcode_tag); +++ free(opts->quality_tag); +++ free(opts->index_format); +++ free(opts->extra_tags); +++ free(opts); +++} +++ +++// return true if valid +++static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) +++{ +++ // Parse args +++ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); +++ opts->has12 = true; +++ opts->has12always = false; +++ opts->filetype = FASTQ; +++ opts->def_qual = 1; +++ opts->barcode_tag = 
NULL; +++ opts->quality_tag = NULL; +++ opts->index_format = NULL; +++ opts->index_file[0] = NULL; +++ opts->index_file[1] = NULL; +++ opts->extra_tags = NULL; +++ opts->compression_level = 1; +++ opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; +++ int flag_off_set = 0; +++ +++ int c; +++ sam_global_args_init(&opts->ga); +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), +++ {"i1", required_argument, NULL, 1}, +++ {"I1", required_argument, NULL, 1}, +++ {"i2", required_argument, NULL, 2}, +++ {"I2", required_argument, NULL, 2}, +++ {"if", required_argument, NULL, 3}, +++ {"IF", required_argument, NULL, 3}, +++ {"index-format", required_argument, NULL, 3}, +++ {"barcode-tag", required_argument, NULL, 'b'}, +++ {"quality-tag", required_argument, NULL, 'q'}, +++ { NULL, 0, NULL, 0 } +++ }; +++ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { +++ switch (c) { +++ case 'b': opts->barcode_tag = strdup(optarg); break; +++ case 'q': opts->quality_tag = strdup(optarg); break; +++ case 1 : opts->index_file[0] = optarg; break; +++ case 2 : opts->index_file[1] = optarg; break; +++ case 3 : opts->index_format = strdup(optarg); break; +++ case '0': opts->fnr[0] = optarg; break; +++ case '1': opts->fnr[1] = optarg; break; +++ case '2': opts->fnr[2] = optarg; break; +++ case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; +++ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; +++ case 'F': +++ if (!flag_off_set) { +++ flag_off_set = 1; +++ opts->flag_off = 0; +++ } +++ opts->flag_off |= strtol(optarg, 0, 0); break; +++ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; +++ case 'n': opts->has12 = false; break; +++ case 'N': opts->has12always = true; break; +++ case 'O': opts->use_oq = true; break; +++ case 's': opts->fnse = optarg; break; +++ case 't': opts->copy_tags = true; break; +++ case 'i': opts->illumina_tag = true; break; +++ case 'c': opts->compression_level 
= atoi(optarg); break; +++ case 'T': opts->extra_tags = strdup(optarg); break; +++ case 'v': opts->def_qual = atoi(optarg); break; +++ case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; +++ default: +++ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { +++ bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; +++ } +++ break; +++ } +++ } +++ +++ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; +++ if (opts->has12always) opts->has12 = true; +++ +++ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); +++ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); +++ +++ int nIndex = 0; +++ if (opts->index_format) { +++ char *s; +++ for (s = opts->index_format; *s; s++) { +++ if (*s == 'i') nIndex++; +++ } +++ } +++ if (nIndex>2) { +++ fprintf(samtools_stderr,"Invalid index format: more than 2 indexes\n"); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->index_file[1] && !opts->index_file[0]) { +++ fprintf(samtools_stderr, "Index one specified, but index two not given\n"); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->illumina_tag && !nIndex) { +++ fprintf(samtools_stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n"); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (nIndex==0 && opts->index_file[0]) { +++ fprintf(samtools_stderr, "index_format not specified, but index file given\n"); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (opts->def_qual < 0 || 93 < opts->def_qual) { +++ fprintf(samtools_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ const char* type_str = argv[0]; 
+++ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { +++ opts->filetype = FASTQ; +++ } else if (strcasecmp("fasta", type_str) == 0) { +++ opts->filetype = FASTA; +++ } else { +++ print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ +++ if (argc == optind && isatty(STDIN_FILENO)) { +++ bam2fq_usage(samtools_stdout, argv[0]); +++ free_opts(opts); +++ return true; +++ } +++ +++ if (argc - optind > 1) { +++ fprintf(samtools_stderr, "Too many arguments.\n"); +++ bam2fq_usage(samtools_stderr, argv[0]); +++ free_opts(opts); +++ return false; +++ } +++ opts->fn_input = argc > optind ? argv[optind] : "-"; +++ *opts_out = opts; +++ return true; +++} +++ +++static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) +++{ +++ char mode[4] = "w"; +++ size_t len = strlen(filename); +++ +++ mode[2] = 0; mode[3] = 0; +++ if (len > 3 && strstr(filename + (len - 3),".gz")) { +++ mode[1] = 'g'; mode[2] = c+'0'; +++ } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) +++ || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { +++ mode[1] = c+'0'; +++ } else { +++ mode[1] = 'u'; +++ } +++ +++ BGZF *fp = bgzf_open(filename,mode); +++ if (!fp) +++ return fp; +++ if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { +++ bgzf_close(fp); +++ return NULL; +++ } +++ return fp; +++} +++ +++static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) +++{ +++ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); +++ state->flag_on = opts->flag_on; +++ state->flag_off = opts->flag_off; +++ state->flag_alloff = opts->flag_alloff; +++ state->has12 = opts->has12; +++ state->use_oq = opts->use_oq; +++ state->illumina_tag = opts->illumina_tag; +++ state->copy_tags = opts->copy_tags; +++ state->filetype = opts->filetype; +++ state->def_qual = opts->def_qual; +++ 
state->index_sequence = NULL; +++ state->hsamtools_stdout = NULL; +++ state->compression_level = opts->compression_level; +++ +++ state->taglist = kl_init(ktaglist); +++ if (opts->extra_tags) { +++ char *save_p; +++ char *s = strtok_r(opts->extra_tags, ",", &save_p); +++ while (s) { +++ if (strlen(s) != 2) { +++ fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); +++ free(state); +++ return false; +++ } +++ char **et = kl_pushp(ktaglist, state->taglist); +++ *et = s; +++ s = strtok_r(NULL, ",", &save_p); +++ } +++ } +++ +++ state->fp = sam_open(opts->fn_input, "r"); +++ if (state->fp == NULL) { +++ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); +++ free(state); +++ return false; +++ } +++ +++ state->p.pool = NULL; +++ if (opts->ga.nthreads > 0) { +++ if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) { +++ fprintf(samtools_stderr, "Failed to create thread pool\n"); +++ free(state); +++ return false; +++ } +++ state->p.qsize = opts->ga.nthreads*2; +++ hts_set_thread_pool(state->fp, &state->p); +++ } +++ +++ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; +++ if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; +++ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { +++ fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); +++ free(state); +++ return false; +++ } +++ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { +++ fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); +++ free(state); +++ return false; +++ } +++ if (opts->fnse) { +++ state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); +++ if (state->fpse == NULL) { +++ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); +++ free(state); +++ return false; +++ } +++ } +++ +++ if (opts->ga.reference) { +++ if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { +++ print_error_errno("bam2fq", "cannot load reference 
\"%s\"", opts->ga.reference); +++ free(state); +++ return false; +++ } +++ } +++ +++ int i, j; +++ for (i = 0; i < 3; ++i) { +++ if (opts->fnr[i]) { +++ for (j = 0; j < i; j++) +++ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) +++ break; +++ if (j == i) { +++ state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); +++ if (state->fpr[i] == NULL) { +++ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", +++ i, opts->fnr[i]); +++ free(state); +++ return false; +++ } +++ } else { +++ state->fpr[i] = state->fpr[j]; +++ } +++ } else { +++ if (!state->hsamtools_stdout) { +++ state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu"); +++ if (!state->hsamtools_stdout) { +++ print_error_errno("bam2fq", "Cannot open STDOUT"); +++ free(state); +++ return false; +++ } +++ } +++ state->fpr[i] = state->hsamtools_stdout; +++ } +++ } +++ for (i = 0; i < 2; i++) { +++ state->fpi[i] = NULL; +++ if (opts->index_file[i]) { +++ for (j = 0; j < 3; j++) +++ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0) +++ break; +++ for (j -= 3; j >= 0 && j < i; j++) +++ if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) +++ break; +++ if (i == j) { +++ state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); +++ if (state->fpi[i] == NULL) { +++ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", +++ i+1, opts->index_file[i]); +++ free(state); +++ return false; +++ } +++ } else if (j < 0) { +++ state->fpi[i] = state->fpr[j+3]; +++ } else { +++ state->fpi[i] = state->fpi[j]; +++ } +++ } +++ } +++ +++ state->h = sam_hdr_read(state->fp); +++ if (state->h == NULL) { +++ fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); +++ free(state); +++ return false; +++ } +++ +++ *state_out = state; +++ return true; +++} +++ +++static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) +++{ +++ bool valid = 
true; +++ sam_hdr_destroy(state->h); +++ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); +++ if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } +++ int i, j; +++ for (i = 0; i < 3; ++i) { +++ if (state->fpr[i] != state->hsamtools_stdout) { +++ for (j = 0; j < i; j++) +++ if (state->fpr[i] == state->fpr[j]) +++ break; +++ if (j == i && bgzf_close(state->fpr[i])) { +++ print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); +++ valid = false; +++ } +++ } +++ } +++ if (state->hsamtools_stdout) { +++ if (bgzf_close(state->hsamtools_stdout)) { +++ print_error_errno("bam2fq", "Error closing STDOUT"); +++ valid = false; +++ } +++ } +++ for (i = 0; i < 2; i++) { +++ for (j = 0; j < 3; j++) +++ if (state->fpi[i] == state->fpr[j]) +++ break; +++ for (j -= 3; j >= 0 && j < i; j++) +++ if (state->fpi[i] == state->fpi[j]) +++ break; +++ if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { +++ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); +++ valid = false; +++ } +++ } +++ kl_destroy(ktaglist,state->taglist); +++ free(state->index_sequence); +++ if (state->p.pool) +++ hts_tpool_destroy(state->p.pool); +++ free(state); +++ return valid; +++} +++ +++static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) +++{ +++ return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags +++ || (b->core.flag&(state->flag_off)) != 0 +++ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); +++ +++} +++ +++static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) +++{ +++ int n; +++ bam1_t *records[3] = {NULL, NULL, NULL}; +++ char *current_qname = NULL; +++ int64_t n_reads = 0, n_singletons = 0; // Statistics +++ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; +++ int score[3]; +++ int 
at_eof; +++ bool valid = true; +++ bam1_t* b = NULL; +++ +++ while (true) { +++ if (!b) +++ b = bam_init1(); +++ if (b == NULL) { +++ perror("[bam2fq_mainloop] Malloc error for bam record buffer."); +++ valid = false; +++ break; +++ } +++ int res = sam_read1(state->fp, state->h, b); +++ if (res < -1) { +++ fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); +++ valid = false; +++ break; +++ } +++ at_eof = res < 0; +++ +++ if (!at_eof && filter_it_out(b, state)) +++ continue; +++ if (!at_eof) ++n_reads; +++ +++ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { +++ if (current_qname) { +++ if (state->illumina_tag) { +++ for (n=0; valid && n<3; n++) { +++ if (!records[n]) continue; +++ if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; +++ } +++ if (!valid) break; +++ } +++ free(state->index_sequence); state->index_sequence = NULL; +++ if (score[1] > 0 && score[2] > 0) { +++ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] +++ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } else if (score[1] > 0 || score[2] > 0) { +++ if (state->fpse) { +++ // print whichever one exists to fpse +++ if (score[1] > 0) { +++ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ } else { +++ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } +++ ++n_singletons; +++ } else { +++ if (score[1] > 0) { +++ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } +++ } else { +++ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } +++ } +++ } +++ } +++ if (score[0]) { // TODO: check this +++ // print linebuf[0] to fpr[0] +++ if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = 
false; break; } +++ } +++ } +++ +++ +++ free(current_qname); current_qname = NULL; +++ score[0] = score[1] = score[2] = 0; +++ for (n=0; n < 3; n++) { +++ bam_destroy1(records[n]); records[n]=NULL; +++ } +++ +++ if (at_eof) { break; } +++ +++ current_qname = strdup(bam_get_qname(b)); +++ if (!current_qname) { valid = false; break; } +++ } +++ +++ // Prefer a copy of the read that has base qualities +++ int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; +++ readpart rp = which_readpart(b); +++ if (b_score > score[rp]) { +++ if (!tags2fq(b, state, opts)) { valid = false; break; } +++ if (records[rp]) bam_destroy1(records[rp]); +++ records[rp] = b; +++ score[rp] = b_score; +++ b = NULL; +++ if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { +++ fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); +++ valid = false; break; +++ } +++ } +++ } +++ if (!valid) +++ { +++ perror("[bam2fq_mainloop] Error writing to FASTx files."); +++ } +++ bam_destroy1(b); +++ for (n=0; n < 3; n++) { +++ bam_destroy1(records[n]); +++ } +++ free(current_qname); +++ free(linebuf[0].s); +++ free(linebuf[1].s); +++ free(linebuf[2].s); +++ fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); +++ fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); +++ +++ return valid; +++} +++ +++int main_bam2fq(int argc, char *argv[]) +++{ +++ int status = EXIT_SUCCESS; +++ bam2fq_opts_t* opts = NULL; +++ bam2fq_state_t* state = NULL; +++ +++ bool valid = parse_opts(argc, argv, &opts); +++ if (!valid || opts == NULL) return valid ? 
EXIT_SUCCESS : EXIT_FAILURE; +++ +++ if (!init_state(opts, &state)) return EXIT_FAILURE; +++ +++ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; +++ +++ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; +++ sam_global_args_free(&opts->ga); +++ free_opts(opts); +++ +++ return status; +++} ++--- python-pysam.orig/samtools/bam_import.c +++++ /dev/null ++@@ -1,65 +0,0 @@ ++-/* bam_import.c -- SAM format parsing. ++- ++- Copyright (C) 2008-2013 Genome Research Ltd. ++- ++- Author: Heng Li ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. 
*/ ++- ++-#include ++- ++-#include ++-#include ++-#include ++-#include ++-#include "htslib/kstring.h" ++-#include "bam.h" ++-#include "htslib/kseq.h" ++- ++-KSTREAM_INIT(gzFile, gzread, 16384) ++- ++-bam_header_t *sam_header_read2(const char *fn) ++-{ ++- bam_header_t *header; ++- int c, dret, n_targets = 0; ++- gzFile fp; ++- kstream_t *ks; ++- kstring_t *str; ++- kstring_t samstr = { 0, 0, NULL }; ++- if (fn == 0) return 0; ++- fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); ++- if (fp == 0) return 0; ++- ks = ks_init(fp); ++- str = (kstring_t*)calloc(1, sizeof(kstring_t)); ++- while (ks_getuntil(ks, 0, str, &dret) > 0) { ++- ksprintf(&samstr, "@SQ\tSN:%s", str->s); ++- ks_getuntil(ks, 0, str, &dret); ++- ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); ++- n_targets++; ++- if (dret != '\n') ++- while ((c = ks_getc(ks)) != '\n' && c != -1); ++- } ++- ks_destroy(ks); ++- gzclose(fp); ++- free(str->s); free(str); ++- header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); ++- free(samstr.s); ++- fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); ++- return header; ++-} ++--- python-pysam.orig/samtools/bam_import.c.pysam.c +++++ /dev/null ++@@ -1,67 +0,0 @@ ++-#include "samtools.pysam.h" ++- ++-/* bam_import.c -- SAM format parsing. ++- ++- Copyright (C) 2008-2013 Genome Research Ltd. ++- ++- Author: Heng Li ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. 
++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. */ ++- ++-#include ++- ++-#include ++-#include ++-#include ++-#include ++-#include "htslib/kstring.h" ++-#include "bam.h" ++-#include "htslib/kseq.h" ++- ++-KSTREAM_INIT(gzFile, gzread, 16384) ++- ++-bam_header_t *sam_header_read2(const char *fn) ++-{ ++- bam_header_t *header; ++- int c, dret, n_targets = 0; ++- gzFile fp; ++- kstream_t *ks; ++- kstring_t *str; ++- kstring_t samstr = { 0, 0, NULL }; ++- if (fn == 0) return 0; ++- fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); ++- if (fp == 0) return 0; ++- ks = ks_init(fp); ++- str = (kstring_t*)calloc(1, sizeof(kstring_t)); ++- while (ks_getuntil(ks, 0, str, &dret) > 0) { ++- ksprintf(&samstr, "@SQ\tSN:%s", str->s); ++- ks_getuntil(ks, 0, str, &dret); ++- ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); ++- n_targets++; ++- if (dret != '\n') ++- while ((c = ks_getc(ks)) != '\n' && c != -1); ++- } ++- ks_destroy(ks); ++- gzclose(fp); ++- free(str->s); free(str); ++- header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); ++- free(samstr.s); ++- fprintf(samtools_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); ++- return header; ++-} ++--- python-pysam.orig/samtools/bam_index.c +++++ python-pysam/samtools/bam_index.c ++@@ -1,6 +1,6 @@ ++ /* bam_index.c -- index and idxstats subcommands. ++ ++- Copyright (C) 2008-2011, 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. ++ Portions copyright (C) 2010 Broad Institute. 
++ Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. ++ ++@@ -114,20 +114,20 @@ ++ * Returns 0 on success, ++ * -1 on failure. ++ */ ++-int slow_idxstats(samFile *fp, bam_hdr_t *header) { +++int slow_idxstats(samFile *fp, sam_hdr_t *header) { ++ int ret, last_tid = -2; ++ bam1_t *b = bam_init1(); ++ ++ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_FLAG)) ++ return -1; ++ ++- uint64_t (*count0)[2] = calloc(header->n_targets+1, sizeof(*count0)); +++ uint64_t (*count0)[2] = calloc(sam_hdr_nref(header)+1, sizeof(*count0)); ++ uint64_t (*counts)[2] = count0+1; ++ if (!count0) ++ return -1; ++ ++ while ((ret = sam_read1(fp, header, b)) >= 0) { ++- if (b->core.tid >= header->n_targets || b->core.tid < -1) { +++ if (b->core.tid >= sam_hdr_nref(header) || b->core.tid < -1) { ++ free(count0); ++ return -1; ++ } ++@@ -148,10 +148,10 @@ ++ ++ if (ret == -1) { ++ int i; ++- for (i = 0; i < header->n_targets; i++) { ++- printf("%s\t%d\t%"PRIu64"\t%"PRIu64"\n", ++- header->target_name[i], ++- header->target_len[i], +++ for (i = 0; i < sam_hdr_nref(header); i++) { +++ printf("%s\t%"PRId64"\t%"PRIu64"\t%"PRIu64"\n", +++ sam_hdr_tid2name(header, i), +++ (int64_t) sam_hdr_tid2len(header, i), ++ counts[i][0], counts[i][1]); ++ } ++ printf("*\t0\t%"PRIu64"\t%"PRIu64"\n", counts[-1][0], counts[-1][1]); ++@@ -167,14 +167,14 @@ ++ static void usage_exit(FILE *fp, int exit_status) ++ { ++ fprintf(fp, "Usage: samtools idxstats [options] \n"); ++- sam_global_opt_help(fp, "-.---@"); +++ sam_global_opt_help(fp, "-.---@-."); ++ exit(exit_status); ++ } ++ ++ int bam_idxstats(int argc, char *argv[]) ++ { ++ hts_idx_t* idx; ++- bam_hdr_t* header; +++ sam_hdr_t* header; ++ samFile* fp; ++ int c; ++ ++@@ -227,9 +227,9 @@ ++ } ++ ++ int i; ++- for (i = 0; i < header->n_targets; ++i) { +++ for (i = 0; i < sam_hdr_nref(header); ++i) { ++ // Print out contig name and length ++- printf("%s\t%d", header->target_name[i], header->target_len[i]); +++ printf("%s\t%"PRId64, 
sam_hdr_tid2name(header, i), (int64_t) sam_hdr_tid2len(header, i)); ++ // Now fetch info about it from the meta bin ++ uint64_t u, v; ++ hts_idx_get_stat(idx, i, &u, &v); ++@@ -240,7 +240,7 @@ ++ hts_idx_destroy(idx); ++ } ++ ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(fp); ++ return 0; ++ } ++--- python-pysam.orig/samtools/bam_index.c.pysam.c +++++ python-pysam/samtools/bam_index.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_index.c -- index and idxstats subcommands. ++ ++- Copyright (C) 2008-2011, 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. ++ Portions copyright (C) 2010 Broad Institute. ++ Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. ++ ++@@ -116,20 +116,20 @@ ++ * Returns 0 on success, ++ * -1 on failure. ++ */ ++-int slow_idxstats(samFile *fp, bam_hdr_t *header) { +++int slow_idxstats(samFile *fp, sam_hdr_t *header) { ++ int ret, last_tid = -2; ++ bam1_t *b = bam_init1(); ++ ++ if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_FLAG)) ++ return -1; ++ ++- uint64_t (*count0)[2] = calloc(header->n_targets+1, sizeof(*count0)); +++ uint64_t (*count0)[2] = calloc(sam_hdr_nref(header)+1, sizeof(*count0)); ++ uint64_t (*counts)[2] = count0+1; ++ if (!count0) ++ return -1; ++ ++ while ((ret = sam_read1(fp, header, b)) >= 0) { ++- if (b->core.tid >= header->n_targets || b->core.tid < -1) { +++ if (b->core.tid >= sam_hdr_nref(header) || b->core.tid < -1) { ++ free(count0); ++ return -1; ++ } ++@@ -150,10 +150,10 @@ ++ ++ if (ret == -1) { ++ int i; ++- for (i = 0; i < header->n_targets; i++) { ++- fprintf(samtools_stdout, "%s\t%d\t%"PRIu64"\t%"PRIu64"\n", ++- header->target_name[i], ++- header->target_len[i], +++ for (i = 0; i < sam_hdr_nref(header); i++) { +++ fprintf(samtools_stdout, "%s\t%"PRId64"\t%"PRIu64"\t%"PRIu64"\n", +++ sam_hdr_tid2name(header, i), +++ (int64_t) sam_hdr_tid2len(header, i), ++ counts[i][0], counts[i][1]); ++ } ++ 
fprintf(samtools_stdout, "*\t0\t%"PRIu64"\t%"PRIu64"\n", counts[-1][0], counts[-1][1]); ++@@ -169,14 +169,14 @@ ++ static void usage_exit(FILE *fp, int exit_status) ++ { ++ fprintf(fp, "Usage: samtools idxstats [options] \n"); ++- sam_global_opt_help(fp, "-.---@"); +++ sam_global_opt_help(fp, "-.---@-."); ++ exit(exit_status); ++ } ++ ++ int bam_idxstats(int argc, char *argv[]) ++ { ++ hts_idx_t* idx; ++- bam_hdr_t* header; +++ sam_hdr_t* header; ++ samFile* fp; ++ int c; ++ ++@@ -229,9 +229,9 @@ ++ } ++ ++ int i; ++- for (i = 0; i < header->n_targets; ++i) { +++ for (i = 0; i < sam_hdr_nref(header); ++i) { ++ // Print out contig name and length ++- fprintf(samtools_stdout, "%s\t%d", header->target_name[i], header->target_len[i]); +++ fprintf(samtools_stdout, "%s\t%"PRId64, sam_hdr_tid2name(header, i), (int64_t) sam_hdr_tid2len(header, i)); ++ // Now fetch info about it from the meta bin ++ uint64_t u, v; ++ hts_idx_get_stat(idx, i, &u, &v); ++@@ -242,7 +242,7 @@ ++ hts_idx_destroy(idx); ++ } ++ ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(fp); ++ return 0; ++ } ++--- python-pysam.orig/samtools/bam_lpileup.c +++++ python-pysam/samtools/bam_lpileup.c ++@@ -100,7 +100,7 @@ ++ buf->n_nodes = 0; ++ } ++ ++-static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +++static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data) ++ { ++ bam_lplbuf_t *tv = (bam_lplbuf_t*)data; ++ freenode_t *p; ++--- python-pysam.orig/samtools/bam_lpileup.c.pysam.c +++++ python-pysam/samtools/bam_lpileup.c.pysam.c ++@@ -102,7 +102,7 @@ ++ buf->n_nodes = 0; ++ } ++ ++-static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +++static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data) ++ { ++ bam_lplbuf_t *tv = (bam_lplbuf_t*)data; ++ freenode_t *p; ++--- python-pysam.orig/samtools/bam_lpileup.h +++++ 
python-pysam/samtools/bam_lpileup.h ++@@ -33,7 +33,7 @@ ++ ++ #ifndef BAM_PILEUP_F_DEFINED ++ #define BAM_PILEUP_F_DEFINED ++-typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); +++typedef int (*bam_pileup_f)(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data); ++ #endif //BAM_PILEUP_F_DEFINED ++ ++ ++--- python-pysam.orig/samtools/bam_markdup.c +++++ python-pysam/samtools/bam_markdup.c ++@@ -1,7 +1,7 @@ ++ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone ++ through fixmates with the mate scoring option on. ++ ++- Copyright (C) 2017-18 Genome Research Ltd. +++ Copyright (C) 2017-2019 Genome Research Ltd. ++ ++ Author: Andrew Whitwham ++ ++@@ -22,6 +22,9 @@ ++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ DEALINGS IN THE SOFTWARE +++ +++Estimate library size derived from Picard DuplicationMetrics.java +++Copyright (c) 2009,2018 The Broad Institute. MIT license. 
++ */ ++ ++ #include ++@@ -33,6 +36,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "htslib/thread_pool.h" ++ #include "htslib/sam.h" ++ #include "sam_opts.h" ++@@ -42,26 +46,53 @@ ++ #include "htslib/kstring.h" ++ #include "tmp_file.h" ++ +++ +++typedef struct { +++ samFile *in; +++ samFile *out; +++ char *prefix; +++ int remove_dups; +++ int32_t max_length; +++ int do_stats; +++ int supp; +++ int tag; +++ int opt_dist; +++ int no_pg; +++ int clear; +++ int mode; +++ int write_index; +++ int include_fails; +++ char *stats_file; +++ char *arg_list; +++ char *out_fn; +++} md_param_t; +++ ++ typedef struct { ++- int32_t single; +++ hts_pos_t this_coord; +++ hts_pos_t other_coord; ++ int32_t this_ref; ++- int32_t this_coord; ++ int32_t other_ref; ++- int32_t other_coord; ++- int32_t leftmost; ++- int32_t orientation; +++ int8_t single; +++ int8_t leftmost; +++ int8_t orientation; ++ } key_data_t; ++ +++typedef struct read_queue_s { +++ key_data_t pair_key; +++ key_data_t single_key; +++ bam1_t *b; +++ struct read_queue_s *duplicate; +++ hts_pos_t pos; +++} read_queue_t; +++ ++ typedef struct { ++- bam1_t *p; +++ read_queue_t *p; ++ } in_hash_t; ++ ++ typedef struct { ++- bam1_t *b; ++- int32_t pos; ++- key_data_t pair_key; ++- key_data_t single_key; ++-} read_queue_t; +++ char *name; +++ char type; +++} dup_map_t; ++ ++ ++ ++@@ -72,22 +103,22 @@ ++ khint_t hash; ++ ++ if (key.single) { ++- unsigned char sig[12]; +++ unsigned char sig[13]; ++ ++ memcpy(sig + i, &key.this_ref, 4); i += 4; ++- memcpy(sig + i, &key.this_coord, 4); i += 4; ++- memcpy(sig + i, &key.orientation, 4); i += 4; +++ memcpy(sig + i, &key.this_coord, 8); i += 8; +++ memcpy(sig + i, &key.orientation, 1); i += 1; ++ ++ hash = do_hash(sig, i); ++ } else { ++- unsigned char sig[24]; +++ unsigned char sig[26]; ++ ++ memcpy(sig + i, &key.this_ref, 4); i += 4; ++- memcpy(sig + i, &key.this_coord, 4); i += 4; +++ memcpy(sig + i, &key.this_coord, 8); i += 8; ++ memcpy(sig + i, 
&key.other_ref, 4); i += 4; ++- memcpy(sig + i, &key.other_coord, 4); i += 4; ++- memcpy(sig + i, &key.leftmost, 4); i += 4; ++- memcpy(sig + i, &key.orientation, 4); i += 4; +++ memcpy(sig + i, &key.other_coord, 8); i += 8; +++ memcpy(sig + i, &key.leftmost, 1); i += 1; +++ memcpy(sig + i, &key.orientation, 1); i += 1; ++ ++ hash = do_hash(sig, i); ++ } ++@@ -122,21 +153,35 @@ ++ ++ ++ #define __free_queue_element(p) +++ +++// Orientations (prime numbers to feed to hashing algorithm) ++ #define O_FF 2 ++ #define O_RR 3 ++ #define O_FR 5 ++ #define O_RF 7 ++ +++// Left or rightmost +++#define R_LE 11 +++#define R_RI 13 +++ +++#define BMD_WARNING_MAX 10 +++ +++#define MD_MIN_QUALITY 15 +++ +++// Duplicate finding mode +++#define MD_MODE_TEMPLATE 0 +++#define MD_MODE_SEQUENCE 1 +++ ++ KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash ++ KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer ++-KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id +++KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id ++ ++ ++ /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ ++ ++-static int32_t unclipped_other_start(int32_t op, char *cigar) { +++static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { ++ char *c = cigar; ++- int32_t clipped = 0; +++ int64_t clipped = 0; ++ ++ while (*c && *c != '*') { ++ long num = 0; ++@@ -162,9 +207,9 @@ ++ ++ /* Calculate the current read's start based on the stored cigar string. 
*/ ++ ++-static int32_t unclipped_start(bam1_t *b) { +++static hts_pos_t unclipped_start(bam1_t *b) { ++ uint32_t *cigar = bam_get_cigar(b); ++- int32_t clipped = 0; +++ int64_t clipped = 0; ++ uint32_t i; ++ ++ for (i = 0; i < b->core.n_cigar; i++) { ++@@ -183,9 +228,9 @@ ++ ++ /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ ++ ++-static int32_t unclipped_other_end(int32_t op, char *cigar) { +++static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { ++ char *c = cigar; ++- int32_t refpos = 0; +++ int64_t refpos = 0; ++ int skip = 1; ++ ++ while (*c && *c != '*') { ++@@ -224,9 +269,9 @@ ++ ++ /* Calculate the current read's end based on the stored cigar string. */ ++ ++-static int32_t unclipped_end(bam1_t *b) { +++static hts_pos_t unclipped_end(bam1_t *b) { ++ uint32_t *cigar = bam_get_cigar(b); ++- int32_t end_pos, clipped = 0; +++ hts_pos_t end_pos, clipped = 0; ++ int32_t i; ++ ++ end_pos = bam_endpos(b); ++@@ -293,7 +338,7 @@ ++ int i; ++ ++ for (i = 0; i < b->core.l_qseq; i++) { ++- if (qual[i] >= 15) score += qual[i]; +++ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; ++ } ++ ++ return score; ++@@ -305,10 +350,10 @@ ++ the reference id, orientation and whether the current ++ read is leftmost of the pair. */ ++ ++-static int make_pair_key(key_data_t *key, bam1_t *bam) { ++- int32_t this_ref, this_coord, this_end; ++- int32_t other_ref, other_coord, other_end; ++- int32_t orientation, leftmost; +++static int make_pair_key_template(key_data_t *key, bam1_t *bam) { +++ hts_pos_t this_coord, other_coord, this_end, other_end; +++ int32_t this_ref, other_ref; +++ int8_t orientation, leftmost; ++ uint8_t *data; ++ char *cig; ++ ++@@ -319,7 +364,11 @@ ++ this_end = unclipped_end(bam); ++ ++ if ((data = bam_aux_get(bam, "MC"))) { ++- cig = bam_aux2Z(data); +++ if (!(cig = bam_aux2Z(data))) { +++ fprintf(stderr, "[markdup] error: MC tag wrong type. 
Please use the MC tag provided by samtools fixmate.\n"); +++ return 1; +++ } +++ ++ other_end = unclipped_other_end(bam->core.mpos, cig); ++ other_coord = unclipped_other_start(bam->core.mpos, cig); ++ } else { ++@@ -402,9 +451,9 @@ ++ } ++ ++ if (!leftmost) ++- leftmost = 13; +++ leftmost = R_RI; ++ else ++- leftmost = 11; +++ leftmost = R_LE; ++ ++ key->single = 0; ++ key->this_ref = this_ref; ++@@ -418,13 +467,140 @@ ++ } ++ ++ +++static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { +++ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; +++ int32_t this_ref, other_ref; +++ int8_t orientation, left_read; +++ uint8_t *data; +++ char *cig; +++ +++ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash +++ other_ref = bam->core.mtid + 1; +++ +++ this_coord = unclipped_start(bam); +++ this_end = unclipped_end(bam); +++ +++ if ((data = bam_aux_get(bam, "MC"))) { +++ if (!(cig = bam_aux2Z(data))) { +++ fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); +++ return 1; +++ } +++ +++ other_end = unclipped_other_end(bam->core.mpos, cig); +++ other_coord = unclipped_other_start(bam->core.mpos, cig); +++ } else { +++ fprintf(stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); +++ return 1; +++ } +++ +++ // work out orientations +++ if (this_ref != other_ref) { +++ leftmost = this_ref - other_ref; +++ } else { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ if (!bam_is_rev(bam)) { +++ leftmost = this_coord - other_coord; +++ } else { +++ leftmost = this_end - other_end; +++ } +++ } else { +++ if (bam_is_rev(bam)) { +++ leftmost = this_end - other_coord; +++ } else { +++ leftmost = this_coord - other_end; +++ } +++ } +++ } +++ +++ if (leftmost < 0) { +++ leftmost = 1; +++ } else if (leftmost > 0) { +++ leftmost = 0; +++ } else { +++ // tie breaks +++ +++ if (bam->core.pos == bam->core.mpos) { +++ if (bam->core.flag & BAM_FREAD1) { +++ leftmost = 1; +++ } else { +++ leftmost = 0; +++ } +++ } else if (bam->core.pos < bam->core.mpos) { +++ leftmost = 1; +++ } else { +++ leftmost = 0; +++ } +++ } +++ +++ // pair orientation +++ if (leftmost) { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ +++ if (!bam_is_rev(bam)) { +++ orientation = O_FF; +++ } else { +++ orientation = O_RR; +++ } +++ } else { +++ if (!bam_is_rev(bam)) { +++ orientation = O_FR; +++ } else { +++ orientation = O_RF; +++ } +++ } +++ } else { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ +++ if (!bam_is_rev(bam)) { +++ orientation = O_RR; +++ } else { +++ orientation = O_FF; +++ } +++ } else { +++ if (!bam_is_rev(bam)) { +++ orientation = O_RF; +++ } else { +++ orientation = O_FR; +++ } +++ } +++ } +++ +++ if (!leftmost) +++ left_read = R_RI; +++ else +++ left_read = R_LE; +++ +++ if (!bam_is_rev(bam)) { +++ this_coord = unclipped_start(bam); +++ } else { +++ this_coord = unclipped_end(bam); +++ } +++ +++ if (!bam_is_mrev(bam)) { +++ other_coord = unclipped_other_start(bam->core.mpos, cig); +++ } else { +++ other_coord = unclipped_other_end(bam->core.mpos, cig); +++ } +++ +++ key->single = 0; +++ key->this_ref = this_ref; +++ key->this_coord = this_coord; +++ key->other_ref = other_ref; +++ key->other_coord = 
other_coord; +++ key->leftmost = left_read; +++ key->orientation = orientation; +++ +++ return 0; +++} +++ ++ /* Create a signature hash of single read (or read with an unmatched pair). ++ Uses unclipped start (or end depending on orientation), reference id, ++ and orientation. */ ++ ++ static void make_single_key(key_data_t *key, bam1_t *bam) { ++- int32_t this_ref, this_coord; ++- int32_t orientation; +++ hts_pos_t this_coord; +++ int32_t this_ref; +++ int8_t orientation; ++ ++ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash ++ ++@@ -442,23 +618,45 @@ ++ key->orientation = orientation; ++ } ++ +++ ++ /* Add the duplicate name to a hash if it does not exist. */ ++ ++-static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { +++static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { ++ khiter_t d; ++ int ret; ++ ++ d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); ++ ++ if (d == kh_end(d_hash)) { ++- d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); +++ char *name = strdup(bam_get_qname(dupe)); +++ if (name) { +++ d = kh_put(duplicates, d_hash, name, &ret); +++ } else { +++ ret = -1; +++ } +++ +++ if (ret >= 0) { +++ if (orig_name) { +++ if (ret == 0) { +++ // replace old name +++ free(kh_value(d_hash, d).name); +++ free(name); +++ } ++ ++- if (ret > 0) { ++- kh_value(d_hash, d) = 1; ++- } else if (ret == 0) { ++- kh_value(d_hash, d)++; +++ kh_value(d_hash, d).name = strdup(orig_name); +++ +++ if (kh_value(d_hash, d).name == NULL) { +++ fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); +++ return 1; +++ } +++ } else { +++ kh_value(d_hash, d).name = NULL; +++ } +++ +++ kh_value(d_hash, d).type = type; ++ } else { ++ fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); +++ free(name); ++ return 1; ++ } ++ } ++@@ -467,6 +665,467 @@ ++ } ++ ++ +++static inline int get_coordinate_positions(const char *qname, 
int *xpos, int *ypos) { +++ int sep = 0; +++ int pos = 0; +++ +++ while (qname[pos]) { +++ if (qname[pos] == ':') { +++ sep++; +++ +++ if (sep == 2) { +++ *xpos = pos + 1; +++ } else if (sep == 3) { +++ *ypos = pos + 1; +++ } else if (sep == 4) { // HiSeq style names +++ *xpos = *ypos; +++ *ypos = pos + 1; +++ } else if (sep == 5) { // Newer Illumina format +++ *xpos = pos + 1; +++ } else if (sep == 6) { +++ *ypos = pos + 1; +++ } +++ } +++ +++ pos++; +++ } +++ +++ return sep; +++} +++ +++/* Using the coordinates from the Illumina read name, see whether the duplicated read is +++ close enough (set by max_dist) to the original to be counted as optical.*/ +++ +++static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { +++ int ret = 0, seps; +++ char *original, *duplicate; +++ int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; +++ +++ +++ original = bam_get_qname(ori); +++ duplicate = bam_get_qname(dup); +++ +++ seps = get_coordinate_positions(original, &oxpos, &oypos); +++ +++ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); +++ } +++ +++ return ret; +++ } +++ +++ seps = get_coordinate_positions(duplicate, &dxpos, &dypos); +++ +++ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { +++ +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (strncmp(original, duplicate, oxpos - 1) == 0) { +++ // the initial parts match, look at the numbers +++ long ox, oy, dx, dy, xdiff, ydiff; +++ char *end; +++ +++ ox = strtol(original + oxpos, &end, 10); +++ +++ if ((original + oxpos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: can 
not decipher X coordinate in %s .\n", original); +++ } +++ +++ return ret; +++ } +++ +++ dx = strtol(duplicate + dxpos, &end, 10); +++ +++ if ((duplicate + dxpos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (ox > dx) { +++ xdiff = ox - dx; +++ } else { +++ xdiff = dx - ox; +++ } +++ +++ if (xdiff <= max_dist) { +++ // still might be optical +++ +++ oy = strtol(original + oypos, &end, 10); +++ +++ if ((original + oypos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); +++ } +++ +++ return ret; +++ } +++ +++ dy = strtol(duplicate + dypos, &end, 10); +++ +++ if ((duplicate + dypos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (oy > dy) { +++ ydiff = oy - dy; +++ } else { +++ ydiff = dy - oy; +++ } +++ +++ if (ydiff <= max_dist) ret = 1; +++ } +++ } +++ +++ return ret; +++} +++ +++ +++static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, +++ long *optical, long *warn) { +++ char dup_type = 0; +++ long incoming_warnings = *warn; +++ +++ dup->core.flag |= BAM_FDUP; +++ +++ if (param->tag) { +++ if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { +++ fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); +++ return -1; +++ } +++ } +++ +++ if (param->opt_dist) { // mark optical duplicates +++ if (optical_duplicate(ori, dup, param->opt_dist, warn)) { +++ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); +++ dup_type = 'O'; +++ (*optical)++; +++ } else { +++ // not an optical duplicate +++ bam_aux_append(dup, "dt", 'Z', 3, 
(const uint8_t *)"LB"); +++ } +++ } +++ +++ if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { +++ fprintf(stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n", +++ *warn); +++ } +++ +++ if (param->supp) { +++ if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) { +++ char *original = NULL; +++ +++ if (param->tag) { +++ original = bam_get_qname(ori); +++ } +++ +++ if (add_duplicate(dup_hash, dup, original, dup_type)) +++ return -1; +++ } +++ } +++ +++ return 0; +++} +++ +++ +++static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { +++ int ret = 0; +++ uint8_t *data; +++ +++ // remove any existing dt tag +++ if ((data = bam_aux_get(b, "dt")) != NULL) { +++ bam_aux_del(b, data); +++ } +++ +++ if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { +++ fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n"); +++ ret = -1; +++ } +++ +++ if (paired) { +++ (*optical_pair)++; +++ } else { +++ (*optical_single)++; +++ } +++ +++ if (param->supp) { +++ // Change the duplicate type +++ +++ if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP) +++ || bam_aux_get(b, "XA")) { +++ khiter_t d; +++ +++ d = kh_get(duplicates, dup_hash, bam_get_qname(b)); +++ +++ if (d == kh_end(dup_hash)) { +++ // error, name should already be in dup hash +++ fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n", +++ bam_get_qname(b)); +++ ret = -1; +++ } else { +++ kh_value(dup_hash, d).type = 'O'; +++ } +++ } +++ } +++ +++ return ret; +++} +++ +++ +++ +++/* +++ Where there is more than one duplicate go down the list and check for optical duplicates and change +++ do tags (where used) to point to original (non-duplicate) read. 
+++*/ +++static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, +++ long *warn, long *optical_single, long *optical_pair) { +++ int ret = 0; +++ read_queue_t *current = ori->duplicate; +++ char *ori_name = bam_get_qname(ori->b); +++ int have_original = !(ori->b->core.flag & BAM_FDUP); +++ int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); +++ +++ while (current) { +++ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); +++ +++ if (param->tag && have_original) { +++ uint8_t *data; +++ +++ // at this stage all duplicates should have a do tag +++ if ((data = bam_aux_get(current->b, "do")) != NULL) { +++ // see if we need to change the tag +++ char *old_name = bam_aux2Z(data); +++ +++ if (old_name) { +++ if (strcmp(old_name, ori_name) != 0) { +++ bam_aux_del(current->b, data); +++ +++ if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { +++ fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); +++ ret = -1; +++ break; +++ } +++ } +++ } else { +++ fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); +++ ret = -1; +++ break; +++ } +++ } +++ } +++ +++ if (param->opt_dist) { +++ int is_cur_opt = 0, is_ori_opt = 0; +++ uint8_t *data; +++ char *dup_type; +++ +++ if ((data = bam_aux_get(ori->b, "dt"))) { +++ if ((dup_type = bam_aux2Z(data))) { +++ if (strcmp(dup_type, "SQ") == 0) { +++ is_ori_opt = 1; +++ } +++ } +++ } +++ +++ if ((data = bam_aux_get(current->b, "dt"))) { +++ if ((dup_type = bam_aux2Z(data))) { +++ if (strcmp(dup_type, "SQ") == 0) { +++ is_cur_opt = 1; +++ } +++ } +++ } +++ +++ if (!(is_ori_opt && is_cur_opt)) { +++ // if both are already optical duplicates there is no need to check again, otherwise... 
+++ +++ if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { +++ // find out which one is the duplicate +++ int is_cur_dup = 0; +++ +++ if (have_original) { +++ // compared against an original, this is a dup. +++ is_cur_dup = 1; +++ } else if (ori_paired != current_paired) { +++ if (!current_paired) { +++ // current is single vs pair, this is a dup. +++ is_cur_dup = 1; +++ } +++ } else { +++ // do it by scores +++ int64_t ori_score, curr_score; +++ +++ if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { +++ if (ori->b->core.flag & BAM_FQCFAIL) { +++ ori_score = 0; +++ curr_score = 1; +++ } else { +++ ori_score = 1; +++ curr_score = 0; +++ } +++ } else { +++ ori_score = calc_score(ori->b); +++ curr_score = calc_score(current->b); +++ +++ if (current_paired) { +++ // they are pairs so add mate scores. +++ int64_t mate_tmp; +++ +++ if ((mate_tmp = get_mate_score(ori->b)) == -1) { +++ fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); +++ ret = -1; +++ break; +++ } else { +++ ori_score += mate_tmp; +++ } +++ +++ if ((mate_tmp = get_mate_score(current->b)) == -1) { +++ fprintf(stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); +++ ret = -1; +++ break; +++ } else { +++ curr_score += mate_tmp; +++ } +++ } +++ } +++ +++ if (ori_score == curr_score) { +++ if (strcmp(bam_get_qname(current->b), ori_name) < 0) { +++ curr_score++; +++ } else { +++ curr_score--; +++ } +++ } +++ +++ if (ori_score > curr_score) { +++ is_cur_dup = 1; +++ } +++ } +++ +++ if (is_cur_dup) { +++ // the current is the optical duplicate +++ if (!is_cur_opt) { // only change if not already an optical duplicate +++ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { +++ ret = -1; +++ break; +++ } +++ } +++ } else { +++ if (!is_ori_opt) { +++ if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { +++ ret = -1; +++ break; +++ } +++ } +++ } +++ } +++ } +++ } +++ +++ current = current->duplicate; +++ } +++ +++ return ret; +++} +++ +++/* +++ Function to use when estimating library size. +++ +++ This is based on an approximate formula for the coverage of a set +++ obtained after sampling it a given number of times with replacement. +++ +++ x = number of items in the set (the number of unique fragments in the library) +++ +++ c = number of unique items (unique read pairs observed) +++ +++ n = number of items samples (total number of read pairs) +++ +++ c and n are known; x is unknown. +++ +++ As n -> infinity, the coverage (c/x) can be given as: +++ +++ c / x = 1 - exp(-n / x) (see https://math.stackexchange.com/questions/32800) +++ +++ This needs to be solved for x, so it is rearranged to put both terms on the +++ left side and estimate_library_size() finds a value of x which gives a +++ result of zero (or as close as it can get). 
+++ */ +++static inline double coverage_equation(double x, double c, double n) { +++ return c / x - 1 + exp(-n / x); +++} +++ +++ +++/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ +++static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { +++ unsigned long estimated_size = 0; +++ +++ read_pairs /= 2; +++ duplicate_pairs /= 2; +++ +++ if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { +++ unsigned long unique_pairs = read_pairs - duplicate_pairs; +++ double m = 1; +++ double M = 100; +++ int i; +++ +++ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { +++ fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n"); +++ return estimated_size; +++ } +++ +++ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { +++ M *= 10; +++ } +++ +++ for (i = 0; i < 40; i++) { +++ double r = (m + M) / 2; +++ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); +++ +++ if (u > 0) { +++ m = r; +++ } else if (u < 0) { +++ M = r; +++ } else { +++ break; +++ } +++ } +++ +++ estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); +++ } else { +++ fprintf(stderr, "[markdup] warning: unable to calculate estimated library size." +++ " Read pairs %ld should be greater than duplicate pairs %ld," +++ " which should both be non zero.\n", +++ read_pairs, duplicate_pairs); +++ } +++ +++ return estimated_size; +++} +++ +++ ++ /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. ++ Generally the highest quality scoring is chosen as the original and all others the duplicates. ++ The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). 
++@@ -476,44 +1135,59 @@ ++ Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write ++ step. This is because the duplicate can occur before the primary read.*/ ++ ++-static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { ++- bam_hdr_t *header; +++static int bam_mark_duplicates(md_param_t *param) { +++ bam_hdr_t *header = NULL; ++ khiter_t k; ++ khash_t(reads) *pair_hash = kh_init(reads); ++ khash_t(reads) *single_hash = kh_init(reads); ++ klist_t(read_queue) *read_buffer = kl_init(read_queue); ++ kliter_t(read_queue) *rq; ++ khash_t(duplicates) *dup_hash = kh_init(duplicates); ++- int32_t prev_tid, prev_coord; +++ int32_t prev_tid; +++ hts_pos_t prev_coord; ++ read_queue_t *in_read; ++ int ret; ++- int reading, writing, excluded, duplicate, single, pair, single_dup, examined; +++ long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; +++ long np_duplicate, np_opt_duplicate; +++ long opt_warnings = 0; ++ tmp_file_t temp; +++ char *idx_fn = NULL; +++ int exclude = 0; ++ ++- if ((header = sam_hdr_read(in)) == NULL) { +++ if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { +++ fprintf(stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } +++ +++ if ((header = sam_hdr_read(param->in)) == NULL) { ++ fprintf(stderr, "[markdup] error reading header\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. ++ // only really works on coordinate sorted files. ++- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { ++- char *p, *q; ++- ++- p = strstr(header->text, "\tSO:queryname"); ++- q = strchr(header->text, '\n'); ++- ++- // looking for SO:queryname within @HD only ++- // (e.g. 
must ignore in a @CO comment line later in header) ++- if ((p != 0) && (p < q)) { ++- fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); ++- return 1; ++- } +++ kstring_t str = KS_INITIALIZE; +++ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { +++ fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); +++ ks_free(&str); +++ goto fail; +++ } +++ ks_free(&str); +++ +++ if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), +++ param->arg_list ? "CL" : NULL, +++ param->arg_list ? param->arg_list : NULL, +++ NULL) != 0) { +++ fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n"); ++ } ++ ++- if (sam_hdr_write(out, header) < 0) { +++ if (sam_hdr_write(param->out, header) < 0) { ++ fprintf(stderr, "[markdup] error writing header.\n"); ++- return 1; +++ goto fail; +++ } +++ if (param->write_index) { +++ if (!(idx_fn = auto_index(param->out, param->out_fn, header))) +++ goto fail; ++ } ++ ++ // used for coordinate order checks ++@@ -521,30 +1195,35 @@ ++ ++ // get the buffer going ++ in_read = kl_pushp(read_queue, read_buffer); +++ if (!in_read) { +++ fprintf(stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } ++ ++ // handling supplementary reads needs a temporary file ++- if (supp) { ++- if (tmp_file_open_write(&temp, prefix, 1)) { ++- fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); ++- return 1; +++ if (param->supp) { +++ if (tmp_file_open_write(&temp, param->prefix, 1)) { +++ fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); +++ goto fail; ++ } ++ } ++ ++ if ((in_read->b = bam_init1()) == NULL) { ++ fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); ++- return 1; +++ goto fail; ++ } ++ ++- reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0; +++ reading = writing = excluded = single_dup = 
duplicate = examined = pair = single = optical = single_optical = 0; +++ np_duplicate = np_opt_duplicate = 0; ++ ++- while ((ret = sam_read1(in, header, in_read->b)) >= 0) { +++ while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { ++ ++ // do some basic coordinate order checks ++ if (in_read->b->core.tid >= 0) { // -1 for unmapped reads ++ if (in_read->b->core.tid < prev_tid || ++ ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { ++- fprintf(stderr, "[markdup] error: bad coordinate order.\n"); ++- return 1; +++ fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n"); +++ goto fail; ++ } ++ } ++ ++@@ -555,10 +1234,30 @@ ++ ++ reading++; ++ ++- // read must not be secondary, supplementary, unmapped or failed QC ++- if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { ++- examined++; +++ if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { +++ uint8_t *data; +++ +++ in_read->b->core.flag ^= BAM_FDUP; ++ +++ if ((data = bam_aux_get(in_read->b, "dt")) != NULL) { +++ bam_aux_del(in_read->b, data); +++ } +++ +++ if ((data = bam_aux_get(in_read->b, "do")) != NULL) { +++ bam_aux_del(in_read->b, data); +++ } +++ } +++ +++ if (param->include_fails) { +++ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP); +++ } else { +++ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL); +++ } +++ +++ // read must not be secondary, supplementary, unmapped or (possibly) failed QC +++ if (!(in_read->b->core.flag & exclude)) { +++ examined++; +++ in_read->duplicate = NULL; ++ ++ // look at the pairs first ++ if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { ++@@ -567,9 +1266,16 @@ ++ key_data_t single_key; ++ in_hash_t *bp; ++ ++- if (make_pair_key(&pair_key, in_read->b)) { ++- fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); ++- return 1; +++ if (param->mode) { +++ if 
(make_pair_key_sequence(&pair_key, in_read->b)) { +++ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); +++ goto fail; +++ } +++ } else { +++ if (make_pair_key_template(&pair_key, in_read->b)) { +++ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); +++ goto fail; +++ } ++ } ++ ++ make_single_key(&single_key, in_read->b); ++@@ -583,40 +1289,32 @@ ++ if (ret > 0) { // new ++ // add to single duplicate hash ++ bp = &kh_val(single_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->single_key = single_key; ++ } else if (ret == 0) { // exists ++ // look at singles only for duplication marking ++ bp = &kh_val(single_hash, k); ++ ++- if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { ++- bam1_t *dup = bp->p; +++ if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { +++ // singleton will always be marked duplicate even if +++ // scores more than one read of the pair +++ bam1_t *dup = bp->p->b; +++ +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ ++- // singleton will always be marked duplicate even if ++- // scores more than one read of the pair +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) +++ goto fail; ++ ++- bp->p = in_read->b; ++- dup->core.flag |= BAM_FDUP; ++ single_dup++; ++ ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } ++- } ++ } ++ } else { ++ fprintf(stderr, "[markdup] error: single hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // now do the pair ++@@ 
-625,33 +1323,44 @@ ++ if (ret > 0) { // new ++ // add to the pair hash ++ bp = &kh_val(pair_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->pair_key = pair_key; ++ } else if (ret == 0) { ++ int64_t old_score, new_score, tie_add = 0; ++ bam1_t *dup; +++ int check_chain = 0; ++ ++ bp = &kh_val(pair_hash, k); ++ ++- if ((mate_tmp = get_mate_score(bp->p)) == -1) { ++- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); ++- return 1; +++ if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) { +++ if (bp->p->b->core.flag & BAM_FQCFAIL) { +++ old_score = 0; +++ new_score = 1; +++ } else { +++ old_score = 1; +++ new_score = 0; +++ } ++ } else { ++- old_score = calc_score(bp->p) + mate_tmp; ++- } +++ if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { +++ fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); +++ goto fail; +++ } else { +++ old_score = calc_score(bp->p->b) + mate_tmp; +++ } ++ ++- if ((mate_tmp = get_mate_score(in_read->b)) == -1) { ++- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); ++- return 1; ++- } else { ++- new_score = calc_score(in_read->b) + mate_tmp; +++ if ((mate_tmp = get_mate_score(in_read->b)) == -1) { +++ fprintf(stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); +++ goto fail; +++ } else { +++ new_score = calc_score(in_read->b) + mate_tmp; +++ } ++ } ++ ++ // choose the highest score as the original ++ // and add it to the pair hash, mark the other as duplicate ++ ++ if (new_score == old_score) { ++- if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { +++ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) { ++ tie_add = 1; ++ } else { ++ tie_add = -1; ++@@ -659,39 +1368,40 @@ ++ } ++ ++ if (new_score + tie_add > old_score) { // swap reads ++- dup = bp->p; ++- bp->p = in_read->b; +++ dup = bp->p->b; +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ } else { +++ if (bp->p->duplicate) { +++ in_read->duplicate = bp->p->duplicate; +++ check_chain = 1; +++ } +++ +++ bp->p->duplicate = in_read; ++ dup = in_read->b; ++ } ++ ++- dup->core.flag |= BAM_FDUP; ++- ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) +++ goto fail; ++ +++ if (check_chain) { +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ ++ duplicate++; ++ } else { ++ fprintf(stderr, "[markdup] error: pair hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { // do the single (or effectively single) reads ++ int ret; ++ key_data_t single_key; ++ in_hash_t *bp; +++ int check_chain = 0; ++ ++ make_single_key(&single_key, in_read->b); ++ ++@@ -702,68 +1412,76 @@ ++ ++ if (ret > 0) { // new 
++ bp = &kh_val(single_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->single_key = single_key; ++ } else if (ret == 0) { // exists ++ bp = &kh_val(single_hash, k); ++ ++- if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { +++ if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { ++ // if matched against one of a pair just mark as duplicate ++ ++- if (tag) { ++- if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ if (bp->p->duplicate) { +++ in_read->duplicate = bp->p->duplicate; +++ check_chain = 1; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, in_read->b)) { ++- return 1; ++- } ++- } +++ bp->p->duplicate = in_read; +++ +++ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) +++ goto fail; +++ +++ if (check_chain) { +++ // check the new duplicate entry in the chain +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- in_read->b->core.flag |= BAM_FDUP; +++ // check against the new original +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; +++ ++ } else { ++ int64_t old_score, new_score; ++ bam1_t *dup; ++ ++- old_score = calc_score(bp->p); +++ old_score = calc_score(bp->p->b); ++ new_score = calc_score(in_read->b); ++ ++ // choose the highest score as the original, add it ++ // to the single hash and mark the other as duplicate ++ if (new_score > old_score) { // swap reads ++- dup = bp->p; ++- bp->p = in_read->b; +++ dup = bp->p->b; +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ } else { +++ if (bp->p->duplicate) { +++ in_read->duplicate = bp->p->duplicate; +++ 
check_chain = 1; +++ } +++ +++ bp->p->duplicate = in_read; ++ dup = in_read->b; ++ } ++ ++- dup->core.flag |= BAM_FDUP; +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) +++ goto fail; ++ ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ +++ if (check_chain) { +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; +++ +++ ++ } ++- } ++ ++ single_dup++; ++ } else { ++ fprintf(stderr, "[markdup] error: single hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ } else { ++@@ -778,20 +1496,20 @@ ++ ++ /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads ++ should just be written as they cannot be matched as duplicates. 
*/ ++- if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { +++ if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { ++ break; ++ } ++ ++- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { ++- if (supp) { +++ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { +++ if (param->supp) { ++ if (tmp_file_write(&temp, in_read->b)) { ++ fprintf(stderr, "[markdup] error: writing temp output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { ++- if (sam_write1(out, header, in_read->b) < 0) { +++ if (sam_write1(param->out, header, in_read->b) < 0) { ++ fprintf(stderr, "[markdup] error: writing output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ ++@@ -816,16 +1534,20 @@ ++ ++ // set the next one up for reading ++ in_read = kl_pushp(read_queue, read_buffer); +++ if (!in_read) { +++ fprintf(stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } ++ ++ if ((in_read->b = bam_init1()) == NULL) { ++ fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ ++ if (ret < -1) { ++ fprintf(stderr, "[markdup] error: truncated input file.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // write out the end of the list ++@@ -834,16 +1556,16 @@ ++ in_read = &kl_val(rq); ++ ++ if (bam_get_qname(in_read->b)) { // last entry will be blank ++- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { ++- if (supp) { +++ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { +++ if (param->supp) { ++ if (tmp_file_write(&temp, in_read->b)) { ++ fprintf(stderr, "[markdup] error: writing temp output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { ++- if (sam_write1(out, header, in_read->b) < 0) { +++ if (sam_write1(param->out, header, in_read->b) < 0) { ++ fprintf(stderr, "[markdup] error: writing output failed.\n"); ++- return 1; 
+++ goto fail; ++ } ++ } ++ ++@@ -856,71 +1578,155 @@ ++ rq = kl_begin(read_buffer); ++ } ++ ++- if (supp) { +++ if (param->supp) { ++ bam1_t *b; ++ ++ if (tmp_file_end_write(&temp)) { ++ fprintf(stderr, "[markdup] error: unable to end tmp writing.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // read data from temp file and mark duplicate supplementary alignments ++ ++- if (tmp_file_begin_read(&temp, NULL)) { ++- return 1; +++ if (tmp_file_begin_read(&temp)) { +++ goto fail; ++ } ++ ++ b = bam_init1(); ++ ++ while ((ret = tmp_file_read(&temp, b)) > 0) { ++ ++- if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { +++ if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) { +++ ++ k = kh_get(duplicates, dup_hash, bam_get_qname(b)); ++ ++ if (k != kh_end(dup_hash)) { +++ ++ b->core.flag |= BAM_FDUP; +++ np_duplicate++; +++ +++ if (param->tag && kh_val(dup_hash, k).name) { +++ if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { +++ fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); +++ goto fail; +++ } +++ } +++ +++ if (param->opt_dist) { +++ if (kh_val(dup_hash, k).type) { +++ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); +++ np_opt_duplicate++; +++ } else { +++ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); +++ } +++ } ++ } ++ } ++ ++- if (!remove_dups || !(b->core.flag & BAM_FDUP)) { ++- if (sam_write1(out, header, b) < 0) { +++ if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { +++ if (sam_write1(param->out, header, b) < 0) { ++ fprintf(stderr, "[markdup] error: writing final output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ } ++ ++ if (ret == -1) { ++ fprintf(stderr, "[markdup] error: failed to read tmp file.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { ++ if (kh_exist(dup_hash, k)) { +++ free(kh_val(dup_hash, 
k).name); ++ free((char *)kh_key(dup_hash, k)); +++ kh_key(dup_hash, k) = NULL; ++ } ++ } ++ ++- tmp_file_destroy(&temp, b, 0); ++- kh_destroy(duplicates, dup_hash); +++ tmp_file_destroy(&temp); ++ bam_destroy1(b); ++ } ++ ++- if (do_stats) { ++- fprintf(stderr, "READ %d WRITTEN %d \n" ++- "EXCLUDED %d EXAMINED %d\n" ++- "PAIRED %d SINGLE %d\n" ++- "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" ++- "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, ++- duplicate, single_dup, single_dup + duplicate); +++ if (opt_warnings) { +++ fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", +++ opt_warnings); +++ } +++ +++ if (param->do_stats) { +++ FILE *fp; +++ int file_open = 0; +++ unsigned long els; +++ +++ if (param->stats_file) { +++ if (NULL == (fp = fopen(param->stats_file, "w"))) { +++ fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); +++ fp = stderr; +++ } else { +++ file_open = 1; +++ } +++ } else { +++ fp = stderr; +++ } +++ +++ els = estimate_library_size(pair, duplicate - optical); +++ +++ fprintf(fp, +++ "COMMAND: %s\n" +++ "READ: %ld\n" +++ "WRITTEN: %ld\n" +++ "EXCLUDED: %ld\n" +++ "EXAMINED: %ld\n" +++ "PAIRED: %ld\n" +++ "SINGLE: %ld\n" +++ "DUPLICATE PAIR: %ld\n" +++ "DUPLICATE SINGLE: %ld\n" +++ "DUPLICATE PAIR OPTICAL: %ld\n" +++ "DUPLICATE SINGLE OPTICAL: %ld\n" +++ "DUPLICATE NON PRIMARY: %ld\n" +++ "DUPLICATE NON PRIMARY OPTICAL: %ld\n" +++ "DUPLICATE PRIMARY TOTAL: %ld\n" +++ "DUPLICATE TOTAL: %ld\n" +++ "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, +++ duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, +++ single_dup + duplicate, single_dup + duplicate + np_duplicate, els); +++ +++ if (file_open) { +++ fclose(fp); +++ } +++ } +++ +++ if (param->write_index) { +++ if (sam_idx_save(param->out) < 0) { +++ print_error_errno("markdup", "writing index failed"); 
+++ goto fail; +++ } ++ } ++ ++ kh_destroy(reads, pair_hash); ++ kh_destroy(reads, single_hash); ++ kl_destroy(read_queue, read_buffer); ++- bam_hdr_destroy(header); +++ kh_destroy(duplicates, dup_hash); +++ sam_hdr_destroy(header); ++ ++ return 0; +++ +++ fail: +++ for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq)) +++ bam_destroy1(kl_val(rq).b); +++ kl_destroy(read_queue, read_buffer); +++ +++ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { +++ if (kh_exist(dup_hash, k)) { +++ free((char *)kh_key(dup_hash, k)); +++ } +++ } +++ kh_destroy(duplicates, dup_hash); +++ +++ kh_destroy(reads, pair_hash); +++ kh_destroy(reads, single_hash); +++ sam_hdr_destroy(header); +++ return 1; ++ } ++ ++ ++@@ -928,15 +1734,23 @@ ++ fprintf(stderr, "\n"); ++ fprintf(stderr, "Usage: samtools markdup \n\n"); ++ fprintf(stderr, "Option: \n"); ++- fprintf(stderr, " -r Remove duplicate reads\n"); ++- fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); ++- fprintf(stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); ++- fprintf(stderr, " -s Report stats.\n"); ++- fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); ++- fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." +++ fprintf(stderr, " -r Remove duplicate reads\n"); +++ fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); +++ fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); +++ fprintf(stderr, " -s Report stats.\n"); +++ fprintf(stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); +++ fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); +++ fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); +++ fprintf(stderr, " -c Clear previous duplicate settings and tags.\n"); +++ fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" +++ " TYPE = t measure positions based on template start/end (default).\n" +++ " s measure positions based on sequence start.\n"); +++ fprintf(stderr, " --include-fails Include quality check failed reads.\n"); +++ fprintf(stderr, " --no-PG Do not add a PG line\n"); +++ fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." ++ " Mainly for information and debugging.\n"); ++ ++- sam_global_opt_help(stderr, "-.O..@"); +++ sam_global_opt_help(stderr, "-.O..@.."); ++ ++ fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone" ++ " through fixmates with the mate scoring option on.\n"); ++@@ -946,29 +1760,47 @@ ++ ++ ++ int bam_markdup(int argc, char **argv) { ++- int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; ++- int32_t max_length = 300; ++- samFile *in = NULL, *out = NULL; +++ int c, ret; ++ char wmode[3] = {'w', 'b', 0}; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ htsThreadPool p = {NULL, 0}; ++ kstring_t tmpprefix = {0, 0, NULL}; ++ struct stat st; ++ unsigned int t; +++ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), +++ {"include-fails", no_argument, NULL, 1001}, +++ {"no-PG", no_argument, NULL, 1002}, +++ {"mode", required_argument, NULL, 'm'}, ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { ++ switch (c) { ++- case 'r': remove_dups = 1; 
break; ++- case 'l': max_length = atoi(optarg); break; ++- case 's': report_stats = 1; break; +++ case 'r': param.remove_dups = 1; break; +++ case 'l': param.max_length = atoi(optarg); break; +++ case 's': param.do_stats = 1; break; ++ case 'T': kputs(optarg, &tmpprefix); break; ++- case 'S': include_supplementary = 1; break; ++- case 't': tag_dup = 1; break; +++ case 'S': param.supp = 1; break; +++ case 't': param.tag = 1; break; +++ case 'f': param.stats_file = optarg; param.do_stats = 1; break; +++ case 'd': param.opt_dist = atoi(optarg); break; +++ case 'c': param.clear = 1; break; +++ case 'm': +++ if (strcmp(optarg, "t") == 0) { +++ param.mode = MD_MODE_TEMPLATE; +++ } else if (strcmp(optarg, "s") == 0) { +++ param.mode = MD_MODE_SEQUENCE; +++ } else { +++ fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg); +++ return markdup_usage(); +++ } +++ +++ break; +++ case 1001: param.include_fails = 1; break; +++ case 1002: param.no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return markdup_usage(); ++@@ -978,17 +1810,20 @@ ++ if (optind + 2 > argc) ++ return markdup_usage(); ++ ++- in = sam_open_format(argv[optind], "r", &ga.in); +++ if (param.opt_dist < 0) param.opt_dist = 0; +++ if (param.max_length < 0) param.max_length = 300; +++ +++ param.in = sam_open_format(argv[optind], "r", &ga.in); ++ ++- if (!in) { +++ if (!param.in) { ++ print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); ++ return 1; ++ } ++ ++ sam_open_mode(wmode + 1, argv[optind + 1], NULL); ++- out = sam_open_format(argv[optind + 1], wmode, &ga.out); +++ param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); ++ ++- if (!out) { +++ if (!param.out) { ++ print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); ++ return 1; ++ } ++@@ -999,8 +1834,8 @@ ++ return 1; ++ } ++ ++- hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); ++- hts_set_opt(out, 
HTS_OPT_THREAD_POOL, &p); +++ hts_set_opt(param.in, HTS_OPT_THREAD_POOL, &p); +++ hts_set_opt(param.out, HTS_OPT_THREAD_POOL, &p); ++ } ++ ++ // actual stuff happens here ++@@ -1020,18 +1855,24 @@ ++ ++ t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); ++ ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); +++ param.prefix = tmpprefix.s; +++ +++ param.arg_list = stringify_argv(argc + 1, argv - 1); +++ param.write_index = ga.write_index; +++ param.out_fn = argv[optind + 1]; ++ ++- ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); +++ ret = bam_mark_duplicates(&param); ++ ++- sam_close(in); +++ sam_close(param.in); ++ ++- if (sam_close(out) < 0) { +++ if (sam_close(param.out) < 0) { ++ fprintf(stderr, "[markdup] error closing output file\n"); ++ ret = 1; ++ } ++ ++ if (p.pool) hts_tpool_destroy(p.pool); ++ +++ free(param.arg_list); ++ free(tmpprefix.s); ++ sam_global_args_free(&ga); ++ ++--- python-pysam.orig/samtools/bam_markdup.c.pysam.c +++++ python-pysam/samtools/bam_markdup.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone ++ through fixmates with the mate scoring option on. ++ ++- Copyright (C) 2017-18 Genome Research Ltd. +++ Copyright (C) 2017-2019 Genome Research Ltd. ++ ++ Author: Andrew Whitwham ++ ++@@ -24,6 +24,9 @@ ++ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ DEALINGS IN THE SOFTWARE +++ +++Estimate library size derived from Picard DuplicationMetrics.java +++Copyright (c) 2009,2018 The Broad Institute. MIT license.
++ */ ++ ++ #include ++@@ -35,6 +38,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include "htslib/thread_pool.h" ++ #include "htslib/sam.h" ++ #include "sam_opts.h" ++@@ -44,26 +48,53 @@ ++ #include "htslib/kstring.h" ++ #include "tmp_file.h" ++ +++ +++typedef struct { +++ samFile *in; +++ samFile *out; +++ char *prefix; +++ int remove_dups; +++ int32_t max_length; +++ int do_stats; +++ int supp; +++ int tag; +++ int opt_dist; +++ int no_pg; +++ int clear; +++ int mode; +++ int write_index; +++ int include_fails; +++ char *stats_file; +++ char *arg_list; +++ char *out_fn; +++} md_param_t; +++ ++ typedef struct { ++- int32_t single; +++ hts_pos_t this_coord; +++ hts_pos_t other_coord; ++ int32_t this_ref; ++- int32_t this_coord; ++ int32_t other_ref; ++- int32_t other_coord; ++- int32_t leftmost; ++- int32_t orientation; +++ int8_t single; +++ int8_t leftmost; +++ int8_t orientation; ++ } key_data_t; ++ +++typedef struct read_queue_s { +++ key_data_t pair_key; +++ key_data_t single_key; +++ bam1_t *b; +++ struct read_queue_s *duplicate; +++ hts_pos_t pos; +++} read_queue_t; +++ ++ typedef struct { ++- bam1_t *p; +++ read_queue_t *p; ++ } in_hash_t; ++ ++ typedef struct { ++- bam1_t *b; ++- int32_t pos; ++- key_data_t pair_key; ++- key_data_t single_key; ++-} read_queue_t; +++ char *name; +++ char type; +++} dup_map_t; ++ ++ ++ ++@@ -74,22 +105,22 @@ ++ khint_t hash; ++ ++ if (key.single) { ++- unsigned char sig[12]; +++ unsigned char sig[13]; ++ ++ memcpy(sig + i, &key.this_ref, 4); i += 4; ++- memcpy(sig + i, &key.this_coord, 4); i += 4; ++- memcpy(sig + i, &key.orientation, 4); i += 4; +++ memcpy(sig + i, &key.this_coord, 8); i += 8; +++ memcpy(sig + i, &key.orientation, 1); i += 1; ++ ++ hash = do_hash(sig, i); ++ } else { ++- unsigned char sig[24]; +++ unsigned char sig[26]; ++ ++ memcpy(sig + i, &key.this_ref, 4); i += 4; ++- memcpy(sig + i, &key.this_coord, 4); i += 4; +++ memcpy(sig + i, &key.this_coord, 8); i += 8; ++ memcpy(sig + i, 
&key.other_ref, 4); i += 4; ++- memcpy(sig + i, &key.other_coord, 4); i += 4; ++- memcpy(sig + i, &key.leftmost, 4); i += 4; ++- memcpy(sig + i, &key.orientation, 4); i += 4; +++ memcpy(sig + i, &key.other_coord, 8); i += 8; +++ memcpy(sig + i, &key.leftmost, 1); i += 1; +++ memcpy(sig + i, &key.orientation, 1); i += 1; ++ ++ hash = do_hash(sig, i); ++ } ++@@ -124,21 +155,35 @@ ++ ++ ++ #define __free_queue_element(p) +++ +++// Orientations (prime numbers to feed to hashing algorithm) ++ #define O_FF 2 ++ #define O_RR 3 ++ #define O_FR 5 ++ #define O_RF 7 ++ +++// Left or rightmost +++#define R_LE 11 +++#define R_RI 13 +++ +++#define BMD_WARNING_MAX 10 +++ +++#define MD_MIN_QUALITY 15 +++ +++// Duplicate finding mode +++#define MD_MODE_TEMPLATE 0 +++#define MD_MODE_SEQUENCE 1 +++ ++ KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash ++ KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer ++-KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id +++KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id ++ ++ ++ /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ ++ ++-static int32_t unclipped_other_start(int32_t op, char *cigar) { +++static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { ++ char *c = cigar; ++- int32_t clipped = 0; +++ int64_t clipped = 0; ++ ++ while (*c && *c != '*') { ++ long num = 0; ++@@ -164,9 +209,9 @@ ++ ++ /* Calculate the current read's start based on the stored cigar string. 
*/ ++ ++-static int32_t unclipped_start(bam1_t *b) { +++static hts_pos_t unclipped_start(bam1_t *b) { ++ uint32_t *cigar = bam_get_cigar(b); ++- int32_t clipped = 0; +++ int64_t clipped = 0; ++ uint32_t i; ++ ++ for (i = 0; i < b->core.n_cigar; i++) { ++@@ -185,9 +230,9 @@ ++ ++ /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ ++ ++-static int32_t unclipped_other_end(int32_t op, char *cigar) { +++static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { ++ char *c = cigar; ++- int32_t refpos = 0; +++ int64_t refpos = 0; ++ int skip = 1; ++ ++ while (*c && *c != '*') { ++@@ -226,9 +271,9 @@ ++ ++ /* Calculate the current read's end based on the stored cigar string. */ ++ ++-static int32_t unclipped_end(bam1_t *b) { +++static hts_pos_t unclipped_end(bam1_t *b) { ++ uint32_t *cigar = bam_get_cigar(b); ++- int32_t end_pos, clipped = 0; +++ hts_pos_t end_pos, clipped = 0; ++ int32_t i; ++ ++ end_pos = bam_endpos(b); ++@@ -295,7 +340,7 @@ ++ int i; ++ ++ for (i = 0; i < b->core.l_qseq; i++) { ++- if (qual[i] >= 15) score += qual[i]; +++ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; ++ } ++ ++ return score; ++@@ -307,10 +352,10 @@ ++ the reference id, orientation and whether the current ++ read is leftmost of the pair. */ ++ ++-static int make_pair_key(key_data_t *key, bam1_t *bam) { ++- int32_t this_ref, this_coord, this_end; ++- int32_t other_ref, other_coord, other_end; ++- int32_t orientation, leftmost; +++static int make_pair_key_template(key_data_t *key, bam1_t *bam) { +++ hts_pos_t this_coord, other_coord, this_end, other_end; +++ int32_t this_ref, other_ref; +++ int8_t orientation, leftmost; ++ uint8_t *data; ++ char *cig; ++ ++@@ -321,7 +366,11 @@ ++ this_end = unclipped_end(bam); ++ ++ if ((data = bam_aux_get(bam, "MC"))) { ++- cig = bam_aux2Z(data); +++ if (!(cig = bam_aux2Z(data))) { +++ fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. 
Please use the MC tag provided by samtools fixmate.\n"); +++ return 1; +++ } +++ ++ other_end = unclipped_other_end(bam->core.mpos, cig); ++ other_coord = unclipped_other_start(bam->core.mpos, cig); ++ } else { ++@@ -404,9 +453,9 @@ ++ } ++ ++ if (!leftmost) ++- leftmost = 13; +++ leftmost = R_RI; ++ else ++- leftmost = 11; +++ leftmost = R_LE; ++ ++ key->single = 0; ++ key->this_ref = this_ref; ++@@ -420,13 +469,140 @@ ++ } ++ ++ +++static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { +++ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; +++ int32_t this_ref, other_ref; +++ int8_t orientation, left_read; +++ uint8_t *data; +++ char *cig; +++ +++ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash +++ other_ref = bam->core.mtid + 1; +++ +++ this_coord = unclipped_start(bam); +++ this_end = unclipped_end(bam); +++ +++ if ((data = bam_aux_get(bam, "MC"))) { +++ if (!(cig = bam_aux2Z(data))) { +++ fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); +++ return 1; +++ } +++ +++ other_end = unclipped_other_end(bam->core.mpos, cig); +++ other_coord = unclipped_other_start(bam->core.mpos, cig); +++ } else { +++ fprintf(samtools_stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); +++ return 1; +++ } +++ +++ // work out orientations +++ if (this_ref != other_ref) { +++ leftmost = this_ref - other_ref; +++ } else { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ if (!bam_is_rev(bam)) { +++ leftmost = this_coord - other_coord; +++ } else { +++ leftmost = this_end - other_end; +++ } +++ } else { +++ if (bam_is_rev(bam)) { +++ leftmost = this_end - other_coord; +++ } else { +++ leftmost = this_coord - other_end; +++ } +++ } +++ } +++ +++ if (leftmost < 0) { +++ leftmost = 1; +++ } else if (leftmost > 0) { +++ leftmost = 0; +++ } else { +++ // tie breaks +++ +++ if (bam->core.pos == bam->core.mpos) { +++ if (bam->core.flag & BAM_FREAD1) { +++ leftmost = 1; +++ } else { +++ leftmost = 0; +++ } +++ } else if (bam->core.pos < bam->core.mpos) { +++ leftmost = 1; +++ } else { +++ leftmost = 0; +++ } +++ } +++ +++ // pair orientation +++ if (leftmost) { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ +++ if (!bam_is_rev(bam)) { +++ orientation = O_FF; +++ } else { +++ orientation = O_RR; +++ } +++ } else { +++ if (!bam_is_rev(bam)) { +++ orientation = O_FR; +++ } else { +++ orientation = O_RF; +++ } +++ } +++ } else { +++ if (bam_is_rev(bam) == bam_is_mrev(bam)) { +++ +++ if (!bam_is_rev(bam)) { +++ orientation = O_RR; +++ } else { +++ orientation = O_FF; +++ } +++ } else { +++ if (!bam_is_rev(bam)) { +++ orientation = O_RF; +++ } else { +++ orientation = O_FR; +++ } +++ } +++ } +++ +++ if (!leftmost) +++ left_read = R_RI; +++ else +++ left_read = R_LE; +++ +++ if (!bam_is_rev(bam)) { +++ this_coord = unclipped_start(bam); +++ } else { +++ this_coord = unclipped_end(bam); +++ } +++ +++ if (!bam_is_mrev(bam)) { +++ other_coord = unclipped_other_start(bam->core.mpos, cig); +++ } else { +++ other_coord = unclipped_other_end(bam->core.mpos, cig); +++ } +++ +++ key->single = 0; +++ key->this_ref = this_ref; +++ key->this_coord = this_coord; +++ key->other_ref = other_ref; +++ key->other_coord = 
other_coord; +++ key->leftmost = left_read; +++ key->orientation = orientation; +++ +++ return 0; +++} +++ ++ /* Create a signature hash of single read (or read with an unmatched pair). ++ Uses unclipped start (or end depending on orientation), reference id, ++ and orientation. */ ++ ++ static void make_single_key(key_data_t *key, bam1_t *bam) { ++- int32_t this_ref, this_coord; ++- int32_t orientation; +++ hts_pos_t this_coord; +++ int32_t this_ref; +++ int8_t orientation; ++ ++ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash ++ ++@@ -444,23 +620,45 @@ ++ key->orientation = orientation; ++ } ++ +++ ++ /* Add the duplicate name to a hash if it does not exist. */ ++ ++-static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { +++static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { ++ khiter_t d; ++ int ret; ++ ++ d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); ++ ++ if (d == kh_end(d_hash)) { ++- d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); +++ char *name = strdup(bam_get_qname(dupe)); +++ if (name) { +++ d = kh_put(duplicates, d_hash, name, &ret); +++ } else { +++ ret = -1; +++ } +++ +++ if (ret >= 0) { +++ if (orig_name) { +++ if (ret == 0) { +++ // replace old name +++ free(kh_value(d_hash, d).name); +++ free(name); +++ } ++ ++- if (ret > 0) { ++- kh_value(d_hash, d) = 1; ++- } else if (ret == 0) { ++- kh_value(d_hash, d)++; +++ kh_value(d_hash, d).name = strdup(orig_name); +++ +++ if (kh_value(d_hash, d).name == NULL) { +++ fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); +++ return 1; +++ } +++ } else { +++ kh_value(d_hash, d).name = NULL; +++ } +++ +++ kh_value(d_hash, d).type = type; ++ } else { ++ fprintf(samtools_stderr, "[markdup] error: unable to store supplementary duplicates.\n"); +++ free(name); ++ return 1; ++ } ++ } ++@@ -469,6 +667,467 @@ ++ } ++ ++ +++static inline int 
get_coordinate_positions(const char *qname, int *xpos, int *ypos) { +++ int sep = 0; +++ int pos = 0; +++ +++ while (qname[pos]) { +++ if (qname[pos] == ':') { +++ sep++; +++ +++ if (sep == 2) { +++ *xpos = pos + 1; +++ } else if (sep == 3) { +++ *ypos = pos + 1; +++ } else if (sep == 4) { // HiSeq style names +++ *xpos = *ypos; +++ *ypos = pos + 1; +++ } else if (sep == 5) { // Newer Illumina format +++ *xpos = pos + 1; +++ } else if (sep == 6) { +++ *ypos = pos + 1; +++ } +++ } +++ +++ pos++; +++ } +++ +++ return sep; +++} +++ +++/* Using the coordinates from the Illumina read name, see whether the duplicated read is +++ close enough (set by max_dist) to the original to be counted as optical.*/ +++ +++static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { +++ int ret = 0, seps; +++ char *original, *duplicate; +++ int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; +++ +++ +++ original = bam_get_qname(ori); +++ duplicate = bam_get_qname(dup); +++ +++ seps = get_coordinate_positions(original, &oxpos, &oypos); +++ +++ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); +++ } +++ +++ return ret; +++ } +++ +++ seps = get_coordinate_positions(duplicate, &dxpos, &dypos); +++ +++ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { +++ +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (strncmp(original, duplicate, oxpos - 1) == 0) { +++ // the initial parts match, look at the numbers +++ long ox, oy, dx, dy, xdiff, ydiff; +++ char *end; +++ +++ ox = strtol(original + oxpos, &end, 10); +++ +++ if ((original + oxpos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= 
BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original); +++ } +++ +++ return ret; +++ } +++ +++ dx = strtol(duplicate + dxpos, &end, 10); +++ +++ if ((duplicate + dxpos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (ox > dx) { +++ xdiff = ox - dx; +++ } else { +++ xdiff = dx - ox; +++ } +++ +++ if (xdiff <= max_dist) { +++ // still might be optical +++ +++ oy = strtol(original + oypos, &end, 10); +++ +++ if ((original + oypos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); +++ } +++ +++ return ret; +++ } +++ +++ dy = strtol(duplicate + dypos, &end, 10); +++ +++ if ((duplicate + dypos) == end) { +++ (*warnings)++; +++ +++ if (*warnings <= BMD_WARNING_MAX) { +++ fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); +++ } +++ +++ return ret; +++ } +++ +++ if (oy > dy) { +++ ydiff = oy - dy; +++ } else { +++ ydiff = dy - oy; +++ } +++ +++ if (ydiff <= max_dist) ret = 1; +++ } +++ } +++ +++ return ret; +++} +++ +++ +++static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, +++ long *optical, long *warn) { +++ char dup_type = 0; +++ long incoming_warnings = *warn; +++ +++ dup->core.flag |= BAM_FDUP; +++ +++ if (param->tag) { +++ if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { +++ fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); +++ return -1; +++ } +++ } +++ +++ if (param->opt_dist) { // mark optical duplicates +++ if (optical_duplicate(ori, dup, param->opt_dist, warn)) { +++ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); +++ dup_type = 
'O'; +++ (*optical)++; +++ } else { +++ // not an optical duplicate +++ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); +++ } +++ } +++ +++ if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { +++ fprintf(samtools_stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n", +++ *warn); +++ } +++ +++ if (param->supp) { +++ if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) { +++ char *original = NULL; +++ +++ if (param->tag) { +++ original = bam_get_qname(ori); +++ } +++ +++ if (add_duplicate(dup_hash, dup, original, dup_type)) +++ return -1; +++ } +++ } +++ +++ return 0; +++} +++ +++ +++static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { +++ int ret = 0; +++ uint8_t *data; +++ +++ // remove any existing dt tag +++ if ((data = bam_aux_get(b, "dt")) != NULL) { +++ bam_aux_del(b, data); +++ } +++ +++ if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { +++ fprintf(samtools_stderr, "[markdup] error: unable to append 'dt' tag.\n"); +++ ret = -1; +++ } +++ +++ if (paired) { +++ (*optical_pair)++; +++ } else { +++ (*optical_single)++; +++ } +++ +++ if (param->supp) { +++ // Change the duplicate type +++ +++ if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP) +++ || bam_aux_get(b, "XA")) { +++ khiter_t d; +++ +++ d = kh_get(duplicates, dup_hash, bam_get_qname(b)); +++ +++ if (d == kh_end(dup_hash)) { +++ // error, name should already be in dup hash +++ fprintf(samtools_stderr, "[markdup] error: duplicate name %s not found in hash.\n", +++ bam_get_qname(b)); +++ ret = -1; +++ } else { +++ kh_value(dup_hash, d).type = 'O'; +++ } +++ } +++ } +++ +++ return ret; +++} +++ +++ +++ +++/* +++ Where there is more than one duplicate go down the list and check for optical duplicates and change +++ do tags (where used) to point to original (non-duplicate) read. 
+++*/ +++static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, +++ long *warn, long *optical_single, long *optical_pair) { +++ int ret = 0; +++ read_queue_t *current = ori->duplicate; +++ char *ori_name = bam_get_qname(ori->b); +++ int have_original = !(ori->b->core.flag & BAM_FDUP); +++ int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); +++ +++ while (current) { +++ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); +++ +++ if (param->tag && have_original) { +++ uint8_t *data; +++ +++ // at this stage all duplicates should have a do tag +++ if ((data = bam_aux_get(current->b, "do")) != NULL) { +++ // see if we need to change the tag +++ char *old_name = bam_aux2Z(data); +++ +++ if (old_name) { +++ if (strcmp(old_name, ori_name) != 0) { +++ bam_aux_del(current->b, data); +++ +++ if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { +++ fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); +++ ret = -1; +++ break; +++ } +++ } +++ } else { +++ fprintf(samtools_stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); +++ ret = -1; +++ break; +++ } +++ } +++ } +++ +++ if (param->opt_dist) { +++ int is_cur_opt = 0, is_ori_opt = 0; +++ uint8_t *data; +++ char *dup_type; +++ +++ if ((data = bam_aux_get(ori->b, "dt"))) { +++ if ((dup_type = bam_aux2Z(data))) { +++ if (strcmp(dup_type, "SQ") == 0) { +++ is_ori_opt = 1; +++ } +++ } +++ } +++ +++ if ((data = bam_aux_get(current->b, "dt"))) { +++ if ((dup_type = bam_aux2Z(data))) { +++ if (strcmp(dup_type, "SQ") == 0) { +++ is_cur_opt = 1; +++ } +++ } +++ } +++ +++ if (!(is_ori_opt && is_cur_opt)) { +++ // if both are already optical duplicates there is no need to check again, otherwise... 
+++ +++ if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { +++ // find out which one is the duplicate +++ int is_cur_dup = 0; +++ +++ if (have_original) { +++ // compared against an original, this is a dup. +++ is_cur_dup = 1; +++ } else if (ori_paired != current_paired) { +++ if (!current_paired) { +++ // current is single vs pair, this is a dup. +++ is_cur_dup = 1; +++ } +++ } else { +++ // do it by scores +++ int64_t ori_score, curr_score; +++ +++ if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { +++ if (ori->b->core.flag & BAM_FQCFAIL) { +++ ori_score = 0; +++ curr_score = 1; +++ } else { +++ ori_score = 1; +++ curr_score = 0; +++ } +++ } else { +++ ori_score = calc_score(ori->b); +++ curr_score = calc_score(current->b); +++ +++ if (current_paired) { +++ // they are pairs so add mate scores. +++ int64_t mate_tmp; +++ +++ if ((mate_tmp = get_mate_score(ori->b)) == -1) { +++ fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); +++ ret = -1; +++ break; +++ } else { +++ ori_score += mate_tmp; +++ } +++ +++ if ((mate_tmp = get_mate_score(current->b)) == -1) { +++ fprintf(samtools_stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); +++ ret = -1; +++ break; +++ } else { +++ curr_score += mate_tmp; +++ } +++ } +++ } +++ +++ if (ori_score == curr_score) { +++ if (strcmp(bam_get_qname(current->b), ori_name) < 0) { +++ curr_score++; +++ } else { +++ curr_score--; +++ } +++ } +++ +++ if (ori_score > curr_score) { +++ is_cur_dup = 1; +++ } +++ } +++ +++ if (is_cur_dup) { +++ // the current is the optical duplicate +++ if (!is_cur_opt) { // only change if not already an optical duplicate +++ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { +++ ret = -1; +++ break; +++ } +++ } +++ } else { +++ if (!is_ori_opt) { +++ if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { +++ ret = -1; +++ break; +++ } +++ } +++ } +++ } +++ } +++ } +++ +++ current = current->duplicate; +++ } +++ +++ return ret; +++} +++ +++/* +++ Function to use when estimating library size. +++ +++ This is based on an approximate formula for the coverage of a set +++ obtained after sampling it a given number of times with replacement. +++ +++ x = number of items in the set (the number of unique fragments in the library) +++ +++ c = number of unique items (unique read pairs observed) +++ +++ n = number of items samples (total number of read pairs) +++ +++ c and n are known; x is unknown. +++ +++ As n -> infinity, the coverage (c/x) can be given as: +++ +++ c / x = 1 - exp(-n / x) (see https://math.stackexchange.com/questions/32800) +++ +++ This needs to be solved for x, so it is rearranged to put both terms on the +++ left side and estimate_library_size() finds a value of x which gives a +++ result of zero (or as close as it can get). 
+++ */ +++static inline double coverage_equation(double x, double c, double n) { +++ return c / x - 1 + exp(-n / x); +++} +++ +++ +++/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ +++static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { +++ unsigned long estimated_size = 0; +++ +++ read_pairs /= 2; +++ duplicate_pairs /= 2; +++ +++ if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { +++ unsigned long unique_pairs = read_pairs - duplicate_pairs; +++ double m = 1; +++ double M = 100; +++ int i; +++ +++ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { +++ fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n"); +++ return estimated_size; +++ } +++ +++ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { +++ M *= 10; +++ } +++ +++ for (i = 0; i < 40; i++) { +++ double r = (m + M) / 2; +++ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); +++ +++ if (u > 0) { +++ m = r; +++ } else if (u < 0) { +++ M = r; +++ } else { +++ break; +++ } +++ } +++ +++ estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); +++ } else { +++ fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size." +++ " Read pairs %ld should be greater than duplicate pairs %ld," +++ " which should both be non zero.\n", +++ read_pairs, duplicate_pairs); +++ } +++ +++ return estimated_size; +++} +++ +++ ++ /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. ++ Generally the highest quality scoring is chosen as the original and all others the duplicates. ++ The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). 
++@@ -478,44 +1137,59 @@ ++ Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write ++ step. This is because the duplicate can occur before the primary read.*/ ++ ++-static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { ++- bam_hdr_t *header; +++static int bam_mark_duplicates(md_param_t *param) { +++ bam_hdr_t *header = NULL; ++ khiter_t k; ++ khash_t(reads) *pair_hash = kh_init(reads); ++ khash_t(reads) *single_hash = kh_init(reads); ++ klist_t(read_queue) *read_buffer = kl_init(read_queue); ++ kliter_t(read_queue) *rq; ++ khash_t(duplicates) *dup_hash = kh_init(duplicates); ++- int32_t prev_tid, prev_coord; +++ int32_t prev_tid; +++ hts_pos_t prev_coord; ++ read_queue_t *in_read; ++ int ret; ++- int reading, writing, excluded, duplicate, single, pair, single_dup, examined; +++ long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; +++ long np_duplicate, np_opt_duplicate; +++ long opt_warnings = 0; ++ tmp_file_t temp; +++ char *idx_fn = NULL; +++ int exclude = 0; ++ ++- if ((header = sam_hdr_read(in)) == NULL) { +++ if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { +++ fprintf(samtools_stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } +++ +++ if ((header = sam_hdr_read(param->in)) == NULL) { ++ fprintf(samtools_stderr, "[markdup] error reading header\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. ++ // only really works on coordinate sorted files. ++- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { ++- char *p, *q; ++- ++- p = strstr(header->text, "\tSO:queryname"); ++- q = strchr(header->text, '\n'); ++- ++- // looking for SO:queryname within @HD only ++- // (e.g. 
must ignore in a @CO comment line later in header) ++- if ((p != 0) && (p < q)) { ++- fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); ++- return 1; ++- } +++ kstring_t str = KS_INITIALIZE; +++ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { +++ fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); +++ ks_free(&str); +++ goto fail; +++ } +++ ks_free(&str); +++ +++ if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), +++ param->arg_list ? "CL" : NULL, +++ param->arg_list ? param->arg_list : NULL, +++ NULL) != 0) { +++ fprintf(samtools_stderr, "[markdup] warning: unable to add @PG line to header.\n"); ++ } ++ ++- if (sam_hdr_write(out, header) < 0) { +++ if (sam_hdr_write(param->out, header) < 0) { ++ fprintf(samtools_stderr, "[markdup] error writing header.\n"); ++- return 1; +++ goto fail; +++ } +++ if (param->write_index) { +++ if (!(idx_fn = auto_index(param->out, param->out_fn, header))) +++ goto fail; ++ } ++ ++ // used for coordinate order checks ++@@ -523,30 +1197,35 @@ ++ ++ // get the buffer going ++ in_read = kl_pushp(read_queue, read_buffer); +++ if (!in_read) { +++ fprintf(samtools_stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } ++ ++ // handling supplementary reads needs a temporary file ++- if (supp) { ++- if (tmp_file_open_write(&temp, prefix, 1)) { ++- fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); ++- return 1; +++ if (param->supp) { +++ if (tmp_file_open_write(&temp, param->prefix, 1)) { +++ fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); +++ goto fail; ++ } ++ } ++ ++ if ((in_read->b = bam_init1()) == NULL) { ++ fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); ++- return 1; +++ goto fail; ++ } ++ ++- reading = writing = excluded = single_dup = duplicate = examined 
= pair = single = 0; +++ reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; +++ np_duplicate = np_opt_duplicate = 0; ++ ++- while ((ret = sam_read1(in, header, in_read->b)) >= 0) { +++ while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { ++ ++ // do some basic coordinate order checks ++ if (in_read->b->core.tid >= 0) { // -1 for unmapped reads ++ if (in_read->b->core.tid < prev_tid || ++ ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { ++- fprintf(samtools_stderr, "[markdup] error: bad coordinate order.\n"); ++- return 1; +++ fprintf(samtools_stderr, "[markdup] error: not in coordinate sorted order.\n"); +++ goto fail; ++ } ++ } ++ ++@@ -557,10 +1236,30 @@ ++ ++ reading++; ++ ++- // read must not be secondary, supplementary, unmapped or failed QC ++- if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { ++- examined++; +++ if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { +++ uint8_t *data; +++ +++ in_read->b->core.flag ^= BAM_FDUP; ++ +++ if ((data = bam_aux_get(in_read->b, "dt")) != NULL) { +++ bam_aux_del(in_read->b, data); +++ } +++ +++ if ((data = bam_aux_get(in_read->b, "do")) != NULL) { +++ bam_aux_del(in_read->b, data); +++ } +++ } +++ +++ if (param->include_fails) { +++ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP); +++ } else { +++ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL); +++ } +++ +++ // read must not be secondary, supplementary, unmapped or (possibly) failed QC +++ if (!(in_read->b->core.flag & exclude)) { +++ examined++; +++ in_read->duplicate = NULL; ++ ++ // look at the pairs first ++ if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { ++@@ -569,9 +1268,16 @@ ++ key_data_t single_key; ++ in_hash_t *bp; ++ ++- if (make_pair_key(&pair_key, in_read->b)) { ++- fprintf(samtools_stderr, "[markdup] error: unable to 
assign pair hash key.\n"); ++- return 1; +++ if (param->mode) { +++ if (make_pair_key_sequence(&pair_key, in_read->b)) { +++ fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); +++ goto fail; +++ } +++ } else { +++ if (make_pair_key_template(&pair_key, in_read->b)) { +++ fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); +++ goto fail; +++ } ++ } ++ ++ make_single_key(&single_key, in_read->b); ++@@ -585,40 +1291,32 @@ ++ if (ret > 0) { // new ++ // add to single duplicate hash ++ bp = &kh_val(single_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->single_key = single_key; ++ } else if (ret == 0) { // exists ++ // look at singles only for duplication marking ++ bp = &kh_val(single_hash, k); ++ ++- if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { ++- bam1_t *dup = bp->p; +++ if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { +++ // singleton will always be marked duplicate even if +++ // scores more than one read of the pair +++ bam1_t *dup = bp->p->b; +++ +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ ++- // singleton will always be marked duplicate even if ++- // scores more than one read of the pair +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) +++ goto fail; ++ ++- bp->p = in_read->b; ++- dup->core.flag |= BAM_FDUP; ++ single_dup++; ++ ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } ++- } ++ } ++ } else { ++ fprintf(samtools_stderr, "[markdup] 
error: single hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // now do the pair ++@@ -627,33 +1325,44 @@ ++ if (ret > 0) { // new ++ // add to the pair hash ++ bp = &kh_val(pair_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->pair_key = pair_key; ++ } else if (ret == 0) { ++ int64_t old_score, new_score, tie_add = 0; ++ bam1_t *dup; +++ int check_chain = 0; ++ ++ bp = &kh_val(pair_hash, k); ++ ++- if ((mate_tmp = get_mate_score(bp->p)) == -1) { ++- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); ++- return 1; +++ if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) { +++ if (bp->p->b->core.flag & BAM_FQCFAIL) { +++ old_score = 0; +++ new_score = 1; +++ } else { +++ old_score = 1; +++ new_score = 0; +++ } ++ } else { ++- old_score = calc_score(bp->p) + mate_tmp; ++- } +++ if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { +++ fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); +++ goto fail; +++ } else { +++ old_score = calc_score(bp->p->b) + mate_tmp; +++ } ++ ++- if ((mate_tmp = get_mate_score(in_read->b)) == -1) { ++- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); ++- return 1; ++- } else { ++- new_score = calc_score(in_read->b) + mate_tmp; +++ if ((mate_tmp = get_mate_score(in_read->b)) == -1) { +++ fprintf(samtools_stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); +++ goto fail; +++ } else { +++ new_score = calc_score(in_read->b) + mate_tmp; +++ } ++ } ++ ++ // choose the highest score as the original ++ // and add it to the pair hash, mark the other as duplicate ++ ++ if (new_score == old_score) { ++- if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { +++ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) { ++ tie_add = 1; ++ } else { ++ tie_add = -1; ++@@ -661,39 +1370,40 @@ ++ } ++ ++ if (new_score + tie_add > old_score) { // swap reads ++- dup = bp->p; ++- bp->p = in_read->b; +++ dup = bp->p->b; +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ } else { +++ if (bp->p->duplicate) { +++ in_read->duplicate = bp->p->duplicate; +++ check_chain = 1; +++ } +++ +++ bp->p->duplicate = in_read; ++ dup = in_read->b; ++ } ++ ++- dup->core.flag |= BAM_FDUP; ++- ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) +++ goto fail; ++ +++ if (check_chain) { +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ ++ duplicate++; ++ } else { ++ fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { // do the single (or effectively single) reads ++ int ret; ++ key_data_t single_key; ++ in_hash_t *bp; +++ int check_chain = 0; ++ ++ make_single_key(&single_key, in_read->b); ++ ++@@ -704,68 +1414,76 @@ ++ ++ if 
(ret > 0) { // new ++ bp = &kh_val(single_hash, k); ++- bp->p = in_read->b; +++ bp->p = in_read; ++ in_read->single_key = single_key; ++ } else if (ret == 0) { // exists ++ bp = &kh_val(single_hash, k); ++ ++- if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { +++ if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { ++ // if matched against one of a pair just mark as duplicate ++ ++- if (tag) { ++- if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ if (bp->p->duplicate) { +++ in_read->duplicate = bp->p->duplicate; +++ check_chain = 1; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, in_read->b)) { ++- return 1; ++- } ++- } +++ bp->p->duplicate = in_read; +++ +++ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) +++ goto fail; +++ +++ if (check_chain) { +++ // check the new duplicate entry in the chain +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- in_read->b->core.flag |= BAM_FDUP; +++ // check against the new original +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; +++ ++ } else { ++ int64_t old_score, new_score; ++ bam1_t *dup; ++ ++- old_score = calc_score(bp->p); +++ old_score = calc_score(bp->p->b); ++ new_score = calc_score(in_read->b); ++ ++ // choose the highest score as the original, add it ++ // to the single hash and mark the other as duplicate ++ if (new_score > old_score) { // swap reads ++- dup = bp->p; ++- bp->p = in_read->b; +++ dup = bp->p->b; +++ in_read->duplicate = bp->p; +++ bp->p = in_read; ++ } else { +++ if (bp->p->duplicate) { +++ 
in_read->duplicate = bp->p->duplicate; +++ check_chain = 1; +++ } +++ +++ bp->p->duplicate = in_read; ++ dup = in_read->b; ++ } ++ ++- dup->core.flag |= BAM_FDUP; +++ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) +++ goto fail; ++ ++- if (tag) { ++- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { ++- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); ++- return 1; ++- } +++ +++ if (check_chain) { +++ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) +++ goto fail; ++ } ++ ++- if (supp) { ++- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { ++- if (add_duplicate(dup_hash, dup)) { ++- return 1; ++- } ++- } +++ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) +++ goto fail; +++ +++ ++ } ++- } ++ ++ single_dup++; ++ } else { ++ fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ } else { ++@@ -780,20 +1498,20 @@ ++ ++ /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads ++ should just be written as they cannot be matched as duplicates. 
*/ ++- if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { +++ if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { ++ break; ++ } ++ ++- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { ++- if (supp) { +++ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { +++ if (param->supp) { ++ if (tmp_file_write(&temp, in_read->b)) { ++ fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { ++- if (sam_write1(out, header, in_read->b) < 0) { +++ if (sam_write1(param->out, header, in_read->b) < 0) { ++ fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ ++@@ -818,16 +1536,20 @@ ++ ++ // set the next one up for reading ++ in_read = kl_pushp(read_queue, read_buffer); +++ if (!in_read) { +++ fprintf(samtools_stderr, "[markdup] out of memory\n"); +++ goto fail; +++ } ++ ++ if ((in_read->b = bam_init1()) == NULL) { ++ fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ ++ if (ret < -1) { ++ fprintf(samtools_stderr, "[markdup] error: truncated input file.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // write out the end of the list ++@@ -836,16 +1558,16 @@ ++ in_read = &kl_val(rq); ++ ++ if (bam_get_qname(in_read->b)) { // last entry will be blank ++- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { ++- if (supp) { +++ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { +++ if (param->supp) { ++ if (tmp_file_write(&temp, in_read->b)) { ++ fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } else { ++- if (sam_write1(out, header, in_read->b) < 0) { +++ if (sam_write1(param->out, header, in_read->b) < 0) { ++ 
fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ ++@@ -858,71 +1580,155 @@ ++ rq = kl_begin(read_buffer); ++ } ++ ++- if (supp) { +++ if (param->supp) { ++ bam1_t *b; ++ ++ if (tmp_file_end_write(&temp)) { ++ fprintf(samtools_stderr, "[markdup] error: unable to end tmp writing.\n"); ++- return 1; +++ goto fail; ++ } ++ ++ // read data from temp file and mark duplicate supplementary alignments ++ ++- if (tmp_file_begin_read(&temp, NULL)) { ++- return 1; +++ if (tmp_file_begin_read(&temp)) { +++ goto fail; ++ } ++ ++ b = bam_init1(); ++ ++ while ((ret = tmp_file_read(&temp, b)) > 0) { ++ ++- if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { +++ if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) { +++ ++ k = kh_get(duplicates, dup_hash, bam_get_qname(b)); ++ ++ if (k != kh_end(dup_hash)) { +++ ++ b->core.flag |= BAM_FDUP; +++ np_duplicate++; +++ +++ if (param->tag && kh_val(dup_hash, k).name) { +++ if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { +++ fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); +++ goto fail; +++ } +++ } +++ +++ if (param->opt_dist) { +++ if (kh_val(dup_hash, k).type) { +++ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); +++ np_opt_duplicate++; +++ } else { +++ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); +++ } +++ } ++ } ++ } ++ ++- if (!remove_dups || !(b->core.flag & BAM_FDUP)) { ++- if (sam_write1(out, header, b) < 0) { +++ if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { +++ if (sam_write1(param->out, header, b) < 0) { ++ fprintf(samtools_stderr, "[markdup] error: writing final output failed.\n"); ++- return 1; +++ goto fail; ++ } ++ } ++ } ++ ++ if (ret == -1) { ++ fprintf(samtools_stderr, "[markdup] error: failed to read tmp file.\n"); ++- return 1; +++ goto fail; ++ } ++ 
++ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { ++ if (kh_exist(dup_hash, k)) { +++ free(kh_val(dup_hash, k).name); ++ free((char *)kh_key(dup_hash, k)); +++ kh_key(dup_hash, k) = NULL; ++ } ++ } ++ ++- tmp_file_destroy(&temp, b, 0); ++- kh_destroy(duplicates, dup_hash); +++ tmp_file_destroy(&temp); ++ bam_destroy1(b); ++ } ++ ++- if (do_stats) { ++- fprintf(samtools_stderr, "READ %d WRITTEN %d \n" ++- "EXCLUDED %d EXAMINED %d\n" ++- "PAIRED %d SINGLE %d\n" ++- "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" ++- "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, ++- duplicate, single_dup, single_dup + duplicate); +++ if (opt_warnings) { +++ fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", +++ opt_warnings); +++ } +++ +++ if (param->do_stats) { +++ FILE *fp; +++ int file_open = 0; +++ unsigned long els; +++ +++ if (param->stats_file) { +++ if (NULL == (fp = fopen(param->stats_file, "w"))) { +++ fprintf(samtools_stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); +++ fp = samtools_stderr; +++ } else { +++ file_open = 1; +++ } +++ } else { +++ fp = samtools_stderr; +++ } +++ +++ els = estimate_library_size(pair, duplicate - optical); +++ +++ fprintf(fp, +++ "COMMAND: %s\n" +++ "READ: %ld\n" +++ "WRITTEN: %ld\n" +++ "EXCLUDED: %ld\n" +++ "EXAMINED: %ld\n" +++ "PAIRED: %ld\n" +++ "SINGLE: %ld\n" +++ "DUPLICATE PAIR: %ld\n" +++ "DUPLICATE SINGLE: %ld\n" +++ "DUPLICATE PAIR OPTICAL: %ld\n" +++ "DUPLICATE SINGLE OPTICAL: %ld\n" +++ "DUPLICATE NON PRIMARY: %ld\n" +++ "DUPLICATE NON PRIMARY OPTICAL: %ld\n" +++ "DUPLICATE PRIMARY TOTAL: %ld\n" +++ "DUPLICATE TOTAL: %ld\n" +++ "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, +++ duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, +++ single_dup + duplicate, single_dup + duplicate + np_duplicate, els); +++ +++ if (file_open) 
{ +++ fclose(fp); +++ } +++ } +++ +++ if (param->write_index) { +++ if (sam_idx_save(param->out) < 0) { +++ print_error_errno("markdup", "writing index failed"); +++ goto fail; +++ } ++ } ++ ++ kh_destroy(reads, pair_hash); ++ kh_destroy(reads, single_hash); ++ kl_destroy(read_queue, read_buffer); ++- bam_hdr_destroy(header); +++ kh_destroy(duplicates, dup_hash); +++ sam_hdr_destroy(header); ++ ++ return 0; +++ +++ fail: +++ for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq)) +++ bam_destroy1(kl_val(rq).b); +++ kl_destroy(read_queue, read_buffer); +++ +++ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { +++ if (kh_exist(dup_hash, k)) { +++ free((char *)kh_key(dup_hash, k)); +++ } +++ } +++ kh_destroy(duplicates, dup_hash); +++ +++ kh_destroy(reads, pair_hash); +++ kh_destroy(reads, single_hash); +++ sam_hdr_destroy(header); +++ return 1; ++ } ++ ++ ++@@ -930,15 +1736,23 @@ ++ fprintf(samtools_stderr, "\n"); ++ fprintf(samtools_stderr, "Usage: samtools markdup \n\n"); ++ fprintf(samtools_stderr, "Option: \n"); ++- fprintf(samtools_stderr, " -r Remove duplicate reads\n"); ++- fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); ++- fprintf(samtools_stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); ++- fprintf(samtools_stderr, " -s Report stats.\n"); ++- fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); ++- fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." +++ fprintf(samtools_stderr, " -r Remove duplicate reads\n"); +++ fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); +++ fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); +++ fprintf(samtools_stderr, " -s Report stats.\n"); +++ fprintf(samtools_stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); +++ fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); +++ fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); +++ fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n"); +++ fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" +++ " TYPE = t measure positions based on template start/end (default).\n" +++ " s measure positions based on sequence start.\n"); +++ fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n"); +++ fprintf(samtools_stderr, " --no-PG Do not add a PG line\n"); +++ fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." ++ " Mainly for information and debugging.\n"); ++ ++- sam_global_opt_help(samtools_stderr, "-.O..@"); +++ sam_global_opt_help(samtools_stderr, "-.O..@.."); ++ ++ fprintf(samtools_stderr, "\nThe input file must be coordinate sorted and must have gone" ++ " through fixmates with the mate scoring option on.\n"); ++@@ -948,29 +1762,47 @@ ++ ++ ++ int bam_markdup(int argc, char **argv) { ++- int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; ++- int32_t max_length = 300; ++- samFile *in = NULL, *out = NULL; +++ int c, ret; ++ char wmode[3] = {'w', 'b', 0}; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ htsThreadPool p = {NULL, 0}; ++ kstring_t tmpprefix = {0, 0, NULL}; ++ struct stat st; ++ unsigned int t; +++ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), +++ {"include-fails", no_argument, NULL, 1001}, +++ {"no-PG", no_argument, NULL, 1002}, +++ {"mode", required_argument, NULL, 'm'}, ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, 
"rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { ++ switch (c) { ++- case 'r': remove_dups = 1; break; ++- case 'l': max_length = atoi(optarg); break; ++- case 's': report_stats = 1; break; +++ case 'r': param.remove_dups = 1; break; +++ case 'l': param.max_length = atoi(optarg); break; +++ case 's': param.do_stats = 1; break; ++ case 'T': kputs(optarg, &tmpprefix); break; ++- case 'S': include_supplementary = 1; break; ++- case 't': tag_dup = 1; break; +++ case 'S': param.supp = 1; break; +++ case 't': param.tag = 1; break; +++ case 'f': param.stats_file = optarg; param.do_stats = 1; break; +++ case 'd': param.opt_dist = atoi(optarg); break; +++ case 'c': param.clear = 1; break; +++ case 'm': +++ if (strcmp(optarg, "t") == 0) { +++ param.mode = MD_MODE_TEMPLATE; +++ } else if (strcmp(optarg, "s") == 0) { +++ param.mode = MD_MODE_SEQUENCE; +++ } else { +++ fprintf(samtools_stderr, "[markdup] error: unknown mode '%s'.\n", optarg); +++ return markdup_usage(); +++ } +++ +++ break; +++ case 1001: param.include_fails = 1; break; +++ case 1002: param.no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return markdup_usage(); ++@@ -980,17 +1812,20 @@ ++ if (optind + 2 > argc) ++ return markdup_usage(); ++ ++- in = sam_open_format(argv[optind], "r", &ga.in); +++ if (param.opt_dist < 0) param.opt_dist = 0; +++ if (param.max_length < 0) param.max_length = 300; +++ +++ param.in = sam_open_format(argv[optind], "r", &ga.in); ++ ++- if (!in) { +++ if (!param.in) { ++ print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); ++ return 1; ++ } ++ ++ sam_open_mode(wmode + 1, argv[optind + 1], NULL); ++- out = sam_open_format(argv[optind + 1], wmode, &ga.out); +++ param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); ++ ++- if (!out) { +++ if (!param.out) { ++ print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); ++ return 1; ++ } ++@@ -1001,8 
+1836,8 @@ ++ return 1; ++ } ++ ++- hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); ++- hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); +++ hts_set_opt(param.in, HTS_OPT_THREAD_POOL, &p); +++ hts_set_opt(param.out, HTS_OPT_THREAD_POOL, &p); ++ } ++ ++ // actual stuff happens here ++@@ -1022,18 +1857,24 @@ ++ ++ t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); ++ ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); +++ param.prefix = tmpprefix.s; +++ +++ param.arg_list = stringify_argv(argc + 1, argv - 1); +++ param.write_index = ga.write_index; +++ param.out_fn = argv[optind + 1]; ++ ++- ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); +++ ret = bam_mark_duplicates(&param); ++ ++- sam_close(in); +++ sam_close(param.in); ++ ++- if (sam_close(out) < 0) { +++ if (sam_close(param.out) < 0) { ++ fprintf(samtools_stderr, "[markdup] error closing output file\n"); ++ ret = 1; ++ } ++ ++ if (p.pool) hts_tpool_destroy(p.pool); ++ +++ free(param.arg_list); ++ free(tmpprefix.s); ++ sam_global_args_free(&ga); ++ ++--- python-pysam.orig/samtools/bam_mate.c +++++ python-pysam/samtools/bam_mate.c ++@@ -1,6 +1,6 @@ ++ /* bam_mate.c -- fix mate pairing information and clean up flags. ++ ++- Copyright (C) 2009, 2011-2017 Genome Research Ltd. +++ Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. ++ Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. ++ ++@@ -37,6 +37,9 @@ ++ #include "htslib/sam.h" ++ #include "samtools.h" ++ +++ +++#define MD_MIN_QUALITY 15 +++ ++ /* ++ * This function calculates ct tag for two bams, it assumes they are from the same template and ++ * writes the tag to the first read in position terms. 
++@@ -44,7 +47,8 @@ ++ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) ++ { ++ bam1_t *swap; ++- int i, end; +++ int i; +++ hts_pos_t end; ++ uint32_t *cigar; ++ str->l = 0; ++ if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip ++@@ -140,8 +144,8 @@ ++ ++ bam1_t* first = a; ++ bam1_t* second = b; ++- int32_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; ++- int32_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; +++ hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; +++ hts_pos_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; ++ if (a_pos > b_pos) { ++ first = b; ++ second = a; ++@@ -226,7 +230,7 @@ ++ int i; ++ ++ for (i = 0; i < b->core.l_qseq; i++) { ++- if (qual[i] >= 15) score += qual[i]; +++ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; ++ } ++ ++ return score; ++@@ -250,31 +254,34 @@ ++ } ++ ++ // currently, this function ONLY works if each read has one hit ++-static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) +++static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) ++ { ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ bam1_t *b[2] = { NULL, NULL }; ++- int curr, has_prev, pre_end = 0, cur_end = 0, result; ++- kstring_t str; +++ int curr, has_prev, result; +++ hts_pos_t pre_end = 0, cur_end = 0; +++ kstring_t str = KS_INITIALIZE; ++ ++- str.l = str.m = 0; str.s = 0; ++ header = sam_hdr_read(in); ++ if (header == NULL) { ++ fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); ++ return 1; ++ } +++ ++ // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. 
++- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { ++- char *p, *q; ++- p = strstr(header->text, "\tSO:coordinate"); ++- q = strchr(header->text, '\n'); ++- // Looking for SO:coordinate within the @HD line only ++- // (e.g. must ignore in a @CO comment line later in header) ++- if ((p != 0) && (p < q)) { ++- fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); ++- goto fail; ++- } +++ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) { +++ fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); +++ goto fail; ++ } +++ ks_free(&str); +++ +++ if (!no_pg && sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (sam_hdr_write(out, header) < 0) goto write_fail; ++ ++ b[0] = bam_init1(); ++@@ -303,7 +310,7 @@ ++ cur_end = bam_endpos(cur); ++ ++ // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag ++- if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; +++ if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; ++ } ++ if (has_prev) { // do we have a pair of reads to examine? ++ if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name ++@@ -314,7 +321,7 @@ ++ if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) ++ && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE ++ { ++- uint32_t cur5, pre5; +++ hts_pos_t cur5, pre5; ++ cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; ++ pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; ++ cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; ++@@ -378,18 +385,19 @@ ++ ++ if (sam_write1(out, header, pre) < 0) goto write_fail; ++ } ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ bam_destroy1(b[0]); ++ bam_destroy1(b[1]); ++- free(str.s); +++ ks_free(&str); ++ return 0; ++ ++ write_fail: ++ print_error_errno("fixmate", "Couldn't write to output file"); ++ fail: ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ bam_destroy1(b[0]); ++ bam_destroy1(b[1]); +++ ks_free(&str); ++ return 1; ++ } ++ ++@@ -401,9 +409,10 @@ ++ " -r Remove unmapped reads and secondary alignments\n" ++ " -p Disable FR proper pair check\n" ++ " -c Add template cigar ct tag\n" ++-" -m Add mate score tag\n"); +++" -m Add mate score tag\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(where, "-.O..@"); +++ sam_global_opt_help(where, "-.O..@-."); ++ ++ fprintf(where, ++ "\n" ++@@ -416,13 +425,15 @@ ++ { ++ htsThreadPool p = {NULL, 0}; ++ samFile *in = NULL, *out = NULL; ++- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; +++ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ char wmode[3] = {'w', 'b', 0}; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; +++ char *arg_list = NULL; ++ ++ // parse args ++ if (argc == 1) { usage(stdout); return 0; } ++@@ -432,6 +443,7 @@ ++ case 'p': proper_pair_check = 0; break; ++ case 'c': add_ct = 1; break; ++ case 'm': mate_score = 1; break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': usage(stderr); goto fail; ++@@ -439,6 +451,9 @@ ++ } ++ if (optind+1 >= argc) { usage(stderr); goto fail; } ++ +++ if (!no_pg && !(arg_list = 
stringify_argv(argc+1, argv-1))) +++ goto fail; +++ ++ // init ++ if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { ++ print_error_errno("fixmate", "cannot open input file"); ++@@ -460,7 +475,7 @@ ++ } ++ ++ // run ++- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); +++ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); ++ ++ // cleanup ++ sam_close(in); ++@@ -470,6 +485,7 @@ ++ } ++ ++ if (p.pool) hts_tpool_destroy(p.pool); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ return res; ++ ++@@ -477,6 +493,7 @@ ++ if (in) sam_close(in); ++ if (out) sam_close(out); ++ if (p.pool) hts_tpool_destroy(p.pool); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ return 1; ++ } ++--- python-pysam.orig/samtools/bam_mate.c.pysam.c +++++ python-pysam/samtools/bam_mate.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_mate.c -- fix mate pairing information and clean up flags. ++ ++- Copyright (C) 2009, 2011-2017 Genome Research Ltd. +++ Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. ++ Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. ++ ++@@ -39,6 +39,9 @@ ++ #include "htslib/sam.h" ++ #include "samtools.h" ++ +++ +++#define MD_MIN_QUALITY 15 +++ ++ /* ++ * This function calculates ct tag for two bams, it assumes they are from the same template and ++ * writes the tag to the first read in position terms. 
++@@ -46,7 +49,8 @@ ++ static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) ++ { ++ bam1_t *swap; ++- int i, end; +++ int i; +++ hts_pos_t end; ++ uint32_t *cigar; ++ str->l = 0; ++ if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip ++@@ -142,8 +146,8 @@ ++ ++ bam1_t* first = a; ++ bam1_t* second = b; ++- int32_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; ++- int32_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; +++ hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; +++ hts_pos_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; ++ if (a_pos > b_pos) { ++ first = b; ++ second = a; ++@@ -228,7 +232,7 @@ ++ int i; ++ ++ for (i = 0; i < b->core.l_qseq; i++) { ++- if (qual[i] >= 15) score += qual[i]; +++ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; ++ } ++ ++ return score; ++@@ -252,31 +256,34 @@ ++ } ++ ++ // currently, this function ONLY works if each read has one hit ++-static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) +++static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) ++ { ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ bam1_t *b[2] = { NULL, NULL }; ++- int curr, has_prev, pre_end = 0, cur_end = 0, result; ++- kstring_t str; +++ int curr, has_prev, result; +++ hts_pos_t pre_end = 0, cur_end = 0; +++ kstring_t str = KS_INITIALIZE; ++ ++- str.l = str.m = 0; str.s = 0; ++ header = sam_hdr_read(in); ++ if (header == NULL) { ++ fprintf(samtools_stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); ++ return 1; ++ } +++ ++ // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. 
++- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { ++- char *p, *q; ++- p = strstr(header->text, "\tSO:coordinate"); ++- q = strchr(header->text, '\n'); ++- // Looking for SO:coordinate within the @HD line only ++- // (e.g. must ignore in a @CO comment line later in header) ++- if ((p != 0) && (p < q)) { ++- fprintf(samtools_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); ++- goto fail; ++- } +++ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) { +++ fprintf(samtools_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); +++ goto fail; ++ } +++ ks_free(&str); +++ +++ if (!no_pg && sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) +++ goto fail; +++ ++ if (sam_hdr_write(out, header) < 0) goto write_fail; ++ ++ b[0] = bam_init1(); ++@@ -305,7 +312,7 @@ ++ cur_end = bam_endpos(cur); ++ ++ // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag ++- if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; +++ if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; ++ } ++ if (has_prev) { // do we have a pair of reads to examine? ++ if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name ++@@ -316,7 +323,7 @@ ++ if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) ++ && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE ++ { ++- uint32_t cur5, pre5; +++ hts_pos_t cur5, pre5; ++ cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; ++ pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; ++ cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; ++@@ -380,18 +387,19 @@ ++ ++ if (sam_write1(out, header, pre) < 0) goto write_fail; ++ } ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ bam_destroy1(b[0]); ++ bam_destroy1(b[1]); ++- free(str.s); +++ ks_free(&str); ++ return 0; ++ ++ write_fail: ++ print_error_errno("fixmate", "Couldn't write to output file"); ++ fail: ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ bam_destroy1(b[0]); ++ bam_destroy1(b[1]); +++ ks_free(&str); ++ return 1; ++ } ++ ++@@ -403,9 +411,10 @@ ++ " -r Remove unmapped reads and secondary alignments\n" ++ " -p Disable FR proper pair check\n" ++ " -c Add template cigar ct tag\n" ++-" -m Add mate score tag\n"); +++" -m Add mate score tag\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(where, "-.O..@"); +++ sam_global_opt_help(where, "-.O..@-."); ++ ++ fprintf(where, ++ "\n" ++@@ -418,13 +427,15 @@ ++ { ++ htsThreadPool p = {NULL, 0}; ++ samFile *in = NULL, *out = NULL; ++- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; +++ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ char wmode[3] = {'w', 'b', 0}; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; +++ char *arg_list = NULL; ++ ++ // parse args ++ if (argc == 1) { usage(samtools_stdout); return 0; } ++@@ -434,6 +445,7 @@ ++ case 'p': proper_pair_check = 0; break; ++ case 'c': add_ct = 1; break; ++ case 'm': mate_score = 1; break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': usage(samtools_stderr); goto fail; ++@@ -441,6 +453,9 @@ ++ } ++ if (optind+1 >= argc) { usage(samtools_stderr); goto fail; } ++ +++ if (!no_pg && 
!(arg_list = stringify_argv(argc+1, argv-1))) +++ goto fail; +++ ++ // init ++ if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { ++ print_error_errno("fixmate", "cannot open input file"); ++@@ -462,7 +477,7 @@ ++ } ++ ++ // run ++- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); +++ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); ++ ++ // cleanup ++ sam_close(in); ++@@ -472,6 +487,7 @@ ++ } ++ ++ if (p.pool) hts_tpool_destroy(p.pool); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ return res; ++ ++@@ -479,6 +495,7 @@ ++ if (in) sam_close(in); ++ if (out) sam_close(out); ++ if (p.pool) hts_tpool_destroy(p.pool); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ return 1; ++ } ++--- python-pysam.orig/samtools/bam_md.c +++++ python-pysam/samtools/bam_md.c ++@@ -1,6 +1,6 @@ ++ /* bam_md.c -- calmd subcommand. ++ ++- Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd. +++ Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2011 Broad Institute. ++ ++ Author: Heng Li ++@@ -46,12 +46,13 @@ ++ ++ int bam_aux_drop_other(bam1_t *b, uint8_t *s); ++ ++-void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm, int quiet_mode) +++void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) ++ { ++ uint8_t *seq = bam_get_seq(b); ++ uint32_t *cigar = bam_get_cigar(b); ++ bam1_core_t *c = &b->core; ++- int i, x, y, u = 0; +++ int i, y, u = 0; +++ hts_pos_t x; ++ kstring_t *str; ++ int32_t old_nm_i = -1, nm = 0; ++ ++@@ -67,7 +68,7 @@ ++ if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; ++ ++u; ++ } else { ++- kputw(u, str); kputc(ref[x+j], str); +++ kputw(u, str); kputc(toupper(ref[x+j]), str); ++ u = 0; ++nm; ++ } ++ } ++@@ -77,7 +78,7 @@ ++ kputw(u, str); kputc('^', str); ++ for (j = 0; j < l; ++j) { ++ if (x+j >= ref_len || ref[x+j] == '\0') break; ++- kputc(ref[x+j], str); +++ kputc(toupper(ref[x+j]), str); ++ } ++ u = 0; ++ x += j; nm += j; ++@@ -176,25 +177,28 @@ ++ " -A modify the quality string\n" ++ " -Q use quiet mode to output less debug info to stdout\n" ++ " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" ++-" -E extended BAQ for better sensitivity but lower specificity\n"); +++" -E extended BAQ for better sensitivity but lower specificity\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(stderr, "-....@"); +++ sam_global_opt_help(stderr, "-....@-."); ++ return 1; ++ } ++ ++ int bam_fillmd(int argc, char *argv[]) ++ { ++- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode; +++ int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; +++ hts_pos_t len; ++ htsThreadPool p = {NULL, 0}; ++ samFile *fp = NULL, *fpout = NULL; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ faidx_t *fai = NULL; ++- char *ref = NULL, mode_w[8], *ref_file; +++ char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; ++ bam1_t *b = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -217,6 +221,7 @@ ++ case 'A': baq_flag |= 1; break; ++ case 'E': baq_flag |= 2; break; ++ case 'Q': quiet_mode = 1; break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); ++ /* else fall-through */ ++@@ -234,8 
+239,13 @@ ++ return 1; ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("calmd", "failed to create arg_list"); +++ return 1; +++ } +++ ++ header = sam_hdr_read(fp); ++- if (header == NULL || header->n_targets == 0) { +++ if (header == NULL || sam_hdr_nref(header) == 0) { ++ fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); ++ goto fail; ++ } ++@@ -245,6 +255,14 @@ ++ print_error_errno("calmd", "Failed to open output"); ++ goto fail; ++ } +++ if (!no_pg && sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) { +++ print_error("calmd", "failed to add PG line to header"); +++ goto fail; +++ } ++ if (sam_hdr_write(fpout, header) < 0) { ++ print_error_errno("calmd", "Failed to write sam header"); ++ goto fail; ++@@ -276,11 +294,11 @@ ++ if (b->core.tid >= 0) { ++ if (tid != b->core.tid) { ++ free(ref); ++- ref = fai_fetch(fai, header->target_name[b->core.tid], &len); +++ ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); ++ tid = b->core.tid; ++ if (ref == 0) { // FIXME: Should this always be fatal? 
++ fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", ++- header->target_name[tid]); +++ sam_hdr_tid2name(header, tid)); ++ if (is_realn || capQ > 10) goto fail; // Would otherwise crash ++ } ++ } ++@@ -301,8 +319,9 @@ ++ goto fail; ++ } ++ bam_destroy1(b); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ +++ free(arg_list); ++ free(ref); ++ fai_destroy(fai); ++ sam_close(fp); ++@@ -315,9 +334,10 @@ ++ return 0; ++ ++ fail: +++ free(arg_list); ++ free(ref); ++ if (b) bam_destroy1(b); ++- if (header) bam_hdr_destroy(header); +++ if (header) sam_hdr_destroy(header); ++ if (fai) fai_destroy(fai); ++ if (fp) sam_close(fp); ++ if (fpout) sam_close(fpout); ++--- python-pysam.orig/samtools/bam_md.c.pysam.c +++++ python-pysam/samtools/bam_md.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_md.c -- calmd subcommand. ++ ++- Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd. +++ Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2011 Broad Institute. ++ ++ Author: Heng Li ++@@ -48,12 +48,13 @@ ++ ++ int bam_aux_drop_other(bam1_t *b, uint8_t *s); ++ ++-void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm, int quiet_mode) +++void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) ++ { ++ uint8_t *seq = bam_get_seq(b); ++ uint32_t *cigar = bam_get_cigar(b); ++ bam1_core_t *c = &b->core; ++- int i, x, y, u = 0; +++ int i, y, u = 0; +++ hts_pos_t x; ++ kstring_t *str; ++ int32_t old_nm_i = -1, nm = 0; ++ ++@@ -69,7 +70,7 @@ ++ if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; ++ ++u; ++ } else { ++- kputw(u, str); kputc(ref[x+j], str); +++ kputw(u, str); kputc(toupper(ref[x+j]), str); ++ u = 0; ++nm; ++ } ++ } ++@@ -79,7 +80,7 @@ ++ kputw(u, str); kputc('^', str); ++ for (j = 0; j < l; ++j) { ++ if (x+j >= ref_len || ref[x+j] == '\0') break; ++- kputc(ref[x+j], str); +++ kputc(toupper(ref[x+j]), str); ++ } ++ u = 0; ++ x += j; nm += j; ++@@ -178,25 +179,28 @@ ++ " -A modify the quality string\n" ++ " -Q use quiet mode to output less debug info to samtools_stdout\n" ++ " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" ++-" -E extended BAQ for better sensitivity but lower specificity\n"); +++" -E extended BAQ for better sensitivity but lower specificity\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(samtools_stderr, "-....@"); +++ sam_global_opt_help(samtools_stderr, "-....@-."); ++ return 1; ++ } ++ ++ int bam_fillmd(int argc, char *argv[]) ++ { ++- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode; +++ int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; +++ hts_pos_t len; ++ htsThreadPool p = {NULL, 0}; ++ samFile *fp = NULL, *fpout = NULL; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ faidx_t *fai = NULL; ++- char *ref = NULL, mode_w[8], *ref_file; +++ char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; ++ bam1_t *b = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -219,6 +223,7 @@ ++ case 'A': baq_flag |= 1; break; ++ case 'E': baq_flag |= 2; break; ++ case 'Q': quiet_mode = 1; break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); ++ 
/* else fall-through */ ++@@ -236,8 +241,13 @@ ++ return 1; ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("calmd", "failed to create arg_list"); +++ return 1; +++ } +++ ++ header = sam_hdr_read(fp); ++- if (header == NULL || header->n_targets == 0) { +++ if (header == NULL || sam_hdr_nref(header) == 0) { ++ fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); ++ goto fail; ++ } ++@@ -247,6 +257,14 @@ ++ print_error_errno("calmd", "Failed to open output"); ++ goto fail; ++ } +++ if (!no_pg && sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) { +++ print_error("calmd", "failed to add PG line to header"); +++ goto fail; +++ } ++ if (sam_hdr_write(fpout, header) < 0) { ++ print_error_errno("calmd", "Failed to write sam header"); ++ goto fail; ++@@ -278,11 +296,11 @@ ++ if (b->core.tid >= 0) { ++ if (tid != b->core.tid) { ++ free(ref); ++- ref = fai_fetch(fai, header->target_name[b->core.tid], &len); +++ ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); ++ tid = b->core.tid; ++ if (ref == 0) { // FIXME: Should this always be fatal? 
++ fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", ++- header->target_name[tid]); +++ sam_hdr_tid2name(header, tid)); ++ if (is_realn || capQ > 10) goto fail; // Would otherwise crash ++ } ++ } ++@@ -303,8 +321,9 @@ ++ goto fail; ++ } ++ bam_destroy1(b); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ +++ free(arg_list); ++ free(ref); ++ fai_destroy(fai); ++ sam_close(fp); ++@@ -317,9 +336,10 @@ ++ return 0; ++ ++ fail: +++ free(arg_list); ++ free(ref); ++ if (b) bam_destroy1(b); ++- if (header) bam_hdr_destroy(header); +++ if (header) sam_hdr_destroy(header); ++ if (fai) fai_destroy(fai); ++ if (fp) sam_close(fp); ++ if (fpout) sam_close(fpout); ++--- python-pysam.orig/samtools/bam_plbuf.c +++++ python-pysam/samtools/bam_plbuf.c ++@@ -58,11 +58,12 @@ ++ ++ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) ++ { ++- int ret, n_plp, tid, pos; +++ int ret, n_plp, tid; +++ hts_pos_t pos; ++ const bam_pileup1_t *plp; ++ ret = bam_plp_push(buf->iter, b); ++ if (ret < 0) return ret; ++- while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) +++ while ((plp = bam_plp64_next(buf->iter, &tid, &pos, &n_plp)) != 0) ++ buf->func(tid, pos, n_plp, plp, buf->data); ++ return 0; ++ } ++--- python-pysam.orig/samtools/bam_plbuf.c.pysam.c +++++ python-pysam/samtools/bam_plbuf.c.pysam.c ++@@ -60,11 +60,12 @@ ++ ++ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) ++ { ++- int ret, n_plp, tid, pos; +++ int ret, n_plp, tid; +++ hts_pos_t pos; ++ const bam_pileup1_t *plp; ++ ret = bam_plp_push(buf->iter, b); ++ if (ret < 0) return ret; ++- while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) +++ while ((plp = bam_plp64_next(buf->iter, &tid, &pos, &n_plp)) != 0) ++ buf->func(tid, pos, n_plp, plp, buf->data); ++ return 0; ++ } ++--- python-pysam.orig/samtools/bam_plbuf.h +++++ python-pysam/samtools/bam_plbuf.h ++@@ -29,7 +29,7 @@ ++ ++ #ifndef BAM_PILEUP_F_DEFINED ++ #define BAM_PILEUP_F_DEFINED ++-typedef 
int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); +++typedef int (*bam_pileup_f)(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data); ++ #endif //BAM_PILEUP_F_DEFINED ++ ++ typedef struct { ++--- python-pysam.orig/samtools/bam_plcmd.c +++++ python-pysam/samtools/bam_plcmd.c ++@@ -1,6 +1,6 @@ ++ /* bam_plcmd.c -- mpileup subcommand. ++ ++- Copyright (C) 2008-2015 Genome Research Ltd. +++ Copyright (C) 2008-2015, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -36,14 +36,19 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include +++#include ++ #include ++-#include "sam_header.h" ++ #include "samtools.h" +++#include "bedidx.h" ++ #include "sam_opts.h" ++ +++#define dummy_free(p) +++KLIST_INIT(auxlist, char *, dummy_free) +++ ++ static inline int printw(int c, FILE *fp) ++ { ++ char buf[16]; ++@@ -59,7 +64,9 @@ ++ return 0; ++ } ++ ++-static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +++static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, +++ hts_pos_t ref_len, const char *ref, kstring_t *ks, +++ int rev_del) ++ { ++ int j; ++ if (p->is_head) { ++@@ -79,21 +86,31 @@ ++ else c = bam_is_rev(p->b)? tolower(c) : toupper(c); ++ } ++ putc(c, fp); ++- } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); +++ } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp); +++ int del_len = -p->indel; ++ if (p->indel > 0) { ++- putc('+', fp); printw(p->indel, fp); ++- for (j = 1; j <= p->indel; ++j) { ++- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; ++- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); +++ int len = bam_plp_insertion(p, ks, &del_len); +++ if (len < 0) +++ return -1; +++ putc('+', fp); printw(len, fp); +++ if (bam_is_rev(p->b)) { +++ char pad = rev_del ? 
'#' : '*'; +++ for (j = 0; j < len; j++) +++ putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp); +++ } else { +++ for (j = 0; j < len; j++) +++ putc(toupper(ks->s[j]), fp); ++ } ++- } else if (p->indel < 0) { ++- printw(p->indel, fp); ++- for (j = 1; j <= -p->indel; ++j) { +++ } +++ if (del_len > 0) { +++ printw(-del_len, fp); +++ for (j = 1; j <= del_len; ++j) { ++ int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; ++ putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); ++ } ++ } ++ if (p->is_tail) putc('$', fp); +++ return 0; ++ } ++ ++ #include ++@@ -109,36 +126,43 @@ ++ #define MPLP_REDO_BAQ (1<<6) ++ #define MPLP_ILLUMINA13 (1<<7) ++ #define MPLP_IGNORE_RG (1<<8) ++-#define MPLP_PRINT_POS (1<<9) ++-#define MPLP_PRINT_MAPQ (1<<10) +++#define MPLP_PRINT_QPOS (1<<9) ++ #define MPLP_PER_SAMPLE (1<<11) ++ #define MPLP_SMART_OVERLAPS (1<<12) +++ ++ #define MPLP_PRINT_QNAME (1<<13) +++#define MPLP_PRINT_FLAG (1<<14) +++#define MPLP_PRINT_RNAME (1<<15) +++#define MPLP_PRINT_POS (1<<16) +++#define MPLP_PRINT_MAPQ (1<<17) +++#define MPLP_PRINT_CIGAR (1<<18) +++#define MPLP_PRINT_RNEXT (1<<19) +++#define MPLP_PRINT_PNEXT (1<<20) +++#define MPLP_PRINT_TLEN (1<<21) +++#define MPLP_PRINT_SEQ (1<<22) +++#define MPLP_PRINT_QUAL (1<<23) ++ ++ #define MPLP_MAX_DEPTH 8000 ++ #define MPLP_MAX_INDEL_DEPTH 250 ++ ++-void *bed_read(const char *fn); ++-void bed_destroy(void *_h); ++-int bed_overlap(const void *_h, const char *chr, int beg, int end); ++- ++ typedef struct { ++- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; +++ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; ++ int rflag_require, rflag_filter; ++ int openQ, extQ, tandemQ, min_support; // for indels ++ double min_frac; // for indels ++ char *reg, *pl_list, *fai_fname, *output_fname; ++ faidx_t *fai; ++- void *bed, *rghash; +++ void *bed, *rghash, *auxlist; ++ int argc; ++ char **argv; +++ char sep, empty; ++ sam_global_args ga; ++ 
} mplp_conf_t; ++ ++ typedef struct { ++ char *ref[2]; ++ int ref_id[2]; ++- int ref_len[2]; +++ hts_pos_t ref_len[2]; ++ } mplp_ref_t; ++ ++ #define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} ++@@ -146,7 +170,7 @@ ++ typedef struct { ++ samFile *fp; ++ hts_itr_t *iter; ++- bam_hdr_t *h; +++ sam_hdr_t *h; ++ mplp_ref_t *ref; ++ const mplp_conf_t *conf; ++ } mplp_aux_t; ++@@ -157,7 +181,54 @@ ++ bam_pileup1_t **plp; ++ } mplp_pileup_t; ++ ++-static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { +++static int build_auxlist(mplp_conf_t *conf, char *optstring) { +++ if (!optstring) +++ return 0; +++ +++ void *colhash = khash_str2int_init(); +++ if (!colhash) +++ return 1; +++ +++ struct active_cols { +++ char *name; +++ int supported; +++ }; +++ +++ const struct active_cols colnames[11] = { +++ {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0} +++ }; +++ +++ int i, f = MPLP_PRINT_QNAME, colno = 11; +++ for (i = 0; i < colno; i++, f <<= 1) +++ if (colnames[i].supported) +++ khash_str2int_set(colhash, colnames[i].name, f); +++ +++ conf->auxlist = kl_init(auxlist); +++ if (!conf->auxlist) +++ return 1; +++ +++ char *save_p; +++ char *tag = strtok_r(optstring, ",", &save_p); +++ while (tag) { +++ if (khash_str2int_get(colhash, tag, &f) == 0) { +++ conf->flag |= f; +++ } else { +++ if (strlen(tag) != 2) { +++ fprintf(stderr, "[%s] tag '%s' has more than two characters or not supported\n", __func__, tag); +++ } else { +++ char **tag_p = kl_pushp(auxlist, conf->auxlist); +++ *tag_p = tag; +++ } +++ } +++ tag = strtok_r(NULL, ",", &save_p); +++ } +++ +++ khash_str2int_destroy(colhash); +++ +++ return 0; +++} +++ +++static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) { ++ mplp_ref_t *r = ma->ref; ++ ++ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); ++@@ -177,9 +248,10 @@ ++ } ++ if (tid 
== r->ref_id[1]) { ++ // Last, swap over ++- int tmp; ++- tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; ++- tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; +++ int tmp_id; +++ hts_pos_t tmp_len; +++ tmp_id = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp_id; +++ tmp_len = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp_len; ++ ++ char *tc; ++ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc; ++@@ -195,10 +267,10 @@ ++ r->ref_len[1] = r->ref_len[0]; ++ ++ r->ref_id[0] = tid; ++- r->ref[0] = faidx_fetch_seq(ma->conf->fai, ++- ma->h->target_name[r->ref_id[0]], +++ r->ref[0] = faidx_fetch_seq64(ma->conf->fai, +++ sam_hdr_tid2name(ma->h, r->ref_id[0]), ++ 0, ++- INT_MAX, +++ HTS_POS_MAX, ++ &r->ref_len[0]); ++ ++ if (!r->ref[0]) { ++@@ -216,15 +288,25 @@ ++ ++ static void ++ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, ++- int pos, int n, const char *ref, int ref_len) +++ hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) ++ { ++ int i; ++- fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); +++ fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? 
ref[pos] : 'N'); ++ for (i = 0; i < n; ++i) { ++ fputs("\t0\t*\t*", fp); ++- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); ++- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); ++- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); +++ if (conf->flag & MPLP_PRINT_QPOS) +++ fputs("\t*", fp); +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) +++ fputs("\t*", fp); +++ flag_value <<= 1; +++ } +++ if (conf->auxlist) { +++ int t = 0; +++ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) +++ fputs("\t*", fp); +++ } ++ } ++ putc('\n', fp); ++ } ++@@ -233,7 +315,9 @@ ++ { ++ char *ref; ++ mplp_aux_t *ma = (mplp_aux_t*)data; ++- int ret, skip = 0, ref_len; +++ int ret, skip = 0; +++ hts_pos_t ref_len; +++ ++ do { ++ int has_ref; ++ ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); ++@@ -247,7 +331,7 @@ ++ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } ++ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } ++ if (ma->conf->bed && ma->conf->all == 0) { // test overlap ++- skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); +++ skip = !bed_overlap(ma->conf->bed, sam_hdr_tid2name(ma->h, b->core.tid), b->core.pos, bam_endpos(b)); ++ if (skip) continue; ++ } ++ if (ma->conf->rghash) { // exclude read groups ++@@ -265,8 +349,8 @@ ++ if (ma->conf->fai && b->core.tid >= 0) { ++ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); ++ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence ++- fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", ++- __func__, b->core.pos, ref_len, b->core.tid); +++ fprintf(stderr,"[%s] Skipping because %"PRIhts_pos" is outside of %"PRIhts_pos" [ref:%d]\n", +++ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); ++ skip = 1; ++ continue; ++ } ++@@ 
-319,17 +403,19 @@ ++ * @param conf configuration for this pileup ++ * @param n number of files specified in fn ++ * @param fn filenames +++ * @param fn_idx index filenames ++ */ ++-static int mpileup(mplp_conf_t *conf, int n, char **fn) +++static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) ++ { ++ extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); ++ extern void bcf_call_del_rghash(void *rghash); ++ mplp_aux_t **data; ++- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; +++ int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; +++ hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; ++ const bam_pileup1_t **plp; ++ mplp_ref_t mp_ref = MPLP_REF_INIT; ++ bam_mplp_t iter; ++- bam_hdr_t *h = NULL; /* header of first file in input list */ +++ sam_hdr_t *h = NULL; /* header of first file in input list */ ++ char *ref; ++ void *rghash = NULL; ++ FILE *pileup_fp = NULL; ++@@ -359,7 +445,7 @@ ++ ++ // read the header of each file in the list and initialize data ++ for (i = 0; i < n; ++i) { ++- bam_hdr_t *h_tmp; +++ sam_hdr_t *h_tmp; ++ data[i] = calloc(1, sizeof(mplp_aux_t)); ++ data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); ++ if ( !data[i]->fp ) ++@@ -383,13 +469,20 @@ ++ fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); ++ exit(EXIT_FAILURE); ++ } ++- bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); +++ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : sam_hdr_str(h_tmp)); ++ if (conf->flag & MPLP_BCF) { ++ // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) ++- rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); +++ rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); ++ } ++ if (conf->reg) { ++- hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx != NULL) { +++ idx = sam_index_load2(data[i]->fp, fn[i], fn_idx[i]); +++ } else { +++ idx = sam_index_load(data[i]->fp, fn[i]); +++ } +++ ++ if (idx == NULL) { ++ fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); ++ exit(EXIT_FAILURE); ++@@ -407,7 +500,7 @@ ++ if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file ++ else { ++ // FIXME: check consistency between h and h_tmp ++- bam_hdr_destroy(h_tmp); +++ sam_hdr_destroy(h_tmp); ++ ++ // we store only the first file's header; it's (alleged to be) ++ // compatible with the i-th file's target_name lookup needs ++@@ -459,10 +552,10 @@ ++ ++ // Translate BAM @SQ tags to BCF ##contig tags ++ // todo: use/write new BAM header manipulation routines, fill also UR, M5 ++- for (i=0; in_targets; i++) +++ for (i=0; i < sam_hdr_nref(h); i++) ++ { ++ str.l = 0; ++- ksprintf(&str, "##contig=", h->target_name[i], h->target_len[i]); +++ ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); ++ bcf_hdr_append(bcf_hdr, str.s); ++ } ++ free(str.s); ++@@ -515,7 +608,11 @@ ++ for (i=0; in; i++) ++ bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); ++ bcf_hdr_add_sample(bcf_hdr, NULL); ++- bcf_hdr_write(bcf_fp, bcf_hdr); +++ if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", +++ conf->output_fname? 
conf->output_fname : "standard output"); +++ exit(EXIT_FAILURE); +++ } ++ // End of BCF header creation ++ ++ // Initialise the calling algorithm ++@@ -574,16 +671,17 @@ ++ bam_mplp_set_maxcnt(iter, max_depth); ++ bcf1_t *bcf_rec = bcf_init1(); ++ int ret; ++- int last_tid = -1, last_pos = -1; +++ int last_tid = -1; +++ hts_pos_t last_pos = -1; ++ ++ // begin pileup ++- while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { +++ while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) { ++ if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested ++ mplp_get_ref(data[0], tid, &ref, &ref_len); ++ //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); ++ if (conf->flag & MPLP_BCF) { ++ int total_depth, _ref0, ref16; ++- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; +++ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; ++ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; ++ group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); ++ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ++@@ -595,7 +693,11 @@ ++ bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); ++ bcf_clear1(bcf_rec); ++ bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); ++- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); +++ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", +++ conf->output_fname?conf->output_fname:"standard output"); +++ exit(EXIT_FAILURE); +++ } ++ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? 
++ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) ++ { ++@@ -605,7 +707,11 @@ ++ if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { ++ bcf_clear1(bcf_rec); ++ bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); ++- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); +++ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", +++ conf->output_fname?conf->output_fname:"standard output"); +++ exit(EXIT_FAILURE); +++ } ++ } ++ } ++ } else { ++@@ -613,10 +719,10 @@ ++ // Deal with missing portions of previous tids ++ while (tid > last_tid) { ++ if (last_tid >= 0 && !conf->reg) { ++- while (++last_pos < h->target_len[last_tid]) { ++- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); ++ } ++ } ++ last_tid++; ++@@ -629,16 +735,16 @@ ++ // Deal with missing portion of current tid ++ while (++last_pos < pos) { ++ if (conf->reg && last_pos < beg0) continue; // out of range; skip ++- if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0) +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); ++ } ++ last_tid = tid; ++ last_pos = pos; ++ } ++- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; 
+++ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; ++ ++- fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); +++ fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); ++ for (i = 0; i < n; ++i) { ++ int j, cnt; ++ for (j = cnt = 0; j < n_plp[i]; ++j) { ++@@ -651,22 +757,40 @@ ++ fprintf(pileup_fp, "\t%d\t", cnt); ++ if (n_plp[i] == 0) { ++ fputs("*\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); +++ if (conf->flag & MPLP_PRINT_QPOS) +++ fputs("\t*", pileup_fp); +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) +++ fputs("\t*", pileup_fp); +++ flag_value <<= 1; +++ } +++ if (conf->auxlist) { +++ int t = 0; +++ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) +++ fputs("\t*", pileup_fp); +++ } ++ } else { ++ int n = 0; +++ kstring_t ks = KS_INITIALIZE; ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++ int c = p->qpos < p->b->core.l_qseq ++ ? 
bam_get_qual(p->b)[p->qpos] ++ : 0; ++- if (c >= conf->min_baseQ) ++- n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); +++ if (c >= conf->min_baseQ) { +++ n++; +++ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { +++ ret = 1; +++ goto fail; +++ } +++ } ++ } ++ if (!n) putc('*', pileup_fp); ++ +++ /* Print base qualities */ ++ n = 0; +++ ks_free(&ks); ++ putc('\t', pileup_fp); ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++@@ -681,55 +805,124 @@ ++ } ++ if (!n) putc('*', pileup_fp); ++ ++- if (conf->flag & MPLP_PRINT_MAPQ) { +++ /* Print mpileup positions */ +++ if (conf->flag & MPLP_PRINT_QPOS) { ++ n = 0; ++ putc('\t', pileup_fp); ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++ int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; +++ ? bam_get_qual(p->b)[p->qpos] +++ : 0; ++ if ( c < conf->min_baseQ ) continue; ++- c = plp[i][j].b->core.qual + 33; ++- if (c > 126) c = 126; ++- putc(c, pileup_fp); +++ if (n > 0) putc(',', pileup_fp); ++ n++; +++ fprintf(pileup_fp, "%d", p->qpos + 1); ++ } ++ if (!n) putc('*', pileup_fp); ++ } ++ ++- if (conf->flag & MPLP_PRINT_POS) { ++- n = 0; ++- putc('\t', pileup_fp); ++- for (j = 0; j < n_plp[i]; ++j) { ++- const bam_pileup1_t *p = plp[i] + j; ++- int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; ++- if ( c < conf->min_baseQ ) continue; ++- ++- if (n > 0) putc(',', pileup_fp); ++- fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... ++- n++; +++ /* Print selected columns */ +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) { +++ n = 0; +++ putc('\t', pileup_fp); +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = &plp[i][j]; +++ int c = p->qpos < p->b->core.l_qseq +++ ? 
bam_get_qual(p->b)[p->qpos] +++ : 0; +++ if ( c < conf->min_baseQ ) continue; +++ if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); +++ n++; +++ +++ switch (flag_value) { +++ case MPLP_PRINT_QNAME: +++ fputs(bam_get_qname(p->b), pileup_fp); +++ break; +++ case MPLP_PRINT_FLAG: +++ fprintf(pileup_fp, "%d", p->b->core.flag); +++ break; +++ case MPLP_PRINT_RNAME: +++ if (p->b->core.tid >= 0) +++ fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); +++ else +++ putc('*', pileup_fp); +++ break; +++ case MPLP_PRINT_POS: +++ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); +++ break; +++ case MPLP_PRINT_MAPQ: +++ c = p->b->core.qual + 33; +++ if (c > 126) c = 126; +++ putc(c, pileup_fp); +++ break; +++ case MPLP_PRINT_RNEXT: +++ if (p->b->core.mtid >= 0) +++ fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); +++ else +++ putc('*', pileup_fp); +++ break; +++ case MPLP_PRINT_PNEXT: +++ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); +++ break; +++ } +++ } +++ if (!n) putc('*', pileup_fp); ++ } ++- if (!n) putc('*', pileup_fp); +++ flag_value <<= 1; ++ } ++ ++- if (conf->flag & MPLP_PRINT_QNAME) { ++- n = 0; ++- putc('\t', pileup_fp); ++- for (j = 0; j < n_plp[i]; ++j) { ++- const bam_pileup1_t *p = &plp[i][j]; ++- int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; ++- if ( c < conf->min_baseQ ) continue; ++- ++- if (n > 0) putc(',', pileup_fp); ++- fputs(bam_get_qname(p->b), pileup_fp); ++- n++; +++ /* Print selected tags */ +++ klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); +++ if (auxlist_p && auxlist_p->size) { +++ kliter_t(auxlist) *aux; +++ for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { +++ n = 0; +++ putc('\t', pileup_fp); +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = &plp[i][j]; +++ int c = p->qpos < p->b->core.l_qseq +++ ? 
bam_get_qual(p->b)[p->qpos] +++ : 0; +++ if ( c < conf->min_baseQ ) continue; +++ +++ if (n > 0) putc(conf->sep, pileup_fp); +++ n++; +++ uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); +++ if (!tag_u) { +++ putc(conf->empty , pileup_fp); +++ continue; +++ } +++ +++ /* Tag value is string */ +++ if (*tag_u == 'Z' || *tag_u == 'H') { +++ char *tag_s = bam_aux2Z(tag_u); +++ if (!tag_s) continue; +++ fputs(tag_s, pileup_fp); +++ } +++ +++ /* Tag value is integer */ +++ if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { +++ int64_t tag_i = bam_aux2i(tag_u); +++ fprintf(pileup_fp, "%" PRId64 "", tag_i); +++ } +++ +++ /* Tag value is float */ +++ if (*tag_u == 'd' || *tag_u == 'f') { +++ double tag_f = bam_aux2f(tag_u); +++ fprintf(pileup_fp, "%lf", tag_f); +++ } +++ +++ /* Tag value is character */ +++ if (*tag_u == 'A') { +++ char tag_c = bam_aux2A(tag_u); +++ putc(tag_c, pileup_fp); +++ } +++ } +++ if (!n) putc('*', pileup_fp); ++ } ++- if (!n) putc('*', pileup_fp); ++ } ++ } ++ } ++@@ -744,12 +937,12 @@ ++ last_pos = beg0-1; ++ mplp_get_ref(data[0], tid0, &ref, &ref_len); ++ } ++- while (last_tid >= 0 && last_tid < h->n_targets) { ++- while (++last_pos < h->target_len[last_tid]) { +++ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ if (last_pos >= end0) break; ++- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); ++ } ++ last_tid++; ++ last_pos = -1; ++@@ -758,6 +951,7 @@ ++ } ++ } ++ +++fail: ++ // clean up ++ free(bc.tmp.s); ++ bcf_destroy1(bcf_rec); ++@@ -779,7 +973,7 @@ ++ free(gplp.plp); 
free(gplp.n_plp); free(gplp.m_plp); ++ bcf_call_del_rghash(rghash); ++ bam_mplp_destroy(iter); ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ for (i = 0; i < n; ++i) { ++ sam_close(data[i]->fp); ++ if (data[i]->iter) hts_itr_destroy(data[i]->iter); ++@@ -922,17 +1116,22 @@ ++ " [%s]\n", tmp_filter); ++ fprintf(fp, ++ " -x, --ignore-overlaps disable read-pair overlap detection\n" +++" -X, --customized-index use customized index files\n" // -X flag for index filename ++ "\n" ++ "Output options:\n" ++ " -o, --output FILE write output to FILE [standard output]\n" ++ " -O, --output-BP output base positions on reads\n" ++ " -s, --output-MQ output mapping quality\n" ++ " --output-QNAME output read names\n" +++" --output-extra STR output extra read fields and read tag values\n" +++" --output-sep CHAR set the separator character for tag lists [,]\n" +++" --output-empty CHAR set the no value character for tag lists [*]\n" +++" --reverse-del use '#' character for deletions on the reverse strand\n" ++ " -a output all positions (including zero depth)\n" ++ " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" ++ "\n" ++ "Generic options:\n"); ++- sam_global_opt_help(fp, "-.--.-"); +++ sam_global_opt_help(fp, "-.--.--."); ++ ++ fprintf(fp, "\n" ++ "Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" ++@@ -952,7 +1151,7 @@ ++ int c; ++ const char *file_list = NULL; ++ char **fn = NULL; ++- int nfiles = 0, use_orphan = 0; +++ int nfiles = 0, use_orphan = 0, has_index_file = 0; ++ mplp_conf_t mplp; ++ memset(&mplp, 0, sizeof(mplp_conf_t)); ++ mplp.min_baseQ = 13; ++@@ -966,6 +1165,9 @@ ++ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; ++ mplp.output_fname = NULL; ++ mplp.all = 0; +++ mplp.rev_del = 0; +++ mplp.sep = ','; +++ mplp.empty = '*'; ++ sam_global_args_init(&mplp.ga); ++ ++ static const struct option lopts[] = ++@@ -1020,9 +1222,15 @@ ++ {"per-sample-mF", no_argument, NULL, 'p'}, ++ {"per-sample-mf", no_argument, NULL, 'p'}, ++ {"platforms", required_argument, NULL, 'P'}, +++ {"customized-index", no_argument, NULL, 'X'}, +++ {"reverse-del", no_argument, NULL, 6}, +++ {"output-extra", required_argument, NULL, 7}, +++ {"output-sep", required_argument, NULL, 8}, +++ {"output-empty", required_argument, NULL, 9}, ++ {NULL, 0, NULL, 0} ++ }; ++- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { +++ +++ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { ++ switch (c) { ++ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; ++ case 1 : ++@@ -1036,6 +1244,15 @@ ++ case 3 : mplp.output_fname = optarg; break; ++ case 4 : mplp.openQ = atoi(optarg); break; ++ case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; +++ case 6 : mplp.rev_del = 1; break; +++ case 7 : +++ if (build_auxlist(&mplp, optarg) != 0) { +++ fprintf(stderr,"Could not build aux list using '%s'\n", optarg); +++ return 1; +++ } +++ break; +++ case 8: mplp.sep = optarg[0]; break; +++ case 9: mplp.empty = optarg[0]; break; ++ case 'f': ++ 
mplp.fai = fai_load(optarg); ++ if (mplp.fai == NULL) return 1; ++@@ -1056,6 +1273,7 @@ ++ case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; ++ case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; ++ case 'B': mplp.flag &= ~MPLP_REALN; break; +++ case 'X': has_index_file = 1; break; ++ case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; ++ case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; ++ case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; ++@@ -1064,7 +1282,7 @@ ++ case '6': mplp.flag |= MPLP_ILLUMINA13; break; ++ case 'R': mplp.flag |= MPLP_IGNORE_RG; break; ++ case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; ++- case 'O': mplp.flag |= MPLP_PRINT_POS; break; +++ case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; ++ case 'C': mplp.capQ_thres = atoi(optarg); break; ++ case 'q': mplp.min_mq = atoi(optarg); break; ++ case 'Q': mplp.min_baseQ = atoi(optarg); break; ++@@ -1129,16 +1347,32 @@ ++ } ++ int ret; ++ if (file_list) { +++ if (has_index_file) { +++ fprintf(stderr,"Error: The -b option cannot be combined with -X\n"); // No customize index loc in file list mode +++ return 1; +++ } ++ if ( read_file_list(file_list,&nfiles,&fn) ) return 1; ++- ret = mpileup(&mplp,nfiles,fn); +++ ret = mpileup(&mplp,nfiles,fn,NULL); ++ for (c=0; c ++@@ -38,14 +38,19 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include +++#include ++ #include ++-#include "sam_header.h" ++ #include "samtools.h" +++#include "bedidx.h" ++ #include "sam_opts.h" ++ +++#define dummy_free(p) +++KLIST_INIT(auxlist, char *, dummy_free) +++ ++ static inline int printw(int c, FILE *fp) ++ { ++ char buf[16]; ++@@ -61,7 +66,9 @@ ++ return 0; ++ } ++ ++-static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +++static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, +++ hts_pos_t ref_len, const char *ref, kstring_t *ks, +++ int rev_del) ++ { ++ int j; ++ 
if (p->is_head) { ++@@ -81,21 +88,31 @@ ++ else c = bam_is_rev(p->b)? tolower(c) : toupper(c); ++ } ++ putc(c, fp); ++- } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); +++ } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp); +++ int del_len = -p->indel; ++ if (p->indel > 0) { ++- putc('+', fp); printw(p->indel, fp); ++- for (j = 1; j <= p->indel; ++j) { ++- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; ++- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); +++ int len = bam_plp_insertion(p, ks, &del_len); +++ if (len < 0) +++ return -1; +++ putc('+', fp); printw(len, fp); +++ if (bam_is_rev(p->b)) { +++ char pad = rev_del ? '#' : '*'; +++ for (j = 0; j < len; j++) +++ putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp); +++ } else { +++ for (j = 0; j < len; j++) +++ putc(toupper(ks->s[j]), fp); ++ } ++- } else if (p->indel < 0) { ++- printw(p->indel, fp); ++- for (j = 1; j <= -p->indel; ++j) { +++ } +++ if (del_len > 0) { +++ printw(-del_len, fp); +++ for (j = 1; j <= del_len; ++j) { ++ int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; ++ putc(bam_is_rev(p->b)? 
tolower(c) : toupper(c), fp); ++ } ++ } ++ if (p->is_tail) putc('$', fp); +++ return 0; ++ } ++ ++ #include ++@@ -111,36 +128,43 @@ ++ #define MPLP_REDO_BAQ (1<<6) ++ #define MPLP_ILLUMINA13 (1<<7) ++ #define MPLP_IGNORE_RG (1<<8) ++-#define MPLP_PRINT_POS (1<<9) ++-#define MPLP_PRINT_MAPQ (1<<10) +++#define MPLP_PRINT_QPOS (1<<9) ++ #define MPLP_PER_SAMPLE (1<<11) ++ #define MPLP_SMART_OVERLAPS (1<<12) +++ ++ #define MPLP_PRINT_QNAME (1<<13) +++#define MPLP_PRINT_FLAG (1<<14) +++#define MPLP_PRINT_RNAME (1<<15) +++#define MPLP_PRINT_POS (1<<16) +++#define MPLP_PRINT_MAPQ (1<<17) +++#define MPLP_PRINT_CIGAR (1<<18) +++#define MPLP_PRINT_RNEXT (1<<19) +++#define MPLP_PRINT_PNEXT (1<<20) +++#define MPLP_PRINT_TLEN (1<<21) +++#define MPLP_PRINT_SEQ (1<<22) +++#define MPLP_PRINT_QUAL (1<<23) ++ ++ #define MPLP_MAX_DEPTH 8000 ++ #define MPLP_MAX_INDEL_DEPTH 250 ++ ++-void *bed_read(const char *fn); ++-void bed_destroy(void *_h); ++-int bed_overlap(const void *_h, const char *chr, int beg, int end); ++- ++ typedef struct { ++- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; +++ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; ++ int rflag_require, rflag_filter; ++ int openQ, extQ, tandemQ, min_support; // for indels ++ double min_frac; // for indels ++ char *reg, *pl_list, *fai_fname, *output_fname; ++ faidx_t *fai; ++- void *bed, *rghash; +++ void *bed, *rghash, *auxlist; ++ int argc; ++ char **argv; +++ char sep, empty; ++ sam_global_args ga; ++ } mplp_conf_t; ++ ++ typedef struct { ++ char *ref[2]; ++ int ref_id[2]; ++- int ref_len[2]; +++ hts_pos_t ref_len[2]; ++ } mplp_ref_t; ++ ++ #define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} ++@@ -148,7 +172,7 @@ ++ typedef struct { ++ samFile *fp; ++ hts_itr_t *iter; ++- bam_hdr_t *h; +++ sam_hdr_t *h; ++ mplp_ref_t *ref; ++ const mplp_conf_t *conf; ++ } mplp_aux_t; ++@@ -159,7 +183,54 @@ ++ bam_pileup1_t **plp; ++ } mplp_pileup_t; ++ ++-static 
int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { +++static int build_auxlist(mplp_conf_t *conf, char *optstring) { +++ if (!optstring) +++ return 0; +++ +++ void *colhash = khash_str2int_init(); +++ if (!colhash) +++ return 1; +++ +++ struct active_cols { +++ char *name; +++ int supported; +++ }; +++ +++ const struct active_cols colnames[11] = { +++ {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0} +++ }; +++ +++ int i, f = MPLP_PRINT_QNAME, colno = 11; +++ for (i = 0; i < colno; i++, f <<= 1) +++ if (colnames[i].supported) +++ khash_str2int_set(colhash, colnames[i].name, f); +++ +++ conf->auxlist = kl_init(auxlist); +++ if (!conf->auxlist) +++ return 1; +++ +++ char *save_p; +++ char *tag = strtok_r(optstring, ",", &save_p); +++ while (tag) { +++ if (khash_str2int_get(colhash, tag, &f) == 0) { +++ conf->flag |= f; +++ } else { +++ if (strlen(tag) != 2) { +++ fprintf(samtools_stderr, "[%s] tag '%s' has more than two characters or not supported\n", __func__, tag); +++ } else { +++ char **tag_p = kl_pushp(auxlist, conf->auxlist); +++ *tag_p = tag; +++ } +++ } +++ tag = strtok_r(NULL, ",", &save_p); +++ } +++ +++ khash_str2int_destroy(colhash); +++ +++ return 0; +++} +++ +++static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) { ++ mplp_ref_t *r = ma->ref; ++ ++ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); ++@@ -179,9 +250,10 @@ ++ } ++ if (tid == r->ref_id[1]) { ++ // Last, swap over ++- int tmp; ++- tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; ++- tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; +++ int tmp_id; +++ hts_pos_t tmp_len; +++ tmp_id = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp_id; +++ tmp_len = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp_len; ++ ++ char *tc; ++ tc = r->ref[0]; 
r->ref[0] = r->ref[1]; r->ref[1] = tc; ++@@ -197,10 +269,10 @@ ++ r->ref_len[1] = r->ref_len[0]; ++ ++ r->ref_id[0] = tid; ++- r->ref[0] = faidx_fetch_seq(ma->conf->fai, ++- ma->h->target_name[r->ref_id[0]], +++ r->ref[0] = faidx_fetch_seq64(ma->conf->fai, +++ sam_hdr_tid2name(ma->h, r->ref_id[0]), ++ 0, ++- INT_MAX, +++ HTS_POS_MAX, ++ &r->ref_len[0]); ++ ++ if (!r->ref[0]) { ++@@ -218,15 +290,25 @@ ++ ++ static void ++ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, ++- int pos, int n, const char *ref, int ref_len) +++ hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) ++ { ++ int i; ++- fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); +++ fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); ++ for (i = 0; i < n; ++i) { ++ fputs("\t0\t*\t*", fp); ++- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); ++- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); ++- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); +++ if (conf->flag & MPLP_PRINT_QPOS) +++ fputs("\t*", fp); +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) +++ fputs("\t*", fp); +++ flag_value <<= 1; +++ } +++ if (conf->auxlist) { +++ int t = 0; +++ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) +++ fputs("\t*", fp); +++ } ++ } ++ putc('\n', fp); ++ } ++@@ -235,7 +317,9 @@ ++ { ++ char *ref; ++ mplp_aux_t *ma = (mplp_aux_t*)data; ++- int ret, skip = 0, ref_len; +++ int ret, skip = 0; +++ hts_pos_t ref_len; +++ ++ do { ++ int has_ref; ++ ret = ma->iter? 
sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); ++@@ -249,7 +333,7 @@ ++ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } ++ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } ++ if (ma->conf->bed && ma->conf->all == 0) { // test overlap ++- skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); +++ skip = !bed_overlap(ma->conf->bed, sam_hdr_tid2name(ma->h, b->core.tid), b->core.pos, bam_endpos(b)); ++ if (skip) continue; ++ } ++ if (ma->conf->rghash) { // exclude read groups ++@@ -267,8 +351,8 @@ ++ if (ma->conf->fai && b->core.tid >= 0) { ++ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); ++ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence ++- fprintf(samtools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", ++- __func__, b->core.pos, ref_len, b->core.tid); +++ fprintf(samtools_stderr,"[%s] Skipping because %"PRIhts_pos" is outside of %"PRIhts_pos" [ref:%d]\n", +++ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); ++ skip = 1; ++ continue; ++ } ++@@ -321,17 +405,19 @@ ++ * @param conf configuration for this pileup ++ * @param n number of files specified in fn ++ * @param fn filenames +++ * @param fn_idx index filenames ++ */ ++-static int mpileup(mplp_conf_t *conf, int n, char **fn) +++static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) ++ { ++ extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); ++ extern void bcf_call_del_rghash(void *rghash); ++ mplp_aux_t **data; ++- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; +++ int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; +++ hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; ++ const bam_pileup1_t **plp; ++ mplp_ref_t mp_ref = MPLP_REF_INIT; ++ bam_mplp_t iter; ++- bam_hdr_t *h = NULL; /* 
header of first file in input list */ +++ sam_hdr_t *h = NULL; /* header of first file in input list */ ++ char *ref; ++ void *rghash = NULL; ++ FILE *pileup_fp = NULL; ++@@ -361,7 +447,7 @@ ++ ++ // read the header of each file in the list and initialize data ++ for (i = 0; i < n; ++i) { ++- bam_hdr_t *h_tmp; +++ sam_hdr_t *h_tmp; ++ data[i] = calloc(1, sizeof(mplp_aux_t)); ++ data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); ++ if ( !data[i]->fp ) ++@@ -385,13 +471,20 @@ ++ fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); ++ exit(EXIT_FAILURE); ++ } ++- bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); +++ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp)); ++ if (conf->flag & MPLP_BCF) { ++ // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) ++- rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); +++ rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); ++ } ++ if (conf->reg) { ++- hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx != NULL) { +++ idx = sam_index_load2(data[i]->fp, fn[i], fn_idx[i]); +++ } else { +++ idx = sam_index_load(data[i]->fp, fn[i]); +++ } +++ ++ if (idx == NULL) { ++ fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); ++ exit(EXIT_FAILURE); ++@@ -409,7 +502,7 @@ ++ if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file ++ else { ++ // FIXME: check consistency between h and h_tmp ++- bam_hdr_destroy(h_tmp); +++ sam_hdr_destroy(h_tmp); ++ ++ // we store only the first file's header; it's (alleged to be) ++ // compatible with the i-th file's target_name lookup needs ++@@ -461,10 +554,10 @@ ++ ++ // Translate BAM @SQ tags to BCF ##contig tags ++ // todo: use/write new BAM header manipulation routines, fill also UR, M5 ++- for (i=0; 
in_targets; i++) +++ for (i=0; i < sam_hdr_nref(h); i++) ++ { ++ str.l = 0; ++- ksprintf(&str, "##contig=", h->target_name[i], h->target_len[i]); +++ ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); ++ bcf_hdr_append(bcf_hdr, str.s); ++ } ++ free(str.s); ++@@ -517,7 +610,11 @@ ++ for (i=0; in; i++) ++ bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); ++ bcf_hdr_add_sample(bcf_hdr, NULL); ++- bcf_hdr_write(bcf_fp, bcf_hdr); +++ if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", +++ conf->output_fname? conf->output_fname : "standard output"); +++ exit(EXIT_FAILURE); +++ } ++ // End of BCF header creation ++ ++ // Initialise the calling algorithm ++@@ -576,16 +673,17 @@ ++ bam_mplp_set_maxcnt(iter, max_depth); ++ bcf1_t *bcf_rec = bcf_init1(); ++ int ret; ++- int last_tid = -1, last_pos = -1; +++ int last_tid = -1; +++ hts_pos_t last_pos = -1; ++ ++ // begin pileup ++- while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { +++ while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) { ++ if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested ++ mplp_get_ref(data[0], tid, &ref, &ref_len); ++ //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); ++ if (conf->flag & MPLP_BCF) { ++ int total_depth, _ref0, ref16; ++- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; +++ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; ++ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; ++ group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); ++ _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; ++@@ -597,7 +695,11 @@ ++ bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); ++ bcf_clear1(bcf_rec); ++ bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); ++- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); +++ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", +++ conf->output_fname?conf->output_fname:"standard output"); +++ exit(EXIT_FAILURE); +++ } ++ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? ++ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) ++ { ++@@ -607,7 +709,11 @@ ++ if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { ++ bcf_clear1(bcf_rec); ++ bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); ++- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); +++ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { +++ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", +++ conf->output_fname?conf->output_fname:"standard output"); +++ exit(EXIT_FAILURE); +++ } ++ } ++ } ++ } else { ++@@ -615,10 +721,10 @@ ++ // Deal with missing portions of previous tids ++ while (tid > last_tid) { ++ if (last_tid >= 0 && !conf->reg) { ++- while (++last_pos < h->target_len[last_tid]) { ++- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); ++ } ++ } ++ last_tid++; ++@@ -631,16 +737,16 @@ ++ // Deal with missing portion of current tid ++ while (++last_pos < pos) { ++ if (conf->reg && last_pos < beg0) continue; // out of range; skip ++- if 
(conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0) +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); ++ } ++ last_tid = tid; ++ last_pos = pos; ++ } ++- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; +++ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; ++ ++- fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); +++ fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); ++ for (i = 0; i < n; ++i) { ++ int j, cnt; ++ for (j = cnt = 0; j < n_plp[i]; ++j) { ++@@ -653,22 +759,40 @@ ++ fprintf(pileup_fp, "\t%d\t", cnt); ++ if (n_plp[i] == 0) { ++ fputs("*\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); ++- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); +++ if (conf->flag & MPLP_PRINT_QPOS) +++ fputs("\t*", pileup_fp); +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) +++ fputs("\t*", pileup_fp); +++ flag_value <<= 1; +++ } +++ if (conf->auxlist) { +++ int t = 0; +++ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) +++ fputs("\t*", pileup_fp); +++ } ++ } else { ++ int n = 0; +++ kstring_t ks = KS_INITIALIZE; ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++ int c = p->qpos < p->b->core.l_qseq ++ ? 
bam_get_qual(p->b)[p->qpos] ++ : 0; ++- if (c >= conf->min_baseQ) ++- n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); +++ if (c >= conf->min_baseQ) { +++ n++; +++ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { +++ ret = 1; +++ goto fail; +++ } +++ } ++ } ++ if (!n) putc('*', pileup_fp); ++ +++ /* Print base qualities */ ++ n = 0; +++ ks_free(&ks); ++ putc('\t', pileup_fp); ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++@@ -683,55 +807,124 @@ ++ } ++ if (!n) putc('*', pileup_fp); ++ ++- if (conf->flag & MPLP_PRINT_MAPQ) { +++ /* Print mpileup positions */ +++ if (conf->flag & MPLP_PRINT_QPOS) { ++ n = 0; ++ putc('\t', pileup_fp); ++ for (j = 0; j < n_plp[i]; ++j) { ++ const bam_pileup1_t *p = plp[i] + j; ++ int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; +++ ? bam_get_qual(p->b)[p->qpos] +++ : 0; ++ if ( c < conf->min_baseQ ) continue; ++- c = plp[i][j].b->core.qual + 33; ++- if (c > 126) c = 126; ++- putc(c, pileup_fp); +++ if (n > 0) putc(',', pileup_fp); ++ n++; +++ fprintf(pileup_fp, "%d", p->qpos + 1); ++ } ++ if (!n) putc('*', pileup_fp); ++ } ++ ++- if (conf->flag & MPLP_PRINT_POS) { ++- n = 0; ++- putc('\t', pileup_fp); ++- for (j = 0; j < n_plp[i]; ++j) { ++- const bam_pileup1_t *p = plp[i] + j; ++- int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; ++- if ( c < conf->min_baseQ ) continue; ++- ++- if (n > 0) putc(',', pileup_fp); ++- fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(samtools_stdout, ) is very slow... ++- n++; +++ /* Print selected columns */ +++ int flag_value = MPLP_PRINT_QNAME; +++ while(flag_value < MPLP_PRINT_QUAL + 1) { +++ if (conf->flag & flag_value) { +++ n = 0; +++ putc('\t', pileup_fp); +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = &plp[i][j]; +++ int c = p->qpos < p->b->core.l_qseq +++ ? 
bam_get_qual(p->b)[p->qpos] +++ : 0; +++ if ( c < conf->min_baseQ ) continue; +++ if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); +++ n++; +++ +++ switch (flag_value) { +++ case MPLP_PRINT_QNAME: +++ fputs(bam_get_qname(p->b), pileup_fp); +++ break; +++ case MPLP_PRINT_FLAG: +++ fprintf(pileup_fp, "%d", p->b->core.flag); +++ break; +++ case MPLP_PRINT_RNAME: +++ if (p->b->core.tid >= 0) +++ fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); +++ else +++ putc('*', pileup_fp); +++ break; +++ case MPLP_PRINT_POS: +++ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); +++ break; +++ case MPLP_PRINT_MAPQ: +++ c = p->b->core.qual + 33; +++ if (c > 126) c = 126; +++ putc(c, pileup_fp); +++ break; +++ case MPLP_PRINT_RNEXT: +++ if (p->b->core.mtid >= 0) +++ fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); +++ else +++ putc('*', pileup_fp); +++ break; +++ case MPLP_PRINT_PNEXT: +++ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); +++ break; +++ } +++ } +++ if (!n) putc('*', pileup_fp); ++ } ++- if (!n) putc('*', pileup_fp); +++ flag_value <<= 1; ++ } ++ ++- if (conf->flag & MPLP_PRINT_QNAME) { ++- n = 0; ++- putc('\t', pileup_fp); ++- for (j = 0; j < n_plp[i]; ++j) { ++- const bam_pileup1_t *p = &plp[i][j]; ++- int c = p->qpos < p->b->core.l_qseq ++- ? bam_get_qual(p->b)[p->qpos] ++- : 0; ++- if ( c < conf->min_baseQ ) continue; ++- ++- if (n > 0) putc(',', pileup_fp); ++- fputs(bam_get_qname(p->b), pileup_fp); ++- n++; +++ /* Print selected tags */ +++ klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); +++ if (auxlist_p && auxlist_p->size) { +++ kliter_t(auxlist) *aux; +++ for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { +++ n = 0; +++ putc('\t', pileup_fp); +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = &plp[i][j]; +++ int c = p->qpos < p->b->core.l_qseq +++ ? 
bam_get_qual(p->b)[p->qpos] +++ : 0; +++ if ( c < conf->min_baseQ ) continue; +++ +++ if (n > 0) putc(conf->sep, pileup_fp); +++ n++; +++ uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); +++ if (!tag_u) { +++ putc(conf->empty , pileup_fp); +++ continue; +++ } +++ +++ /* Tag value is string */ +++ if (*tag_u == 'Z' || *tag_u == 'H') { +++ char *tag_s = bam_aux2Z(tag_u); +++ if (!tag_s) continue; +++ fputs(tag_s, pileup_fp); +++ } +++ +++ /* Tag value is integer */ +++ if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { +++ int64_t tag_i = bam_aux2i(tag_u); +++ fprintf(pileup_fp, "%" PRId64 "", tag_i); +++ } +++ +++ /* Tag value is float */ +++ if (*tag_u == 'd' || *tag_u == 'f') { +++ double tag_f = bam_aux2f(tag_u); +++ fprintf(pileup_fp, "%lf", tag_f); +++ } +++ +++ /* Tag value is character */ +++ if (*tag_u == 'A') { +++ char tag_c = bam_aux2A(tag_u); +++ putc(tag_c, pileup_fp); +++ } +++ } +++ if (!n) putc('*', pileup_fp); ++ } ++- if (!n) putc('*', pileup_fp); ++ } ++ } ++ } ++@@ -746,12 +939,12 @@ ++ last_pos = beg0-1; ++ mplp_get_ref(data[0], tid0, &ref, &ref_len); ++ } ++- while (last_tid >= 0 && last_tid < h->n_targets) { ++- while (++last_pos < h->target_len[last_tid]) { +++ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { +++ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { ++ if (last_pos >= end0) break; ++- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) +++ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) ++ continue; ++- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); +++ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); ++ } ++ last_tid++; ++ last_pos = -1; ++@@ -760,6 +953,7 @@ ++ } ++ } ++ +++fail: ++ // clean up ++ free(bc.tmp.s); ++ bcf_destroy1(bcf_rec); ++@@ -781,7 +975,7 @@ ++ free(gplp.plp); 
free(gplp.n_plp); free(gplp.m_plp); ++ bcf_call_del_rghash(rghash); ++ bam_mplp_destroy(iter); ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ for (i = 0; i < n; ++i) { ++ sam_close(data[i]->fp); ++ if (data[i]->iter) hts_itr_destroy(data[i]->iter); ++@@ -924,17 +1118,22 @@ ++ " [%s]\n", tmp_filter); ++ fprintf(fp, ++ " -x, --ignore-overlaps disable read-pair overlap detection\n" +++" -X, --customized-index use customized index files\n" // -X flag for index filename ++ "\n" ++ "Output options:\n" ++ " -o, --output FILE write output to FILE [standard output]\n" ++ " -O, --output-BP output base positions on reads\n" ++ " -s, --output-MQ output mapping quality\n" ++ " --output-QNAME output read names\n" +++" --output-extra STR output extra read fields and read tag values\n" +++" --output-sep CHAR set the separator character for tag lists [,]\n" +++" --output-empty CHAR set the no value character for tag lists [*]\n" +++" --reverse-del use '#' character for deletions on the reverse strand\n" ++ " -a output all positions (including zero depth)\n" ++ " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" ++ "\n" ++ "Generic options:\n"); ++- sam_global_opt_help(fp, "-.--.-"); +++ sam_global_opt_help(fp, "-.--.--."); ++ ++ fprintf(fp, "\n" ++ "Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" ++@@ -954,7 +1153,7 @@ ++ int c; ++ const char *file_list = NULL; ++ char **fn = NULL; ++- int nfiles = 0, use_orphan = 0; +++ int nfiles = 0, use_orphan = 0, has_index_file = 0; ++ mplp_conf_t mplp; ++ memset(&mplp, 0, sizeof(mplp_conf_t)); ++ mplp.min_baseQ = 13; ++@@ -968,6 +1167,9 @@ ++ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; ++ mplp.output_fname = NULL; ++ mplp.all = 0; +++ mplp.rev_del = 0; +++ mplp.sep = ','; +++ mplp.empty = '*'; ++ sam_global_args_init(&mplp.ga); ++ ++ static const struct option lopts[] = ++@@ -1022,9 +1224,15 @@ ++ {"per-sample-mF", no_argument, NULL, 'p'}, ++ {"per-sample-mf", no_argument, NULL, 'p'}, ++ {"platforms", required_argument, NULL, 'P'}, +++ {"customized-index", no_argument, NULL, 'X'}, +++ {"reverse-del", no_argument, NULL, 6}, +++ {"output-extra", required_argument, NULL, 7}, +++ {"output-sep", required_argument, NULL, 8}, +++ {"output-empty", required_argument, NULL, 9}, ++ {NULL, 0, NULL, 0} ++ }; ++- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { +++ +++ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { ++ switch (c) { ++ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; ++ case 1 : ++@@ -1038,6 +1246,15 @@ ++ case 3 : mplp.output_fname = optarg; break; ++ case 4 : mplp.openQ = atoi(optarg); break; ++ case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; +++ case 6 : mplp.rev_del = 1; break; +++ case 7 : +++ if (build_auxlist(&mplp, optarg) != 0) { +++ fprintf(samtools_stderr,"Could not build aux list using '%s'\n", optarg); +++ return 1; +++ } +++ break; +++ case 8: mplp.sep = optarg[0]; break; +++ case 9: mplp.empty = optarg[0]; break; ++ 
case 'f': ++ mplp.fai = fai_load(optarg); ++ if (mplp.fai == NULL) return 1; ++@@ -1058,6 +1275,7 @@ ++ case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; ++ case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; ++ case 'B': mplp.flag &= ~MPLP_REALN; break; +++ case 'X': has_index_file = 1; break; ++ case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; ++ case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; ++ case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; ++@@ -1066,7 +1284,7 @@ ++ case '6': mplp.flag |= MPLP_ILLUMINA13; break; ++ case 'R': mplp.flag |= MPLP_IGNORE_RG; break; ++ case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; ++- case 'O': mplp.flag |= MPLP_PRINT_POS; break; +++ case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; ++ case 'C': mplp.capQ_thres = atoi(optarg); break; ++ case 'q': mplp.min_mq = atoi(optarg); break; ++ case 'Q': mplp.min_baseQ = atoi(optarg); break; ++@@ -1131,16 +1349,32 @@ ++ } ++ int ret; ++ if (file_list) { +++ if (has_index_file) { +++ fprintf(samtools_stderr,"Error: The -b option cannot be combined with -X\n"); // No customize index loc in file list mode +++ return 1; +++ } ++ if ( read_file_list(file_list,&nfiles,&fn) ) return 1; ++- ret = mpileup(&mplp,nfiles,fn); +++ ret = mpileup(&mplp,nfiles,fn,NULL); ++ for (c=0; c ++ ++@@ -46,6 +46,7 @@ ++ "Options:\n" ++ " -v verbose output (repeat for more verbosity)\n" ++ " -q suppress warning messages\n" +++" -u unmapped input (do not require targets in header)\n" ++ "\n" ++ "Notes:\n" ++ "\n" ++@@ -77,13 +78,16 @@ ++ ++ int main_quickcheck(int argc, char** argv) ++ { ++- int verbose = 0, quiet = 0; +++ int verbose = 0, quiet = 0, unmapped = 0; ++ hts_verbose = 0; ++ ++- const char* optstring = "vq"; +++ const char* optstring = "vqu"; ++ int opt; ++ while ((opt = getopt(argc, argv, optstring)) != -1) { ++ switch (opt) { +++ case 'u': +++ unmapped = 1; +++ break; ++ case 'v': ++ verbose++; ++ break; ++@@ -136,17 +140,17 @@ ++ else { 
++ if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn); ++ // check header ++- bam_hdr_t *header = sam_hdr_read(hts_fp); +++ sam_hdr_t *header = sam_hdr_read(hts_fp); ++ if (header == NULL) { ++ QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); ++ } else { ++- if (header->n_targets <= 0) { +++ if (!unmapped && sam_hdr_nref(header) <= 0) { ++ QC_ERR(QC_BAD_HEADER, 2, "%s had no targets in header.\n", fn); ++ } ++ else { ++- if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets); +++ if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, sam_hdr_nref(header)); ++ } ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ } ++ } ++ // check EOF on formats that support this ++--- python-pysam.orig/samtools/bam_quickcheck.c.pysam.c +++++ python-pysam/samtools/bam_quickcheck.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_quickcheck.c -- quickcheck subcommand. ++ ++- Copyright (C) 2015 Genome Research Ltd. +++ Copyright (C) 2015-2017 Genome Research Ltd. ++ ++ Author: Joshua C. 
Randall ++ ++@@ -48,6 +48,7 @@ ++ "Options:\n" ++ " -v verbose output (repeat for more verbosity)\n" ++ " -q suppress warning messages\n" +++" -u unmapped input (do not require targets in header)\n" ++ "\n" ++ "Notes:\n" ++ "\n" ++@@ -79,13 +80,16 @@ ++ ++ int main_quickcheck(int argc, char** argv) ++ { ++- int verbose = 0, quiet = 0; +++ int verbose = 0, quiet = 0, unmapped = 0; ++ hts_verbose = 0; ++ ++- const char* optstring = "vq"; +++ const char* optstring = "vqu"; ++ int opt; ++ while ((opt = getopt(argc, argv, optstring)) != -1) { ++ switch (opt) { +++ case 'u': +++ unmapped = 1; +++ break; ++ case 'v': ++ verbose++; ++ break; ++@@ -138,17 +142,17 @@ ++ else { ++ if (verbose >= 3) fprintf(samtools_stderr, "%s is sequence data\n", fn); ++ // check header ++- bam_hdr_t *header = sam_hdr_read(hts_fp); +++ sam_hdr_t *header = sam_hdr_read(hts_fp); ++ if (header == NULL) { ++ QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); ++ } else { ++- if (header->n_targets <= 0) { +++ if (!unmapped && sam_hdr_nref(header) <= 0) { ++ QC_ERR(QC_BAD_HEADER, 2, "%s had no targets in header.\n", fn); ++ } ++ else { ++- if (verbose >= 3) fprintf(samtools_stderr, "%s has %d targets in header.\n", fn, header->n_targets); +++ if (verbose >= 3) fprintf(samtools_stderr, "%s has %d targets in header.\n", fn, sam_hdr_nref(header)); ++ } ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ } ++ } ++ // check EOF on formats that support this ++--- python-pysam.orig/samtools/bam_reheader.c +++++ python-pysam/samtools/bam_reheader.c ++@@ -1,7 +1,7 @@ ++ /* bam_reheader.c -- reheader subcommand. ++ ++ Copyright (C) 2010 Broad Institute. ++- Copyright (C) 2012-2015 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -29,6 +29,7 @@ ++ #include ++ #include ++ #include +++#include ++ ++ #include "htslib/bgzf.h" ++ #include "htslib/sam.h" ++@@ -42,50 +43,44 @@ ++ * Reads a file and outputs a new BAM file to fd with 'h' replaced as ++ * the header. No checks are made to the validity. ++ */ ++-int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, ++- const char *arg_list, int add_PG) +++int bam_reheader(BGZF *in, sam_hdr_t *h, int fd, +++ const char *arg_list, int no_pg, int skip_header) ++ { ++ BGZF *fp = NULL; ++ ssize_t len; ++ uint8_t *buf = NULL; ++- SAM_hdr *sh = NULL; +++ sam_hdr_t *tmp; +++ if (!h) +++ return -1; +++ ++ if (in->is_write) return -1; ++ buf = malloc(BUF_SIZE); ++ if (!buf) { ++ fprintf(stderr, "Out of memory\n"); ++ return -1; ++ } ++- if (bam_hdr_read(in) == NULL) { ++- fprintf(stderr, "Couldn't read header\n"); ++- goto fail; +++ +++ if (!skip_header) { +++ if ((tmp = bam_hdr_read(in)) == NULL) { +++ fprintf(stderr, "Couldn't read header\n"); +++ goto fail; +++ } +++ sam_hdr_destroy(tmp); ++ } +++ ++ fp = bgzf_fdopen(fd, "w"); ++ if (!fp) { ++ print_error_errno("reheader", "Couldn't open output file"); ++ goto fail; ++ } ++ ++- if (add_PG) { ++- // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. ++- sh = sam_hdr_parse_(h->text, h->l_text); ++- if (!sh) ++- goto fail; ++- if (sam_hdr_add_PG(sh, "samtools", +++ if (!no_pg && sam_hdr_add_pg(h, "samtools", ++ "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? 
arg_list : NULL, ++ NULL) != 0) ++ goto fail; ++ ++- free(h->text); ++- h->text = strdup(sam_hdr_str(sh)); ++- h->l_text = sam_hdr_length(sh); ++- if (!h->text) ++- goto fail; ++- sam_hdr_free(sh); ++- sh = NULL; ++- } ++- ++ if (bam_hdr_write(fp, h) < 0) { ++ print_error_errno("reheader", "Couldn't write header"); ++ goto fail; ++@@ -114,7 +109,6 @@ ++ fail: ++ bgzf_close(fp); ++ free(buf); ++- sam_hdr_free(sh); ++ return -1; ++ } ++ ++@@ -124,32 +118,28 @@ ++ * ++ * FIXME: error checking ++ */ ++-int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG) +++int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) ++ { ++ htsFile *h_out = hts_open("-", "wc"); ++ cram_fd *out = h_out->fp.cram; ++ cram_container *c = NULL; ++ int ret = -1; +++ if (!h) +++ return ret; ++ ++ // Attempt to fill out a cram->refs[] array from @SQ headers ++- cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text)); ++- if (add_PG) { ++- if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools", +++ sam_hdr_t *cram_h = sam_hdr_dup(h); +++ if (!cram_h) +++ return -1; +++ cram_fd_set_header(out, cram_h); +++ if (!no_pg && sam_hdr_add_pg(cram_fd_get_header(out), "samtools", ++ "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? 
arg_list : NULL, ++- NULL) != 0) +++ NULL)) ++ goto err; ++ ++- // Covert back to bam_hdr_t struct ++- free(h->text); ++- h->text = strdup(sam_hdr_str(cram_fd_get_header(out))); ++- h->l_text = sam_hdr_length(cram_fd_get_header(out)); ++- if (!h->text) ++- goto err; ++- } ++- ++- if (sam_hdr_write(h_out, h) != 0) +++ if (sam_hdr_write(h_out, cram_h) != 0) ++ goto err; ++ cram_set_option(out, CRAM_OPT_REFERENCE, NULL); ++ ++@@ -192,14 +182,16 @@ ++ * -1 on general failure; ++ * -2 on failure due to insufficient size ++ */ ++-int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace2(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ cram_container *c = NULL; ++ cram_block *b = NULL; ++- SAM_hdr *hdr = NULL; +++ sam_hdr_t *cram_h = NULL; ++ off_t start; ++ int ret = -1; +++ if (!h) +++ goto err; ++ ++ if (cram_major_vers(fd) < 2 || ++ cram_major_vers(fd) > 3) { ++@@ -208,16 +200,17 @@ ++ goto err; ++ } ++ ++- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) +++ cram_h = sam_hdr_dup(h); +++ if (!cram_h) ++ goto err; ++ ++- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), +++ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? arg_list : NULL, ++ NULL)) ++ goto err; ++ ++- int header_len = sam_hdr_length(hdr); +++ int header_len = sam_hdr_length(cram_h); ++ /* Fix M5 strings? 
Maybe out of scope for this tool */ ++ ++ // Load the existing header ++@@ -244,7 +237,7 @@ ++ ++ cram_block_set_offset(b, 0); // rewind block ++ int32_put_blk(b, header_len); ++- cram_block_append(b, sam_hdr_str(hdr), header_len); +++ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); ++ // Zero the remaining block ++ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, ++ cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); ++@@ -265,7 +258,7 @@ ++ err: ++ if (c) cram_free_container(c); ++ if (b) cram_free_block(b); ++- if (hdr) sam_hdr_free(hdr); +++ if (cram_h) sam_hdr_destroy(cram_h); ++ ++ return ret; ++ } ++@@ -286,16 +279,18 @@ ++ * -1 on general failure; ++ * -2 on failure due to insufficient size ++ */ ++-int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ cram_container *c = NULL; ++ cram_block *b = NULL; ++- SAM_hdr *hdr = NULL; +++ sam_hdr_t *cram_h = NULL; ++ off_t start, sz, end; ++ int container_sz, max_container_sz; ++ char *buf = NULL; ++ int ret = -1; +++ if (!h) +++ goto err; ++ ++ if (cram_major_vers(fd) < 2 || ++ cram_major_vers(fd) > 3) { ++@@ -304,16 +299,17 @@ ++ goto err; ++ } ++ ++- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) +++ cram_h = sam_hdr_dup(h); +++ if (!cram_h) ++ goto err; ++ ++- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), +++ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? arg_list : NULL, ++ NULL)) ++ goto err; ++ ++- int header_len = sam_hdr_length(hdr); +++ int header_len = sam_hdr_length(cram_h); ++ /* Fix M5 strings? 
Maybe out of scope for this tool */ ++ ++ // Find current size of SAM header block ++@@ -381,7 +377,7 @@ ++ // Version 3.0 supports compressed header ++ b = cram_new_block(FILE_HEADER, 0); ++ int32_put_blk(b, header_len); ++- cram_block_append(b, sam_hdr_str(hdr), header_len); +++ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); ++ cram_block_update_size(b); ++ ++ cram_compress_block(fd, b, NULL, -1, -1); ++@@ -416,17 +412,17 @@ ++ if (c) cram_free_container(c); ++ if (buf) free(buf); ++ if (b) cram_free_block(b); ++- if (hdr) sam_hdr_free(hdr); +++ if (cram_h) sam_hdr_destroy(cram_h); ++ ++ return ret; ++ } ++ ++-int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ switch (cram_major_vers(fd)) { ++- case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); ++- case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); +++ case 2: return cram_reheader_inplace2(fd, h, arg_list, no_pg); +++ case 3: return cram_reheader_inplace3(fd, h, arg_list, no_pg); ++ default: ++ fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, ++ cram_major_vers(fd)); ++@@ -437,33 +433,124 @@ ++ static void usage(FILE *fp, int ret) { ++ fprintf(fp, ++ "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n" ++- " or samtools reheader [-P] -i in.header.sam file.bam\n" +++ " or samtools reheader [-P] -i in.header.sam file.cram\n" +++ " or samtools reheader -c CMD in.bam\n" +++ " or samtools reheader -c CMD in.cram\n" ++ "\n" ++ "Options:\n" ++- " -P, --no-PG Do not generate an @PG header line.\n" ++- " -i, --in-place Modify the bam/cram file directly.\n" ++- " (Defaults to outputting to stdout.)\n"); +++ " -P, --no-PG Do not generate a @PG header line.\n" +++ " -i, --in-place Modify the CRAM file directly, if possible.\n" +++ " (Defaults to outputting to stdout.)\n" +++ " -c, --command CMD Pass the header in 
SAM format to external program CMD.\n"); ++ exit(ret); ++ } ++ +++static sam_hdr_t* external_reheader(samFile* in, const char* external) { +++ char *command = NULL; +++ sam_hdr_t* h = NULL; +++ sam_hdr_t* ih = sam_hdr_read(in); +++ if (ih == NULL) { +++ fprintf(stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); +++ return NULL; +++ } +++ char tmp_fn[] = "reheaderXXXXXX"; +++ int tmp_fd = mkstemp(tmp_fn); +++ if (tmp_fd < 0) { +++ print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); +++ return NULL; +++ } +++ hFILE* tmp_hf = hdopen(tmp_fd, "w"); +++ if (!tmp_hf) { +++ fprintf(stderr, "[%s] failed to convert to hFILE.\n", __func__); +++ goto cleanup; +++ } +++ samFile* tmp_sf = hts_hopen(tmp_hf, tmp_fn, "w"); +++ if (!tmp_sf) { +++ fprintf(stderr, "[%s] failed to convert to samFile.\n", __func__); +++ goto cleanup; +++ } +++ if (-1 == sam_hdr_write(tmp_sf, ih)) { +++ fprintf(stderr, "[%s] failed to write the header to the temp file.\n", __func__); +++ goto cleanup; +++ } +++ sam_close(tmp_sf); +++ sam_hdr_destroy(ih); +++ int comm_len = strlen(external) + strlen(tmp_fn) + 8; +++ command = calloc(comm_len, 1); +++ if (!command || snprintf(command, comm_len, "( %s ) < %s", external, tmp_fn) != comm_len - 1) { +++ fprintf(stderr, "[%s] failed to create command string.\n", __func__); +++ goto cleanup; +++ } +++ FILE* nh = popen(command, "r"); +++ if (!nh) { +++ print_error_errno("reheader", "[%s] failed to run external command '%s'.\n", __func__, command); +++ goto cleanup; +++ } +++ +++ int nh_fd = dup(fileno(nh)); +++ if (nh_fd < 0) { +++ fprintf(stderr, "[%s] failed to get the file descriptor.\n", __func__); +++ goto cleanup; +++ } +++ hFILE* nh_hf = hdopen(nh_fd, "r"); +++ if (!nh_hf) { +++ fprintf(stderr, "[%s] failed to convert to hFILE.\n", __func__); +++ goto cleanup; +++ } +++ samFile* nh_sf = hts_hopen(nh_hf, tmp_fn, "r"); +++ if (!nh_sf) { +++ fprintf(stderr, "[%s] failed to convert to samFile.\n", __func__); +++ goto 
cleanup; +++ } +++ +++ h = sam_hdr_read(nh_sf); +++ sam_close(nh_sf); +++ if (h == NULL) { +++ fprintf(stderr, "[%s] failed to read the header from the temp file.\n", __func__); +++ } +++ int res = pclose(nh); +++ if (res != 0) { +++ if (res < 0) { +++ print_error_errno("reheader", +++ "Error on closing pipe from command '%s'.\n", +++ command); +++ } else { +++ print_error("reheader", +++ "Non-zero exit code returned by command '%s'\n", +++ command); +++ } +++ if (h) sam_hdr_destroy(h); +++ h = NULL; +++ } +++cleanup: +++ free(command); +++ if (unlink(tmp_fn) != 0) { +++ print_error_errno("reheader", "failed to remove the temp file '%s'", tmp_fn); +++ } +++ +++ return h; +++} +++ ++ int main_reheader(int argc, char *argv[]) ++ { ++- int inplace = 0, r, add_PG = 1, c; ++- bam_hdr_t *h; +++ int inplace = 0, r, no_pg = 0, c, skip_header = 0; +++ sam_hdr_t *h; ++ samFile *in; ++- char *arg_list = stringify_argv(argc+1, argv-1); +++ char *arg_list = NULL, *external = NULL; ++ ++ static const struct option lopts[] = { ++ {"help", no_argument, NULL, 'h'}, ++ {"in-place", no_argument, NULL, 'i'}, ++ {"no-PG", no_argument, NULL, 'P'}, +++ {"command", required_argument, NULL, 'c'}, ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "hiPc:", lopts, NULL)) >= 0) { ++ switch (c) { ++- case 'P': add_PG = 0; break; +++ case 'P': no_pg = 1; break; ++ case 'i': inplace = 1; break; +++ case 'c': external = optarg; break; ++ case 'h': usage(stdout, 0); break; ++ default: ++ fprintf(stderr, "Invalid option '%c'\n", c); ++@@ -471,10 +558,29 @@ ++ } ++ } ++ ++- if (argc - optind != 2) +++ if ((argc - optind != 2 || external) && (argc - optind != 1 || !external)) ++ usage(stderr, 1); ++ ++- { // read the header +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("reheader", "failed to create arg_list"); +++ return 1; +++ } +++ +++ if (external) { +++ skip_header = 1; +++ 
in = sam_open(argv[optind], inplace?"r+":"r"); +++ if (in == 0) { +++ print_error_errno("reheader", "fail to open file '%s'", argv[optind]); +++ return 1; +++ } +++ +++ h = external_reheader(in, external); +++ if (h == NULL) { +++ fprintf(stderr, "[%s] failed to read the header from '%s'.\n", __func__, external); +++ sam_close(in); +++ return 1; +++ } +++ } else { // read the header from a separate file ++ samFile *fph = sam_open(argv[optind], "r"); ++ if (fph == 0) { ++ print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); ++@@ -487,25 +593,34 @@ ++ __func__, argv[1]); ++ return 1; ++ } +++ in = sam_open(argv[optind+1], inplace?"r+":"r"); +++ if (in == 0) { +++ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); +++ return 1; +++ } ++ } ++- in = sam_open(argv[optind+1], inplace?"r+":"r"); ++- if (in == 0) { ++- print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); ++- return 1; ++- } +++ ++ if (hts_get_format(in)->format == bam) { ++- r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG); ++- } else { +++ if (inplace) { +++ print_error("reheader", "cannot reheader BAM '%s' in-place", argv[optind+1]); +++ r = -1; +++ } else { +++ r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, no_pg, skip_header); +++ } +++ } else if (hts_get_format(in)->format == cram) { ++ if (inplace) ++- r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG); +++ r = cram_reheader_inplace(in->fp.cram, h, arg_list, no_pg); ++ else ++- r = cram_reheader(in->fp.cram, h, arg_list, add_PG); +++ r = cram_reheader(in->fp.cram, h, arg_list, no_pg); +++ } else { +++ print_error("reheader", "input file '%s' must be BAM or CRAM", argv[optind+1]); +++ r = -1; ++ } ++ ++ if (sam_close(in) != 0) ++ r = -1; ++ ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ ++ if (arg_list) ++ free(arg_list); ++--- python-pysam.orig/samtools/bam_reheader.c.pysam.c +++++ python-pysam/samtools/bam_reheader.c.pysam.c ++@@ -3,7 
+3,7 @@ ++ /* bam_reheader.c -- reheader subcommand. ++ ++ Copyright (C) 2010 Broad Institute. ++- Copyright (C) 2012-2015 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -31,6 +31,7 @@ ++ #include ++ #include ++ #include +++#include ++ ++ #include "htslib/bgzf.h" ++ #include "htslib/sam.h" ++@@ -44,50 +45,44 @@ ++ * Reads a file and outputs a new BAM file to fd with 'h' replaced as ++ * the header. No checks are made to the validity. ++ */ ++-int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, ++- const char *arg_list, int add_PG) +++int bam_reheader(BGZF *in, sam_hdr_t *h, int fd, +++ const char *arg_list, int no_pg, int skip_header) ++ { ++ BGZF *fp = NULL; ++ ssize_t len; ++ uint8_t *buf = NULL; ++- SAM_hdr *sh = NULL; +++ sam_hdr_t *tmp; +++ if (!h) +++ return -1; +++ ++ if (in->is_write) return -1; ++ buf = malloc(BUF_SIZE); ++ if (!buf) { ++ fprintf(samtools_stderr, "Out of memory\n"); ++ return -1; ++ } ++- if (bam_hdr_read(in) == NULL) { ++- fprintf(samtools_stderr, "Couldn't read header\n"); ++- goto fail; +++ +++ if (!skip_header) { +++ if ((tmp = bam_hdr_read(in)) == NULL) { +++ fprintf(samtools_stderr, "Couldn't read header\n"); +++ goto fail; +++ } +++ sam_hdr_destroy(tmp); ++ } +++ ++ fp = bgzf_fdopen(fd, "w"); ++ if (!fp) { ++ print_error_errno("reheader", "Couldn't open output file"); ++ goto fail; ++ } ++ ++- if (add_PG) { ++- // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. ++- sh = sam_hdr_parse_(h->text, h->l_text); ++- if (!sh) ++- goto fail; ++- if (sam_hdr_add_PG(sh, "samtools", +++ if (!no_pg && sam_hdr_add_pg(h, "samtools", ++ "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? 
arg_list : NULL, ++ NULL) != 0) ++ goto fail; ++ ++- free(h->text); ++- h->text = strdup(sam_hdr_str(sh)); ++- h->l_text = sam_hdr_length(sh); ++- if (!h->text) ++- goto fail; ++- sam_hdr_free(sh); ++- sh = NULL; ++- } ++- ++ if (bam_hdr_write(fp, h) < 0) { ++ print_error_errno("reheader", "Couldn't write header"); ++ goto fail; ++@@ -116,7 +111,6 @@ ++ fail: ++ bgzf_close(fp); ++ free(buf); ++- sam_hdr_free(sh); ++ return -1; ++ } ++ ++@@ -126,32 +120,28 @@ ++ * ++ * FIXME: error checking ++ */ ++-int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG) +++int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) ++ { ++ htsFile *h_out = hts_open("-", "wc"); ++ cram_fd *out = h_out->fp.cram; ++ cram_container *c = NULL; ++ int ret = -1; +++ if (!h) +++ return ret; ++ ++ // Attempt to fill out a cram->refs[] array from @SQ headers ++- cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text)); ++- if (add_PG) { ++- if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools", +++ sam_hdr_t *cram_h = sam_hdr_dup(h); +++ if (!cram_h) +++ return -1; +++ cram_fd_set_header(out, cram_h); +++ if (!no_pg && sam_hdr_add_pg(cram_fd_get_header(out), "samtools", ++ "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? 
arg_list : NULL, ++- NULL) != 0) +++ NULL)) ++ goto err; ++ ++- // Covert back to bam_hdr_t struct ++- free(h->text); ++- h->text = strdup(sam_hdr_str(cram_fd_get_header(out))); ++- h->l_text = sam_hdr_length(cram_fd_get_header(out)); ++- if (!h->text) ++- goto err; ++- } ++- ++- if (sam_hdr_write(h_out, h) != 0) +++ if (sam_hdr_write(h_out, cram_h) != 0) ++ goto err; ++ cram_set_option(out, CRAM_OPT_REFERENCE, NULL); ++ ++@@ -194,14 +184,16 @@ ++ * -1 on general failure; ++ * -2 on failure due to insufficient size ++ */ ++-int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace2(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ cram_container *c = NULL; ++ cram_block *b = NULL; ++- SAM_hdr *hdr = NULL; +++ sam_hdr_t *cram_h = NULL; ++ off_t start; ++ int ret = -1; +++ if (!h) +++ goto err; ++ ++ if (cram_major_vers(fd) < 2 || ++ cram_major_vers(fd) > 3) { ++@@ -210,16 +202,17 @@ ++ goto err; ++ } ++ ++- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) +++ cram_h = sam_hdr_dup(h); +++ if (!cram_h) ++ goto err; ++ ++- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), +++ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? arg_list : NULL, ++ NULL)) ++ goto err; ++ ++- int header_len = sam_hdr_length(hdr); +++ int header_len = sam_hdr_length(cram_h); ++ /* Fix M5 strings? 
Maybe out of scope for this tool */ ++ ++ // Load the existing header ++@@ -246,7 +239,7 @@ ++ ++ cram_block_set_offset(b, 0); // rewind block ++ int32_put_blk(b, header_len); ++- cram_block_append(b, sam_hdr_str(hdr), header_len); +++ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); ++ // Zero the remaining block ++ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, ++ cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); ++@@ -267,7 +260,7 @@ ++ err: ++ if (c) cram_free_container(c); ++ if (b) cram_free_block(b); ++- if (hdr) sam_hdr_free(hdr); +++ if (cram_h) sam_hdr_destroy(cram_h); ++ ++ return ret; ++ } ++@@ -288,16 +281,18 @@ ++ * -1 on general failure; ++ * -2 on failure due to insufficient size ++ */ ++-int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ cram_container *c = NULL; ++ cram_block *b = NULL; ++- SAM_hdr *hdr = NULL; +++ sam_hdr_t *cram_h = NULL; ++ off_t start, sz, end; ++ int container_sz, max_container_sz; ++ char *buf = NULL; ++ int ret = -1; +++ if (!h) +++ goto err; ++ ++ if (cram_major_vers(fd) < 2 || ++ cram_major_vers(fd) > 3) { ++@@ -306,16 +301,17 @@ ++ goto err; ++ } ++ ++- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) +++ cram_h = sam_hdr_dup(h); +++ if (!cram_h) ++ goto err; ++ ++- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), +++ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), ++ arg_list ? "CL": NULL, ++ arg_list ? arg_list : NULL, ++ NULL)) ++ goto err; ++ ++- int header_len = sam_hdr_length(hdr); +++ int header_len = sam_hdr_length(cram_h); ++ /* Fix M5 strings? 
Maybe out of scope for this tool */ ++ ++ // Find current size of SAM header block ++@@ -383,7 +379,7 @@ ++ // Version 3.0 supports compressed header ++ b = cram_new_block(FILE_HEADER, 0); ++ int32_put_blk(b, header_len); ++- cram_block_append(b, sam_hdr_str(hdr), header_len); +++ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); ++ cram_block_update_size(b); ++ ++ cram_compress_block(fd, b, NULL, -1, -1); ++@@ -418,17 +414,17 @@ ++ if (c) cram_free_container(c); ++ if (buf) free(buf); ++ if (b) cram_free_block(b); ++- if (hdr) sam_hdr_free(hdr); +++ if (cram_h) sam_hdr_destroy(cram_h); ++ ++ return ret; ++ } ++ ++-int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, ++- int add_PG) +++int cram_reheader_inplace(cram_fd *fd, sam_hdr_t *h, const char *arg_list, +++ int no_pg) ++ { ++ switch (cram_major_vers(fd)) { ++- case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); ++- case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); +++ case 2: return cram_reheader_inplace2(fd, h, arg_list, no_pg); +++ case 3: return cram_reheader_inplace3(fd, h, arg_list, no_pg); ++ default: ++ fprintf(samtools_stderr, "[%s] unsupported CRAM version %d\n", __func__, ++ cram_major_vers(fd)); ++@@ -439,33 +435,124 @@ ++ static void usage(FILE *fp, int ret) { ++ fprintf(fp, ++ "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n" ++- " or samtools reheader [-P] -i in.header.sam file.bam\n" +++ " or samtools reheader [-P] -i in.header.sam file.cram\n" +++ " or samtools reheader -c CMD in.bam\n" +++ " or samtools reheader -c CMD in.cram\n" ++ "\n" ++ "Options:\n" ++- " -P, --no-PG Do not generate an @PG header line.\n" ++- " -i, --in-place Modify the bam/cram file directly.\n" ++- " (Defaults to outputting to samtools_stdout.)\n"); +++ " -P, --no-PG Do not generate a @PG header line.\n" +++ " -i, --in-place Modify the CRAM file directly, if possible.\n" +++ " (Defaults to outputting to samtools_stdout.)\n" +++ " -c, 
--command CMD Pass the header in SAM format to external program CMD.\n"); ++ exit(ret); ++ } ++ +++static sam_hdr_t* external_reheader(samFile* in, const char* external) { +++ char *command = NULL; +++ sam_hdr_t* h = NULL; +++ sam_hdr_t* ih = sam_hdr_read(in); +++ if (ih == NULL) { +++ fprintf(samtools_stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); +++ return NULL; +++ } +++ char tmp_fn[] = "reheaderXXXXXX"; +++ int tmp_fd = mkstemp(tmp_fn); +++ if (tmp_fd < 0) { +++ print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); +++ return NULL; +++ } +++ hFILE* tmp_hf = hdopen(tmp_fd, "w"); +++ if (!tmp_hf) { +++ fprintf(samtools_stderr, "[%s] failed to convert to hFILE.\n", __func__); +++ goto cleanup; +++ } +++ samFile* tmp_sf = hts_hopen(tmp_hf, tmp_fn, "w"); +++ if (!tmp_sf) { +++ fprintf(samtools_stderr, "[%s] failed to convert to samFile.\n", __func__); +++ goto cleanup; +++ } +++ if (-1 == sam_hdr_write(tmp_sf, ih)) { +++ fprintf(samtools_stderr, "[%s] failed to write the header to the temp file.\n", __func__); +++ goto cleanup; +++ } +++ sam_close(tmp_sf); +++ sam_hdr_destroy(ih); +++ int comm_len = strlen(external) + strlen(tmp_fn) + 8; +++ command = calloc(comm_len, 1); +++ if (!command || snprintf(command, comm_len, "( %s ) < %s", external, tmp_fn) != comm_len - 1) { +++ fprintf(samtools_stderr, "[%s] failed to create command string.\n", __func__); +++ goto cleanup; +++ } +++ FILE* nh = popen(command, "r"); +++ if (!nh) { +++ print_error_errno("reheader", "[%s] failed to run external command '%s'.\n", __func__, command); +++ goto cleanup; +++ } +++ +++ int nh_fd = dup(fileno(nh)); +++ if (nh_fd < 0) { +++ fprintf(samtools_stderr, "[%s] failed to get the file descriptor.\n", __func__); +++ goto cleanup; +++ } +++ hFILE* nh_hf = hdopen(nh_fd, "r"); +++ if (!nh_hf) { +++ fprintf(samtools_stderr, "[%s] failed to convert to hFILE.\n", __func__); +++ goto cleanup; +++ } +++ samFile* nh_sf = hts_hopen(nh_hf, tmp_fn, "r"); +++ 
if (!nh_sf) { +++ fprintf(samtools_stderr, "[%s] failed to convert to samFile.\n", __func__); +++ goto cleanup; +++ } +++ +++ h = sam_hdr_read(nh_sf); +++ sam_close(nh_sf); +++ if (h == NULL) { +++ fprintf(samtools_stderr, "[%s] failed to read the header from the temp file.\n", __func__); +++ } +++ int res = pclose(nh); +++ if (res != 0) { +++ if (res < 0) { +++ print_error_errno("reheader", +++ "Error on closing pipe from command '%s'.\n", +++ command); +++ } else { +++ print_error("reheader", +++ "Non-zero exit code returned by command '%s'\n", +++ command); +++ } +++ if (h) sam_hdr_destroy(h); +++ h = NULL; +++ } +++cleanup: +++ free(command); +++ if (unlink(tmp_fn) != 0) { +++ print_error_errno("reheader", "failed to remove the temp file '%s'", tmp_fn); +++ } +++ +++ return h; +++} +++ ++ int main_reheader(int argc, char *argv[]) ++ { ++- int inplace = 0, r, add_PG = 1, c; ++- bam_hdr_t *h; +++ int inplace = 0, r, no_pg = 0, c, skip_header = 0; +++ sam_hdr_t *h; ++ samFile *in; ++- char *arg_list = stringify_argv(argc+1, argv-1); +++ char *arg_list = NULL, *external = NULL; ++ ++ static const struct option lopts[] = { ++ {"help", no_argument, NULL, 'h'}, ++ {"in-place", no_argument, NULL, 'i'}, ++ {"no-PG", no_argument, NULL, 'P'}, +++ {"command", required_argument, NULL, 'c'}, ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "hiPc:", lopts, NULL)) >= 0) { ++ switch (c) { ++- case 'P': add_PG = 0; break; +++ case 'P': no_pg = 1; break; ++ case 'i': inplace = 1; break; +++ case 'c': external = optarg; break; ++ case 'h': usage(samtools_stdout, 0); break; ++ default: ++ fprintf(samtools_stderr, "Invalid option '%c'\n", c); ++@@ -473,10 +560,29 @@ ++ } ++ } ++ ++- if (argc - optind != 2) +++ if ((argc - optind != 2 || external) && (argc - optind != 1 || !external)) ++ usage(samtools_stderr, 1); ++ ++- { // read the header +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, 
argv-1))) { +++ print_error("reheader", "failed to create arg_list"); +++ return 1; +++ } +++ +++ if (external) { +++ skip_header = 1; +++ in = sam_open(argv[optind], inplace?"r+":"r"); +++ if (in == 0) { +++ print_error_errno("reheader", "fail to open file '%s'", argv[optind]); +++ return 1; +++ } +++ +++ h = external_reheader(in, external); +++ if (h == NULL) { +++ fprintf(samtools_stderr, "[%s] failed to read the header from '%s'.\n", __func__, external); +++ sam_close(in); +++ return 1; +++ } +++ } else { // read the header from a separate file ++ samFile *fph = sam_open(argv[optind], "r"); ++ if (fph == 0) { ++ print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); ++@@ -489,25 +595,34 @@ ++ __func__, argv[1]); ++ return 1; ++ } +++ in = sam_open(argv[optind+1], inplace?"r+":"r"); +++ if (in == 0) { +++ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); +++ return 1; +++ } ++ } ++- in = sam_open(argv[optind+1], inplace?"r+":"r"); ++- if (in == 0) { ++- print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); ++- return 1; ++- } +++ ++ if (hts_get_format(in)->format == bam) { ++- r = bam_reheader(in->fp.bgzf, h, fileno(samtools_stdout), arg_list, add_PG); ++- } else { +++ if (inplace) { +++ print_error("reheader", "cannot reheader BAM '%s' in-place", argv[optind+1]); +++ r = -1; +++ } else { +++ r = bam_reheader(in->fp.bgzf, h, fileno(samtools_stdout), arg_list, no_pg, skip_header); +++ } +++ } else if (hts_get_format(in)->format == cram) { ++ if (inplace) ++- r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG); +++ r = cram_reheader_inplace(in->fp.cram, h, arg_list, no_pg); ++ else ++- r = cram_reheader(in->fp.cram, h, arg_list, add_PG); +++ r = cram_reheader(in->fp.cram, h, arg_list, no_pg); +++ } else { +++ print_error("reheader", "input file '%s' must be BAM or CRAM", argv[optind+1]); +++ r = -1; ++ } ++ ++ if (sam_close(in) != 0) ++ r = -1; ++ ++- bam_hdr_destroy(h); +++ 
sam_hdr_destroy(h); ++ ++ if (arg_list) ++ free(arg_list); ++--- python-pysam.orig/samtools/bam_rmdup.c +++++ python-pysam/samtools/bam_rmdup.c ++@@ -1,6 +1,6 @@ ++ /* bam_rmdup.c -- duplicate read detection. ++ ++- Copyright (C) 2009, 2015 Genome Research Ltd. +++ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009 Broad Institute. ++ ++ Author: Heng Li ++@@ -63,7 +63,7 @@ ++ stack->a[stack->n++] = b; ++ } ++ ++-static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) +++static inline int dump_best(tmp_stack_t *stack, samFile *out, sam_hdr_t *hdr) ++ { ++ int i; ++ for (i = 0; i != stack->n; ++i) { ++@@ -127,7 +127,7 @@ ++ return q; ++ } ++ ++-int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) +++int bam_rmdup_core(samFile *in, sam_hdr_t *hdr, samFile *out) ++ { ++ bam1_t *b = NULL; ++ int last_tid = -1, last_pos = -1, r; ++@@ -165,7 +165,7 @@ ++ break; ++ } ++ last_tid = c->tid; ++- fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); +++ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", sam_hdr_tid2name(hdr, c->tid)); ++ } ++ } ++ if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { ++@@ -179,13 +179,16 @@ ++ q = lib? 
get_aux(aux, lib) : get_aux(aux, "\t"); ++ ++q->n_checked; ++ k = kh_put(pos, q->best_hash, key, &ret); +++ if (ret < 0) goto fail; ++ if (ret == 0) { // found in best_hash ++ bam1_t *p = kh_val(q->best_hash, k); ++ ++q->n_removed; ++ if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle ++ kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed ++- bam_copy1(p, b); // replaced as b +++ if (ret < 0) goto fail; +++ if (bam_copy1(p, b) == NULL) goto fail; // replaced as b ++ } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed +++ if (ret < 0) goto fail; ++ if (ret == 0) ++ fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b)); ++ } else { // not found in best_hash ++@@ -250,7 +253,7 @@ ++ return 1; ++ } ++ ++-int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); +++int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se); ++ ++ static int rmdup_usage(void) { ++ fprintf(stderr, "\n"); ++@@ -258,7 +261,7 @@ ++ fprintf(stderr, "Option: -s rmdup for SE reads\n"); ++ fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); ++ ++- sam_global_opt_help(stderr, "-....-"); +++ sam_global_opt_help(stderr, "-....--."); ++ return 1; ++ } ++ ++@@ -266,7 +269,7 @@ ++ { ++ int c, ret, is_se = 0, force_se = 0; ++ samFile *in, *out; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ char wmode[3] = {'w', 'b', 0}; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++@@ -293,7 +296,7 @@ ++ return 1; ++ } ++ header = sam_hdr_read(in); ++- if (header == NULL || header->n_targets == 0) { +++ if (header == NULL || sam_hdr_nref(header) == 0) { ++ fprintf(stderr, "[bam_rmdup] input SAM does not have header. 
Abort!\n"); ++ return 1; ++ } ++@@ -312,7 +315,7 @@ ++ if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); ++ else ret = bam_rmdup_core(in, header, out); ++ ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(in); ++ if (sam_close(out) < 0) { ++ fprintf(stderr, "[bam_rmdup] error closing output file\n"); ++--- python-pysam.orig/samtools/bam_rmdup.c.pysam.c +++++ python-pysam/samtools/bam_rmdup.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_rmdup.c -- duplicate read detection. ++ ++- Copyright (C) 2009, 2015 Genome Research Ltd. +++ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009 Broad Institute. ++ ++ Author: Heng Li ++@@ -65,7 +65,7 @@ ++ stack->a[stack->n++] = b; ++ } ++ ++-static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) +++static inline int dump_best(tmp_stack_t *stack, samFile *out, sam_hdr_t *hdr) ++ { ++ int i; ++ for (i = 0; i != stack->n; ++i) { ++@@ -129,7 +129,7 @@ ++ return q; ++ } ++ ++-int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) +++int bam_rmdup_core(samFile *in, sam_hdr_t *hdr, samFile *out) ++ { ++ bam1_t *b = NULL; ++ int last_tid = -1, last_pos = -1, r; ++@@ -167,7 +167,7 @@ ++ break; ++ } ++ last_tid = c->tid; ++- fprintf(samtools_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); +++ fprintf(samtools_stderr, "[bam_rmdup_core] processing reference %s...\n", sam_hdr_tid2name(hdr, c->tid)); ++ } ++ } ++ if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { ++@@ -181,13 +181,16 @@ ++ q = lib? 
get_aux(aux, lib) : get_aux(aux, "\t"); ++ ++q->n_checked; ++ k = kh_put(pos, q->best_hash, key, &ret); +++ if (ret < 0) goto fail; ++ if (ret == 0) { // found in best_hash ++ bam1_t *p = kh_val(q->best_hash, k); ++ ++q->n_removed; ++ if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle ++ kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed ++- bam_copy1(p, b); // replaced as b +++ if (ret < 0) goto fail; +++ if (bam_copy1(p, b) == NULL) goto fail; // replaced as b ++ } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed +++ if (ret < 0) goto fail; ++ if (ret == 0) ++ fprintf(samtools_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b)); ++ } else { // not found in best_hash ++@@ -252,7 +255,7 @@ ++ return 1; ++ } ++ ++-int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); +++int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se); ++ ++ static int rmdup_usage(void) { ++ fprintf(samtools_stderr, "\n"); ++@@ -260,7 +263,7 @@ ++ fprintf(samtools_stderr, "Option: -s rmdup for SE reads\n"); ++ fprintf(samtools_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); ++ ++- sam_global_opt_help(samtools_stderr, "-....-"); +++ sam_global_opt_help(samtools_stderr, "-....--."); ++ return 1; ++ } ++ ++@@ -268,7 +271,7 @@ ++ { ++ int c, ret, is_se = 0, force_se = 0; ++ samFile *in, *out; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ char wmode[3] = {'w', 'b', 0}; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++@@ -295,7 +298,7 @@ ++ return 1; ++ } ++ header = sam_hdr_read(in); ++- if (header == NULL || header->n_targets == 0) { +++ if (header == NULL || sam_hdr_nref(header) == 0) { ++ fprintf(samtools_stderr, "[bam_rmdup] input SAM does not have header. 
Abort!\n"); ++ return 1; ++ } ++@@ -314,7 +317,7 @@ ++ if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); ++ else ret = bam_rmdup_core(in, header, out); ++ ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(in); ++ if (sam_close(out) < 0) { ++ fprintf(samtools_stderr, "[bam_rmdup] error closing output file\n"); ++--- python-pysam.orig/samtools/bam_rmdupse.c +++++ python-pysam/samtools/bam_rmdupse.c ++@@ -1,6 +1,6 @@ ++ /* bam_rmdupse.c -- duplicate read detection for unpaired reads. ++ ++- Copyright (C) 2009, 2015 Genome Research Ltd. +++ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009 Broad Institute. ++ ++ Author: Heng Li ++@@ -84,7 +84,8 @@ ++ p->discarded = 0; ++ p->endpos = endpos; p->score = score; ++ if (p->b == 0) p->b = bam_init1(); ++- bam_copy1(p->b, b); +++ if (!p->b) { perror(NULL); exit(EXIT_FAILURE); } +++ if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); } ++ return p; ++ } ++ ++@@ -96,7 +97,7 @@ ++ kh_del(best, h, k); ++ } ++ ++-static int dump_alignment(samFile *out, bam_hdr_t *hdr, +++static int dump_alignment(samFile *out, sam_hdr_t *hdr, ++ queue_t *queue, int32_t pos, khash_t(lib) *h) ++ { ++ if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { ++@@ -125,7 +126,7 @@ ++ return 0; ++ } ++ ++-int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) +++int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se) ++ { ++ bam1_t *b = NULL; ++ queue_t *queue = NULL; ++@@ -179,7 +180,9 @@ ++ kh_val(h, k) = push_queue(queue, b, endpos, score); ++ } else { // replace ++ p->score = score; p->endpos = endpos; ++- bam_copy1(p->b, b); +++ if (bam_copy1(p->b, b) == NULL) { +++ perror(NULL); exit(EXIT_FAILURE); +++ } ++ } ++ } // otherwise, discard the alignment ++ } else kh_val(h, k) = push_queue(queue, b, endpos, score); ++--- python-pysam.orig/samtools/bam_rmdupse.c.pysam.c +++++ python-pysam/samtools/bam_rmdupse.c.pysam.c ++@@ 
-2,7 +2,7 @@ ++ ++ /* bam_rmdupse.c -- duplicate read detection for unpaired reads. ++ ++- Copyright (C) 2009, 2015 Genome Research Ltd. +++ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2009 Broad Institute. ++ ++ Author: Heng Li ++@@ -86,7 +86,8 @@ ++ p->discarded = 0; ++ p->endpos = endpos; p->score = score; ++ if (p->b == 0) p->b = bam_init1(); ++- bam_copy1(p->b, b); +++ if (!p->b) { perror(NULL); exit(EXIT_FAILURE); } +++ if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); } ++ return p; ++ } ++ ++@@ -98,7 +99,7 @@ ++ kh_del(best, h, k); ++ } ++ ++-static int dump_alignment(samFile *out, bam_hdr_t *hdr, +++static int dump_alignment(samFile *out, sam_hdr_t *hdr, ++ queue_t *queue, int32_t pos, khash_t(lib) *h) ++ { ++ if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { ++@@ -127,7 +128,7 @@ ++ return 0; ++ } ++ ++-int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) +++int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se) ++ { ++ bam1_t *b = NULL; ++ queue_t *queue = NULL; ++@@ -181,7 +182,9 @@ ++ kh_val(h, k) = push_queue(queue, b, endpos, score); ++ } else { // replace ++ p->score = score; p->endpos = endpos; ++- bam_copy1(p->b, b); +++ if (bam_copy1(p->b, b) == NULL) { +++ perror(NULL); exit(EXIT_FAILURE); +++ } ++ } ++ } // otherwise, discard the alignment ++ } else kh_val(h, k) = push_queue(queue, b, endpos, score); ++--- python-pysam.orig/samtools/bam_sort.c +++++ python-pysam/samtools/bam_sort.c ++@@ -1,6 +1,6 @@ ++ /* bam_sort.c -- sorting and merging. ++ ++- Copyright (C) 2008-2016 Genome Research Ltd. +++ Copyright (C) 2008-2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. 
++ ++ Author: Heng Li ++@@ -44,6 +44,7 @@ ++ #include "htslib/klist.h" ++ #include "htslib/kstring.h" ++ #include "htslib/sam.h" +++#include "htslib/hts_endian.h" ++ #include "sam_opts.h" ++ #include "samtools.h" ++ ++@@ -55,7 +56,7 @@ ++ bam1_t *bam_record; ++ union { ++ const uint8_t *tag; ++- uint64_t pos; +++ uint8_t pos_tid[12]; ++ } u; ++ } bam1_tag; ++ ++@@ -122,12 +123,12 @@ ++ return *pa? 1 : *pb? -1 : 0; ++ } ++ ++-#define HEAP_EMPTY UINT64_MAX +++#define HEAP_EMPTY (UINT64_MAX >> 1) ++ ++ typedef struct { ++ int i; ++- uint32_t rev; ++- uint64_t pos, idx; +++ uint32_t tid; +++ uint64_t pos:63, rev:1, idx; ++ bam1_tag entry; ++ } heap1_t; ++ ++@@ -153,6 +154,7 @@ ++ fb = b.entry.bam_record->core.flag & 0xc0; ++ if (fa != fb) return fa > fb; ++ } else { +++ if (a.tid != b.tid) return a.tid > b.tid; ++ if (a.pos != b.pos) return a.pos > b.pos; ++ if (a.rev != b.rev) return a.rev > b.rev; ++ } ++@@ -164,8 +166,7 @@ ++ KSORT_INIT(heap, heap1_t, heap_lt) ++ ++ typedef struct merged_header { ++- kstring_t out_hd; ++- kstring_t out_sq; +++ sam_hdr_t *hdr; ++ kstring_t out_rg; ++ kstring_t out_pg; ++ kstring_t out_co; ++@@ -187,80 +188,6 @@ ++ bool lost_coord_sort; ++ } trans_tbl_t; ++ ++-/* Something to look like a regmatch_t */ ++-typedef struct hdr_match { ++- ptrdiff_t rm_so; ++- ptrdiff_t rm_eo; ++-} hdr_match_t; ++- ++-/* ++- * Search for header lines of a particular record type. ++- * ++- * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/ ++- * but is much quicker. The locations found are returned in *matches, ++- * which has a signature the same as that of a regmatch_t. ++- * ++- * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG) ++- * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG) ++- * ++- * The location of the record (if found) is returned in matches[0] ++- * If tag is not NULL, the record is searched for the presence of the ++- * given tag. 
If found, the location of the value is returned in matches[1]. ++- * If the tag isn't found then the record is ignored and the search resumes ++- * on the next header line. ++- * ++- * For simplicity, some assumptions are made about rec and tag: ++- * rec should include the leading '@' sign and be three characters long. ++- * tag should be exactly two characters long. ++- * These are always string constants when this is called below, so we don't ++- * bother to check here. ++- * ++- * Returns 0 if a match was found, -1 if not. ++- */ ++- ++- ++-static int hdr_line_match(const char *text, const char *rec, ++- const char *tag, hdr_match_t *matches) { ++- const char *line_start, *line_end = text; ++- const char *tag_start, *tag_end; ++- ++- for (;;) { ++- // Find record, ensure either at start of text or follows '\n' ++- line_start = strstr(line_end, rec); ++- while (line_start && line_start > text && *(line_start - 1) != '\n') { ++- line_start = strstr(line_start + 3, rec); ++- } ++- if (!line_start) return -1; ++- ++- // Find end of header line ++- line_end = strchr(line_start, '\n'); ++- if (!line_end) line_end = line_start + strlen(line_start); ++- ++- matches[0].rm_so = line_start - text; ++- matches[0].rm_eo = line_end - text; ++- if (!tag) return 0; // Match found if not looking for tag. ++- ++- for (tag_start = line_start + 3; tag_start < line_end; tag_start++) { ++- // Find possible tag start. Hacky but quick. ++- while (*tag_start > '\n') tag_start++; ++- ++- // Check it ++- if (tag_start[0] == '\t' ++- && strncmp(tag_start + 1, tag, 2) == 0 ++- && tag_start[3] == ':') { ++- // Found tag, record location and return. ++- tag_end = tag_start + 4; ++- while (*tag_end && *tag_end != '\t' && *tag_end != '\n') ++- ++tag_end; ++- matches[1].rm_so = tag_start - text + 4; ++- matches[1].rm_eo = tag_end - text; ++- return 0; ++- } ++- } ++- // Couldn't find tag, try again from end of current record. 
++- } ++-} ++- ++ static void trans_tbl_destroy(trans_tbl_t *tbl) { ++ khiter_t iter; ++ ++@@ -299,6 +226,9 @@ ++ merged_hdr = calloc(1, sizeof(*merged_hdr)); ++ if (merged_hdr == NULL) return NULL; ++ +++ merged_hdr->hdr = sam_hdr_init(); +++ if (!merged_hdr->hdr) goto fail; +++ ++ merged_hdr->targets_sz = 16; ++ merged_hdr->target_name = malloc(merged_hdr->targets_sz ++ * sizeof(*merged_hdr->target_name)); ++@@ -326,6 +256,7 @@ ++ kh_destroy(c2i, merged_hdr->sq_tids); ++ free(merged_hdr->target_name); ++ free(merged_hdr->target_len); +++ sam_hdr_destroy(merged_hdr->hdr); ++ free(merged_hdr); ++ return NULL; ++ } ++@@ -338,12 +269,6 @@ ++ return kputsn(src + from, to - from, dest) != to - from; ++ } ++ ++-// Append a header line match to kstring ++-static inline int match_to_ks(const char *src, const hdr_match_t *match, ++- kstring_t *dest) { ++- return range_to_ks(src, match->rm_so, match->rm_eo, dest); ++-} ++- ++ // Append a kstring to a kstring ++ static inline int ks_to_ks(kstring_t *src, kstring_t *dest) { ++ return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src); ++@@ -385,48 +310,32 @@ ++ */ ++ ++ static int trans_tbl_add_hd(merged_header_t* merged_hdr, ++- bam_hdr_t *translate) { ++- hdr_match_t match = {0, 0}; +++ sam_hdr_t *translate) { +++ kstring_t hd_line = { 0, 0, NULL }; +++ int res; ++ ++ // TODO: handle case when @HD needs merging. 
++ if (merged_hdr->have_hd) return 0; ++ ++- if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) { ++- return 0; +++ res = sam_hdr_find_hd(translate, &hd_line); +++ if (res < -1) { +++ print_error("merge", "failed to get @HD line from header"); +++ return -1; ++ } ++ ++- if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail; ++- if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail; ++- merged_hdr->have_hd = true; ++- ++- return 0; ++- ++- memfail: ++- perror(__func__); ++- return -1; ++-} +++ if (res < 0) // Not found +++ return 0; ++ ++-static inline int grow_target_list(merged_header_t* merged_hdr) { ++- size_t new_size; ++- char **new_names; ++- uint32_t *new_len; ++- ++- new_size = merged_hdr->targets_sz * 2; ++- new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size); ++- if (!new_names) goto fail; ++- merged_hdr->target_name = new_names; ++- ++- new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size); ++- if (!new_len) goto fail; ++- merged_hdr->target_len = new_len; +++ if (sam_hdr_add_lines(merged_hdr->hdr, hd_line.s, hd_line.l) < 0) { +++ print_error("merge", "failed to add @HD line to new header"); +++ free(hd_line.s); +++ return -1; +++ } ++ ++- merged_hdr->targets_sz = new_size; +++ free(hd_line.s); +++ merged_hdr->have_hd = true; ++ ++ return 0; ++- ++- fail: ++- perror(__func__); ++- return -1; ++ } ++ ++ /* ++@@ -444,54 +353,48 @@ ++ * Returns 0 on success, -1 on failure. 
++ */ ++ ++-static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, +++static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, ++ trans_tbl_t* tbl) { ++- ++- kstring_t *out_text = &merged_hdr->out_sq; ++- khash_t(c2i)* sq_tids = merged_hdr->sq_tids; ++- hdr_match_t *new_sq_matches = NULL; ++- char *text; ++- hdr_match_t matches[2]; ++ int32_t i; ++- int32_t old_n_targets = merged_hdr->n_targets; ++- khiter_t iter; ++- int min_tid = -1; +++ int min_tid = -1, res; +++ kstring_t sq_line = { 0, 0, NULL }, sq_sn = { 0, 0, NULL }; ++ ++ // Fill in the tid part of the translation table, adding new targets ++ // to the merged header as we go. ++ ++- for (i = 0; i < translate->n_targets; ++i) { +++ for (i = 0; i < sam_hdr_nref(translate); ++i) { +++ int trans_tid; +++ sq_sn.l = 0; +++ res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); +++ if (res < 0) { +++ print_error("merge", "failed to get @SQ SN #%d from header", i + 1); +++ goto fail; +++ } ++ ++- // Check if it's a new target. 
++- iter = kh_get(c2i, sq_tids, translate->target_name[i]); +++ trans_tid = sam_hdr_name2tid(merged_hdr->hdr, sq_sn.s); +++ if (trans_tid < -1) { +++ print_error("merge", "failed to lookup ref"); +++ goto fail; +++ } ++ ++- if (iter == kh_end(sq_tids)) { ++- int ret; +++ if (trans_tid < 0) { ++ // Append missing entries to out_hdr ++- ++- if (merged_hdr->n_targets == merged_hdr->targets_sz) { ++- if (grow_target_list(merged_hdr)) goto fail; +++ sq_line.l = 0; +++ res = sam_hdr_find_line_id(translate, "SQ", "SN", sq_sn.s, &sq_line); +++ if (res < 0) { +++ print_error("merge", "failed to get @SQ SN:%s from header", sq_sn.s); +++ goto fail; ++ } ++ ++- merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]); ++- if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail; ++- merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i]; ++- ++- // Record the new identifier for reference below, ++- // and when building the ttable for other inputs. ++- iter = kh_put(c2i, sq_tids, ++- merged_hdr->target_name[merged_hdr->n_targets], &ret); ++- if (ret < 0) { ++- free(merged_hdr->target_name[merged_hdr->n_targets]); ++- goto memfail; ++- } ++- assert(ret > 0); // Should not be in hash already. +++ trans_tid = sam_hdr_nref(merged_hdr->hdr); ++ ++- kh_value(sq_tids, iter) = merged_hdr->n_targets; ++- tbl->tid_trans[i] = merged_hdr->n_targets++; ++- } else { ++- tbl->tid_trans[i] = kh_value(sq_tids, iter); +++ res = sam_hdr_add_lines(merged_hdr->hdr, sq_line.s, sq_line.l); +++ if (res < 0) { +++ print_error("merge", "failed to add @SQ SN:%s to new header", sq_sn.s); +++ goto fail; +++ } ++ } +++ tbl->tid_trans[i] = trans_tid; ++ ++ if (tbl->tid_trans[i] > min_tid) { ++ min_tid = tbl->tid_trans[i]; ++@@ -500,78 +403,14 @@ ++ } ++ } ++ ++- if (merged_hdr->n_targets == old_n_targets) ++- return 0; // Everything done if no new targets. ++- ++- // Otherwise, find @SQ lines in translate->text for all newly added targets. 
++- ++- new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets) ++- * sizeof(*new_sq_matches)); ++- if (new_sq_matches == NULL) goto memfail; ++- ++- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { ++- new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1; ++- } ++- ++- text = translate->text; ++- while (hdr_line_match(text, "@SQ", "SN", matches) == 0) { ++- // matches[0] is whole line, matches[1] is SN value. ++- ++- // This is a bit disgusting, but avoids a copy... ++- char c = text[matches[1].rm_eo]; ++- int idx; ++- ++- text[matches[1].rm_eo] = '\0'; ++- ++- // Look up the SN value in the sq_tids hash. ++- iter = kh_get(c2i, sq_tids, text + matches[1].rm_so); ++- text[matches[1].rm_eo] = c; // restore text ++- ++- if (iter == kh_end(sq_tids)) { ++- // Warn about this, but it's not really fatal. ++- fprintf(stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", ++- __func__, ++- (int) (matches[1].rm_eo - matches[1].rm_so), ++- text + matches[1].rm_so); ++- text += matches[0].rm_eo; ++- continue; // Skip to next ++- } ++- ++- idx = kh_value(sq_tids, iter); ++- if (idx >= old_n_targets) { ++- // is a new SQ, so record position so we can add it to out_text. ++- assert(idx < merged_hdr->n_targets); ++- ptrdiff_t off = text - translate->text; ++- new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off; ++- new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off; ++- } ++- ++- // Carry on searching from end of current match ++- text += matches[0].rm_eo; ++- } ++- ++- // Copy the @SQ headers found and recreate any missing from binary header. 
++- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { ++- if (new_sq_matches[i].rm_so >= 0) { ++- if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) ++- goto memfail; ++- if (kputc('\n', out_text) == EOF) goto memfail; ++- } else { ++- if (kputs("@SQ\tSN:", out_text) == EOF || ++- kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || ++- kputs("\tLN:", out_text) == EOF || ++- kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || ++- kputc('\n', out_text) == EOF) goto memfail; ++- } ++- } +++ free(sq_line.s); +++ free(sq_sn.s); ++ ++- free(new_sq_matches); ++ return 0; ++ ++- memfail: ++- perror(__func__); ++ fail: ++- free(new_sq_matches); +++ free(sq_line.s); +++ free(sq_sn.s); ++ return -1; ++ } ++ ++@@ -592,29 +431,30 @@ ++ * ++ */ ++ ++-static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate, +++static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, ++ bool merge, khash_t(cset)* known_ids, ++ khash_t(c2c)* id_map, char *override) { ++- hdr_match_t matches[2]; ++ khiter_t iter; ++- const char *text = translate->text; ++- const char *rec_type = is_rg ? "@RG" : "@PG"; +++ int num_ids, i; +++ const char *rec_type = is_rg ? "RG" : "PG"; ++ klist_t(hdrln) *hdr_lines; ++ ++ hdr_lines = kl_init(hdrln); ++ ++ // Search through translate's header ++- while (hdr_line_match(text, rec_type, "ID", matches) == 0) { ++- // matches[0] is the whole @RG/PG line; matches[1] is the ID field value +++ num_ids = sam_hdr_count_lines(translate, rec_type); +++ if (num_ids < 0) +++ goto fail; ++ +++ for (i = 0; i < num_ids; i++) { ++ kstring_t orig_id = { 0, 0, NULL }; // ID in original header ++ kstring_t transformed_id = { 0, 0, NULL }; // ID in output header ++ char *map_value; // Value to store in id_map ++ bool id_changed; // Have we changed the ID? ++ bool not_found_in_output; // ID isn't in the output header (yet) ++ ++- // Take a copy of the ID as we'll need it for a hash key. 
++- if (match_to_ks(text, &matches[1], &orig_id)) goto memfail; +++ if (sam_hdr_find_tag_pos(translate, rec_type, i, "ID", &orig_id) < 0) +++ goto fail; ++ ++ // is our matched ID in our output ID set already? ++ iter = kh_get(cset, known_ids, ks_str(&orig_id)); ++@@ -651,18 +491,38 @@ ++ ++ // Does this line need to go into our output header? ++ if (not_found_in_output) { ++- ++ // Take matched line and replace ID with transformed_id ++ kstring_t new_hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(translate, rec_type, +++ "ID", ks_str(&orig_id), &new_hdr_line) < 0){ +++ goto fail; +++ } +++ +++ if (id_changed) { +++ char *idp = strstr(ks_str(&new_hdr_line), "\tID:"), *id_end; +++ ptrdiff_t id_offset, id_len; +++ if (!idp) { +++ print_error("merge", "failed to find ID in \"%s\"\n", +++ ks_str(&new_hdr_line)); +++ goto fail; +++ } +++ idp += 4; +++ for (id_end = idp; *id_end >= '\n'; id_end++) {} +++ +++ id_offset = idp - new_hdr_line.s; +++ id_len = id_end - idp; ++ ++- if (!id_changed) { // Can just copy ++- if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail; ++- } else { // Substitute new name for original ++- if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so, ++- &new_hdr_line)) goto memfail; ++- if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail; ++- if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo, ++- &new_hdr_line)) goto memfail; +++ if (id_len < transformed_id.l) { +++ if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) +++ goto fail; +++ } +++ if (id_len != transformed_id.l) { +++ memmove(new_hdr_line.s + id_offset + transformed_id.l, +++ new_hdr_line.s + id_offset + id_len, +++ new_hdr_line.l - id_offset - id_len + 1); +++ } +++ memcpy(new_hdr_line.s + id_offset, transformed_id.s, +++ transformed_id.l); ++ } ++ ++ // append line to output linked list ++@@ -686,8 +546,6 @@ ++ int in_there = 0; ++ iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there); ++ kh_value(id_map, iter) = 
map_value; ++- ++- text += matches[0].rm_eo; // next! ++ } ++ ++ // If there are no RG lines in the file and we are overriding add one ++@@ -724,6 +582,7 @@ ++ ++ memfail: ++ perror(__func__); +++ fail: ++ if (hdr_lines) kl_destroy(hdrln, hdr_lines); ++ return NULL; ++ } ++@@ -821,16 +680,18 @@ ++ * Returns 0 on success, -1 on failure. ++ */ ++ ++-static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, +++static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, ++ trans_tbl_t* tbl, bool merge_rg, bool merge_pg, ++ bool copy_co, char* rg_override) ++ { +++ kstring_t lines = { 0, 0, NULL }; ++ klist_t(hdrln) *rg_list = NULL; ++ klist_t(hdrln) *pg_list = NULL; ++ ++- tbl->n_targets = translate->n_targets; +++ tbl->n_targets = sam_hdr_nref(translate); ++ tbl->rg_trans = tbl->pg_trans = NULL; ++- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); +++ tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, +++ sizeof(int)); ++ if (tbl->tid_trans == NULL) goto memfail; ++ tbl->rg_trans = kh_init(c2c); ++ if (tbl->rg_trans == NULL) goto memfail; ++@@ -859,6 +720,7 @@ ++ goto fail; ++ ++ // Fix-up PP: tags in the new @PG records and add to output +++ lines.l = 0; ++ if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg)) ++ goto fail; ++ ++@@ -867,22 +729,22 @@ ++ ++ if (copy_co) { ++ // Just append @CO headers without translation ++- const char *line, *end_pointer; ++- for (line = translate->text; *line; line = end_pointer + 1) { ++- end_pointer = strchr(line, '\n'); ++- if (strncmp(line, "@CO", 3) == 0) { ++- if (end_pointer) { ++- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) ++- goto memfail; ++- } else { // Last line with no trailing '\n' ++- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; ++- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; ++- } ++- } ++- if (end_pointer == NULL) break; +++ int num_co = sam_hdr_count_lines(translate, "CO"), i; 
+++ if (num_co < 0) +++ goto fail; +++ +++ for (i = 0; i < num_co; i++) { +++ if (sam_hdr_find_line_pos(translate, "CO", i, &lines) < 0) +++ goto fail; +++ if (ks_to_ks(&lines, &merged_hdr->out_co)) +++ goto fail; +++ if (kputc('\n', &merged_hdr->out_co) < 0) +++ goto fail; ++ } ++ } ++ +++ free(lines.s); +++ ++ return 0; ++ ++ memfail: ++@@ -891,80 +753,22 @@ ++ trans_tbl_destroy(tbl); ++ if (rg_list) kl_destroy(hdrln, rg_list); ++ if (pg_list) kl_destroy(hdrln, pg_list); +++ free(lines.s); ++ return -1; ++ } ++ ++-static inline void move_kstr_to_text(char **text, kstring_t *ks) { ++- memcpy(*text, ks_str(ks), ks_len(ks)); ++- *text += ks_len(ks); ++- **text = '\0'; ++- free(ks_release(ks)); ++-} ++- ++-/* ++- * Populate a bam_hdr_t struct from data in a merged_header_t. ++- */ ++- ++-static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { ++- size_t txt_sz; ++- char *text; ++- bam_hdr_t *hdr; ++- ++- // Check output text size ++- txt_sz = (ks_len(&merged_hdr->out_hd) ++- + ks_len(&merged_hdr->out_sq) ++- + ks_len(&merged_hdr->out_rg) ++- + ks_len(&merged_hdr->out_pg) ++- + ks_len(&merged_hdr->out_co)); ++- if (txt_sz >= INT32_MAX) { ++- fprintf(stderr, "[%s] Output header text too long\n", __func__); ++- return NULL; ++- } ++- ++- // Allocate new header ++- hdr = bam_hdr_init(); ++- if (hdr == NULL) goto memfail; ++- ++- // Transfer targets arrays to new header ++- hdr->n_targets = merged_hdr->n_targets; ++- if (hdr->n_targets > 0) { ++- // Try to shrink targets arrays to correct size ++- hdr->target_name = realloc(merged_hdr->target_name, ++- hdr->n_targets * sizeof(char*)); ++- if (!hdr->target_name) hdr->target_name = merged_hdr->target_name; ++- ++- hdr->target_len = realloc(merged_hdr->target_len, ++- hdr->n_targets * sizeof(uint32_t)); ++- if (!hdr->target_len) hdr->target_len = merged_hdr->target_len; ++- ++- // These have either been freed by realloc() or, in the unlikely ++- // event that failed, have had their ownership transferred to hdr 
++- merged_hdr->target_name = NULL; ++- merged_hdr->target_len = NULL; ++- } ++- else { ++- hdr->target_name = NULL; ++- hdr->target_len = NULL; ++- } ++- ++- // Allocate text ++- text = hdr->text = malloc(txt_sz + 1); ++- if (!text) goto memfail; ++- ++- // Put header text in order @HD, @SQ, @RG, @PG, @CO ++- move_kstr_to_text(&text, &merged_hdr->out_hd); ++- move_kstr_to_text(&text, &merged_hdr->out_sq); ++- move_kstr_to_text(&text, &merged_hdr->out_rg); ++- move_kstr_to_text(&text, &merged_hdr->out_pg); ++- move_kstr_to_text(&text, &merged_hdr->out_co); ++- hdr->l_text = txt_sz; ++- ++- return hdr; +++static int finish_merged_header(merged_header_t *merged_hdr) { +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), +++ ks_len(&merged_hdr->out_rg)) < 0) +++ return -1; +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_pg), +++ ks_len(&merged_hdr->out_pg)) < 0) +++ return -1; +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_co), +++ ks_len(&merged_hdr->out_co)) < 0) +++ return -1; ++ ++- memfail: ++- perror(__func__); ++- bam_hdr_destroy(hdr); ++- return NULL; +++ return 0; ++ } ++ ++ /* ++@@ -979,8 +783,6 @@ ++ size_t i; ++ khiter_t iter; ++ if (!merged_hdr) return; ++- free(ks_release(&merged_hdr->out_hd)); ++- free(ks_release(&merged_hdr->out_sq)); ++ free(ks_release(&merged_hdr->out_rg)); ++ free(ks_release(&merged_hdr->out_pg)); ++ free(ks_release(&merged_hdr->out_co)); ++@@ -1147,25 +949,30 @@ ++ @param cmd command name (used in print_error() etc) ++ @param in_fmt format options for input files ++ @param out_fmt output file format and options +++ @param write_index create the index, together with the output file +++ @param arg_list command string for PG line +++ @param no_pg if 1, do not add a new PG line ++ @discussion Padding information may NOT correctly maintained. This ++ function is NOT thread safe. 
++ */ ++ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, ++- const char *headers, int n, char * const *fn, int flag, ++- const char *reg, int n_threads, const char *cmd, ++- const htsFormat *in_fmt, const htsFormat *out_fmt) +++ const char *headers, int n, char * const *fn, char * const *fn_idx, +++ int flag, const char *reg, int n_threads, const char *cmd, +++ const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, +++ char *arg_list, int no_pg) ++ { ++ samFile *fpout, **fp = NULL; ++ heap1_t *heap = NULL; ++- bam_hdr_t *hout = NULL; ++- bam_hdr_t *hin = NULL; +++ sam_hdr_t *hout = NULL; +++ sam_hdr_t *hin = NULL; ++ int i, j, *RG_len = NULL; ++ uint64_t idx = 0; ++ char **RG = NULL; ++ hts_itr_t **iter = NULL; ++- bam_hdr_t **hdr = NULL; +++ sam_hdr_t **hdr = NULL; ++ trans_tbl_t *translation_tbl = NULL; ++ int *rtrans = NULL; +++ char *out_idx_fn = NULL; ++ merged_header_t *merged_hdr = init_merged_header(); ++ if (!merged_hdr) return -1; ++ ++@@ -1188,7 +995,7 @@ ++ if (sort_tag) { ++ g_is_by_tag = 1; ++ g_sort_tag[0] = sort_tag[0]; ++- g_sort_tag[1] = sort_tag[1]; +++ g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; ++ } ++ ++ fp = (samFile**)calloc(n, sizeof(samFile*)); ++@@ -1197,7 +1004,7 @@ ++ if (!heap) goto mem_fail; ++ iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); ++ if (!iter) goto mem_fail; ++- hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); +++ hdr = (sam_hdr_t**)calloc(n, sizeof(sam_hdr_t*)); ++ if (!hdr) goto mem_fail; ++ translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); ++ if (!translation_tbl) goto mem_fail; ++@@ -1234,7 +1041,7 @@ ++ ++ // open and read the header from each file ++ for (i = 0; i < n; ++i) { ++- bam_hdr_t *hin; +++ sam_hdr_t *hin; ++ fp[i] = sam_open_format(fn[i], "r", in_fmt); ++ if (fp[i] == NULL) { ++ print_error_errno(cmd, "fail to open \"%s\"", fn[i]); ++@@ -1255,7 +1062,7 @@ ++ // TODO sam_itr_next() doesn't yet work for SAM files, ++ // so for those keep the headers around for use with sam_read1() ++ if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; ++- else { bam_hdr_destroy(hin); hdr[i] = NULL; } +++ else { sam_hdr_destroy(hin); hdr[i] = NULL; } ++ ++ if ((translation_tbl+i)->lost_coord_sort && !by_qname) { ++ fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); ++@@ -1284,41 +1091,34 @@ ++ } ++ ++ // Transform the header into standard form ++- hout = finish_merged_header(merged_hdr); +++ if (finish_merged_header(merged_hdr) < 0) +++ goto fail; +++ +++ hout = merged_hdr->hdr; ++ if (!hout) return -1; // FIXME: memory leak ++ ++ // If we're only merging a specified region move our iters to start at that point ++ if (reg) { ++- int tid, beg, end; ++- const char *name_lim; +++ int tid; +++ hts_pos_t beg, end; ++ ++- rtrans = rtrans_build(n, hout->n_targets, translation_tbl); +++ rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); ++ if (!rtrans) goto mem_fail; ++ ++- name_lim = hts_parse_reg(reg, &beg, &end); ++- if (name_lim) { ++- char *name = malloc(name_lim - reg + 1); ++- if (!name) goto mem_fail; ++- memcpy(name, reg, 
name_lim - reg); ++- name[name_lim - reg] = '\0'; ++- tid = bam_name2id(hout, name); ++- free(name); ++- } ++- else { ++- // not parsable as a region, but possibly a sequence named "foo:a" ++- tid = bam_name2id(hout, reg); ++- beg = 0; ++- end = INT_MAX; ++- } ++- if (tid < 0) { ++- if (name_lim) fprintf(stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); ++- else fprintf(stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); +++ if (!sam_parse_region(hout, reg, &tid, &beg, &end, 0)) { +++ fprintf(stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); ++ goto fail; ++ } ++ for (i = 0; i < n; ++i) { ++- hts_idx_t *idx = sam_index_load(fp[i], fn[i]); +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx != NULL) { +++ idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); +++ } else { +++ idx = sam_index_load(fp[i], fn[i]); +++ } ++ // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space ++- int mapped_tid = rtrans[i*hout->n_targets+tid]; +++ int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; ++ if (idx == NULL) { ++ fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", ++ __func__, fn[i]); ++@@ -1334,7 +1134,7 @@ ++ if (mapped_tid != INT32_MIN) { ++ fprintf(stderr, ++ "[%s] failed to get iterator over " ++- "{%s, %d, %d, %d}\n", +++ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", ++ __func__, fn[i], mapped_tid, beg, end); ++ } else { ++ fprintf(stderr, ++@@ -1371,7 +1171,8 @@ ++ res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); ++ if (res >= 0) { ++ bam_translate(h->entry.bam_record, translation_tbl + i); ++- h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1); +++ h->tid = h->entry.bam_record->core.tid; +++ h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); ++ h->rev = bam_is_rev(h->entry.bam_record); ++ h->idx = idx++; ++ if (g_is_by_tag) { ++@@ -1396,11 +1197,26 @@ ++ print_error_errno(cmd, "failed to create \"%s\"", out); ++ return -1; ++ } +++ if (!no_pg && sam_hdr_add_pg(hout, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) { +++ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); +++ sam_close(fpout); +++ return -1; +++ } ++ if (sam_hdr_write(fpout, hout) != 0) { ++ print_error_errno(cmd, "failed to write header to \"%s\"", out); ++ sam_close(fpout); ++ return -1; ++ } +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fpout, out, hout))){ +++ sam_close(fpout); +++ return -1; +++ } +++ } ++ if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); ++ ++ // Begin the actual merge ++@@ -1415,11 +1231,13 @@ ++ if (sam_write1(fpout, hout, b) < 0) { ++ print_error_errno(cmd, "failed writing to \"%s\"", out); ++ sam_close(fpout); +++ free(out_idx_fn); ++ return -1; ++ } ++ if ((j = (iter[heap->i]? 
sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { ++ bam_translate(b, translation_tbl + heap->i); ++- heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1); +++ heap->tid = b->core.tid; +++ heap->pos = (uint64_t)(b->core.pos + 1); ++ heap->rev = bam_is_rev(b); ++ heap->idx = idx++; ++ if (g_is_by_tag) { ++@@ -1439,6 +1257,14 @@ ++ ks_heapadjust(heap, 0, n, heap); ++ } ++ +++ if (write_index) { +++ if (sam_idx_save(fpout) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ } +++ free(out_idx_fn); +++ ++ // Clean up and close ++ if (flag & MERGE_RG) { ++ for (i = 0; i != n; ++i) free(RG[i]); ++@@ -1447,11 +1273,11 @@ ++ for (i = 0; i < n; ++i) { ++ trans_tbl_destroy(translation_tbl + i); ++ hts_itr_destroy(iter[i]); ++- bam_hdr_destroy(hdr[i]); +++ sam_hdr_destroy(hdr[i]); ++ sam_close(fp[i]); ++ } ++- bam_hdr_destroy(hin); ++- bam_hdr_destroy(hout); +++ sam_hdr_destroy(hin); +++ sam_hdr_destroy(hout); ++ free_merged_header(merged_hdr); ++ free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); ++ if (sam_close(fpout) < 0) { ++@@ -1473,11 +1299,11 @@ ++ for (i = 0; i < n; ++i) { ++ if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); ++ if (iter && iter[i]) hts_itr_destroy(iter[i]); ++- if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); +++ if (hdr && hdr[i]) sam_hdr_destroy(hdr[i]); ++ if (fp && fp[i]) sam_close(fp[i]); ++ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); ++ } ++- if (hout) bam_hdr_destroy(hout); +++ if (hout) sam_hdr_destroy(hout); ++ free(RG); ++ free(translation_tbl); ++ free(hdr); ++@@ -1485,6 +1311,7 @@ ++ free(heap); ++ free(fp); ++ free(rtrans); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++@@ -1495,7 +1322,7 @@ ++ strcpy(mode, "wb"); ++ if (flag & MERGE_UNCOMP) strcat(mode, "0"); ++ else if (flag & MERGE_LEVEL1) strcat(mode, "1"); ++- return 
bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL); +++ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); ++ } ++ ++ static void merge_usage(FILE *to) ++@@ -1516,23 +1343,27 @@ ++ " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" ++ " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" ++ " -s VALUE Override random seed\n" ++-" -b FILE List of input BAM filenames, one per line [null]\n"); ++- sam_global_opt_help(to, "-.O..@"); +++" -b FILE List of input BAM filenames, one per line [null]\n" +++" -X Use customized index files\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(to, "-.O..@.."); ++ } ++ ++ int bam_merge(int argc, char *argv[]) ++ { ++- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; +++ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; ++ char *fn_headers = NULL, *reg = NULL, mode[12]; ++- char *sort_tag = NULL; +++ char *sort_tag = NULL, *arg_list = NULL; ++ long random_seed = (long)time(NULL); ++ char** fn = NULL; ++- int fn_size = 0; +++ char** fn_idx = NULL; +++ int fn_size = 0, no_pg = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), ++ { "threads", required_argument, NULL, '@' }, +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -1541,13 +1372,13 @@ ++ return 0; ++ } ++ ++- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'r': flag |= MERGE_RG; break; ++ case 'f': flag |= MERGE_FORCE; break; ++- case 'h': fn_headers = strdup(optarg); break; +++ case 'h': fn_headers = optarg; break; ++ case 'n': is_by_qname = 1; break; ++- case 't': sort_tag = strdup(optarg); break; +++ case 't': 
sort_tag = optarg; break; ++ case '1': flag |= MERGE_LEVEL1; level = 1; break; ++ case 'u': flag |= MERGE_UNCOMP; level = 0; break; ++ case 'R': reg = strdup(optarg); break; ++@@ -1555,8 +1386,13 @@ ++ case 'c': flag |= MERGE_COMBINE_RG; break; ++ case 'p': flag |= MERGE_COMBINE_PG; break; ++ case 's': random_seed = atol(optarg); break; +++ case 'X': has_index_file = 1; break; // -X flag for index filename ++ case 'b': { ++ // load the list of files to read +++ if (has_index_file) { +++ fprintf(stderr,"Error: The -b option cannot be combined with -X\n"); +++ ret = 1; goto end; +++ } ++ int nfiles; ++ char **fn_read = hts_readlines(optarg, &nfiles); ++ if (fn_read) { ++@@ -1573,7 +1409,7 @@ ++ } ++ break; ++ } ++- +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': merge_usage(stderr); return 1; ++@@ -1585,6 +1421,11 @@ ++ return 1; ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("merge", "failed to create arg_list"); +++ return 1; +++ } +++ ++ srand48(random_seed); ++ if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { ++ FILE *fp = fopen(argv[optind], "rb"); ++@@ -1595,24 +1436,41 @@ ++ } ++ } ++ ++- int nargcfiles = argc - (optind+1); +++ int nargcfiles = 0; +++ if (has_index_file) { // Calculate # of input BAM files +++ if ((argc - optind - 1) % 2 != 0) { +++ fprintf(stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); +++ return 1; +++ } +++ nargcfiles = (argc - optind - 1) / 2; +++ } else { +++ nargcfiles = argc - optind - 1; +++ } +++ ++ if (nargcfiles > 0) { ++ // Add argc files to end of array ++ fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); ++ if (fn == NULL) { ret = 1; goto end; } ++ memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); +++ +++ if(has_index_file) { +++ fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); +++ if (fn_idx == NULL) { ret = 1; goto end; } +++ memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); +++ } ++ } ++ if (fn_size+nargcfiles < 1) { ++ print_error("merge", "You must specify at least one (and usually two or more) input files"); ++ merge_usage(stderr); +++ free(fn_idx); ++ return 1; ++ } ++ strcpy(mode, "wb"); ++ sam_open_mode(mode+1, argv[optind], NULL); ++ if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); ++ if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, ++- fn_size+nargcfiles, fn, flag, reg, ga.nthreads, ++- "merge", &ga.in, &ga.out) < 0) +++ fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, +++ "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ++ ret = 1; ++ ++ end: ++@@ -1621,8 +1479,9 @@ ++ for (i=0; ii, res; ++ if (i < nfiles) { // read from file ++ res = sam_read1(fp[i], hout, heap->entry.bam_record); ++@@ -1655,8 +1514,8 @@ ++ } ++ } ++ if (res >= 0) { ++- heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) ++- | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)); +++ heap->tid = heap->entry.bam_record->core.tid; +++ heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); ++ heap->rev = bam_is_rev(heap->entry.bam_record); ++ heap->idx = (*idx)++; ++ if (g_is_by_tag) { ++@@ -1676,21 +1535,23 @@ ++ } ++ ++ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ++- const char *mode, bam_hdr_t *hout, +++ const char *mode, 
sam_hdr_t *hout, ++ int n, char * const *fn, int num_in_mem, ++ buf_region *in_mem, bam1_tag *buf, int n_threads, ++ const char *cmd, const htsFormat *in_fmt, ++- const htsFormat *out_fmt) { +++ const htsFormat *out_fmt, char *arg_list, int no_pg, +++ int write_index) { ++ samFile *fpout = NULL, **fp = NULL; ++ heap1_t *heap = NULL; ++ uint64_t idx = 0; ++ int i, heap_size = n + num_in_mem; +++ char *out_idx_fn = NULL; ++ ++ g_is_by_qname = by_qname; ++ if (sort_tag) { ++ g_is_by_tag = 1; ++ g_sort_tag[0] = sort_tag[0]; ++- g_sort_tag[1] = sort_tag[1]; +++ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0'; ++ } ++ if (n > 0) { ++ fp = (samFile**)calloc(n, sizeof(samFile*)); ++@@ -1701,7 +1562,7 @@ ++ ++ // Open each file, read the header and put the first read into the heap ++ for (i = 0; i < heap_size; i++) { ++- bam_hdr_t *hin; +++ sam_hdr_t *hin; ++ heap1_t *h = &heap[i]; ++ ++ if (i < n) { ++@@ -1718,7 +1579,7 @@ ++ goto fail; ++ } ++ // ... and throw it away as we don't really need it ++- bam_hdr_destroy(hin); +++ sam_hdr_destroy(hin); ++ } ++ ++ // Get a read into the heap ++@@ -1741,6 +1602,16 @@ ++ return -1; ++ } ++ +++ if (!no_pg && sam_hdr_add_pg(hout, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); +++ sam_close(fpout); +++ return -1; +++ } +++ ++ if (n_threads > 1) hts_set_threads(fpout, n_threads); ++ ++ if (sam_hdr_write(fpout, hout) != 0) { ++@@ -1749,14 +1620,20 @@ ++ return -1; ++ } ++ +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fpout, out, hout))){ +++ sam_close(fpout); +++ return -1; +++ } +++ } +++ ++ // Now do the merge ++ ks_heapmake(heap, heap_size, heap); ++ while (heap->pos != HEAP_EMPTY) { ++ bam1_t *b = heap->entry.bam_record; ++ if (sam_write1(fpout, hout, b) < 0) { ++ print_error_errno(cmd, "failed writing to \"%s\"", out); ++- sam_close(fpout); ++- return -1; +++ goto fail; ++ } ++ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { ++ assert(heap->i < n); ++@@ -1775,6 +1652,15 @@ ++ } ++ free(fp); ++ free(heap); +++ +++ if (write_index) { +++ if (sam_idx_save(fpout) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ free(out_idx_fn); +++ } +++ ++ if (sam_close(fpout) < 0) { ++ print_error(cmd, "error closing output file"); ++ return -1; ++@@ -1786,11 +1672,15 @@ ++ fail: ++ for (i = 0; i < n; i++) { ++ if (fp && fp[i]) sam_close(fp[i]); ++- if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); +++ } +++ for (i = 0; i < heap_size; i++) { +++ if (heap && heap[i].i < n && heap[i].entry.bam_record) +++ bam_destroy1(heap[i].entry.bam_record); ++ } ++ free(fp); ++ free(heap); ++ if (fpout) sam_close(fpout); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++@@ -1811,8 +1701,13 @@ ++ if (t != 0) return t; ++ return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); ++ } else { ++- pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1); ++- pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1); +++ pa = a.bam_record->core.tid; +++ pb = b.bam_record->core.tid; +++ +++ if (pa == pb) { +++ pa = 
(uint64_t)(a.bam_record->core.pos+1); +++ pb = (uint64_t)(b.bam_record->core.pos+1); +++ } ++ ++ if (pa == pb) { ++ pa = bam_is_rev(a.bam_record); ++@@ -1913,7 +1808,7 @@ ++ size_t buf_len; ++ const char *prefix; ++ bam1_tag *buf; ++- const bam_hdr_t *h; +++ const sam_hdr_t *h; ++ int index; ++ int error; ++ int no_save; ++@@ -1921,45 +1816,99 @@ ++ ++ // Returns 0 for success ++ // -1 for failure ++-static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +++static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, +++ const sam_hdr_t *h, int n_threads, const htsFormat *fmt, +++ char *arg_list, int no_pg, int write_index) ++ { ++ size_t i; ++ samFile* fp; +++ char *out_idx_fn = NULL; +++ ++ fp = sam_open_format(fn, mode, fmt); ++ if (fp == NULL) return -1; ++- if (sam_hdr_write(fp, h) != 0) goto fail; +++ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ goto fail; +++ } +++ if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; +++ +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; +++ } +++ ++ if (n_threads > 1) hts_set_threads(fp, n_threads); ++ for (i = 0; i < l; ++i) { ++- if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; +++ if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; ++ } +++ +++ if (write_index) { +++ if (sam_idx_save(fp) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ free(out_idx_fn); +++ } +++ +++ ++ if (sam_close(fp) < 0) return -1; ++ return 0; ++ fail: ++ sam_close(fp); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++ #define NUMBASE 256 ++-#define STEP 8 ++ ++-static int ks_radixsort(size_t n, bam1_tag *buf, const bam_hdr_t *h) +++static int ks_radixsort(size_t n, bam1_tag *buf, const sam_hdr_t *h) ++ { ++ int curr = 0, ret = -1; ++ ssize_t i; ++ bam1_tag *buf_ar2[2], *bam_a, *bam_b; ++- uint64_t max_pos = 0, max_digit = 0, shift = 0; ++- +++ uint64_t max_pos = 1; +++ uint32_t max_tid = 1, tid_bytes = 0, pos_bytes = 0, byte = 0; +++ uint32_t tid_shift_l, tid_shift_r; +++ int nref = sam_hdr_nref(h); +++ +++ // Count number of bytes needed for biggest tid and pos +++ // Notes: Add 1 to core.pos so always positive. +++ // Convert unmapped tid (-1) to number of references so unmapped +++ // sort to the end. ++ for (i = 0; i < n; i++) { ++ bam1_t *b = buf[i].bam_record; ++- int32_t tid = b->core.tid == -1 ? h->n_targets : b->core.tid; ++- buf[i].u.pos = (uint64_t)tid<<32 | (b->core.pos+1)<<1 | bam_is_rev(b); ++- if (max_pos < buf[i].u.pos) ++- max_pos = buf[i].u.pos; ++- } ++- ++- while (max_pos) { ++- ++max_digit; ++- max_pos = max_pos >> 1; +++ uint32_t tid = b->core.tid == -1 ? 
nref : b->core.tid; +++ uint64_t pos = ((uint64_t)(b->core.pos + 1) << 1) | bam_is_rev(b); +++ if (max_tid < tid) +++ max_tid = tid; +++ if (max_pos < pos) +++ max_pos = pos; +++ } +++ +++ for (; max_pos > 0; max_pos >>= 8) pos_bytes++; +++ for (; max_tid > 0; max_tid >>= 8) tid_bytes++; +++ assert(pos_bytes + tid_bytes < sizeof(buf[0].u.pos_tid)); +++ +++ tid_shift_l = pos_bytes * 8; +++ tid_shift_r = 64 - tid_shift_l; +++ +++ // Write position and tid into bam1_tag::u::pos_tid using minimum number +++ // of bytes required. Values are stored little-endian so that we +++ // get a least-significant digit (byte) radix sort. +++ for (i = 0; i < n; i++) { +++ bam1_t *b = buf[i].bam_record; +++ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; +++ // 'pos' here includes as many bytes of tid as will fit +++ // in the space remaining above pos_bytes. The rest of tid +++ // is written out separately. +++ uint64_t pos = (bam_is_rev(b) | +++ ((uint64_t)(b->core.pos + 1) << 1) | +++ (tid_shift_l < 64 ? (uint64_t) tid << tid_shift_l : 0)); +++ u64_to_le(pos, buf[i].u.pos_tid); +++ u32_to_le(tid_shift_r < 32 ? 
tid >> tid_shift_r : 0, +++ &buf[i].u.pos_tid[8]); ++ } ++ ++ buf_ar2[0] = buf; ++@@ -1969,18 +1918,18 @@ ++ goto err; ++ } ++ ++- while (shift < max_digit){ +++ // Least-significant digit radix sort (where "digits" are bytes) +++ for (byte = 0; byte < pos_bytes + tid_bytes; byte++) { ++ size_t remainders[NUMBASE] = { 0 }; ++ bam_a = buf_ar2[curr]; bam_b = buf_ar2[1-curr]; ++ for (i = 0; i < n; ++i) ++- remainders[(bam_a[i].u.pos >> shift) % NUMBASE]++; +++ remainders[bam_a[i].u.pos_tid[byte]]++; ++ for (i = 1; i < NUMBASE; ++i) ++ remainders[i] += remainders[i - 1]; ++ for (i = n - 1; i >= 0; i--) { ++- size_t j = --remainders[(bam_a[i].u.pos >> shift) % NUMBASE]; +++ size_t j = --remainders[bam_a[i].u.pos_tid[byte]]; ++ bam_b[j] = bam_a[i]; ++ } ++- shift += STEP; ++ curr = 1 - curr; ++ } ++ if (curr == 1) { ++@@ -2034,10 +1983,10 @@ ++ return 0; ++ } ++ ++- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) +++ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) ++ w->error = errno; ++ } else { ++- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) +++ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) ++ w->error = errno; ++ } ++ ++@@ -2046,7 +1995,7 @@ ++ } ++ ++ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, ++- const bam_hdr_t *h, int n_threads, buf_region *in_mem) +++ const sam_hdr_t *h, int n_threads, buf_region *in_mem) ++ { ++ int i; ++ size_t pos, rest; ++@@ -2107,6 +2056,9 @@ ++ @param max_mem approxiate maximum memory (very inaccurate) ++ @param in_fmt input file format options ++ @param out_fmt output file format and options +++ @param arg_list command string for PG line +++ @param no_pg if 1, do not add a new PG line +++ @paran write_index create index for the output file ++ @return 0 for successful sorting, negative on errors ++ ++ @discussion It may create multiple temporary subalignment files ++@@ -2116,11 
+2068,12 @@ ++ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, ++ const char *fnout, const char *modeout, ++ size_t _max_mem, int n_threads, ++- const htsFormat *in_fmt, const htsFormat *out_fmt) +++ const htsFormat *in_fmt, const htsFormat *out_fmt, +++ char *arg_list, int no_pg, int write_index) ++ { ++ int ret = -1, res, i, n_files = 0; ++ size_t max_k, k, max_mem, bam_mem_offset; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ samFile *fp; ++ bam1_tag *buf = NULL; ++ bam1_t *b = bam_init1(); ++@@ -2139,7 +2092,8 @@ ++ g_is_by_qname = is_by_qname; ++ if (sort_by_tag) { ++ g_is_by_tag = 1; ++- strncpy(g_sort_tag, sort_by_tag, 2); +++ g_sort_tag[0] = sort_by_tag[0]; +++ g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; ++ } ++ ++ max_mem = _max_mem * n_threads; ++@@ -2162,14 +2116,15 @@ ++ else ++ new_so = "coordinate"; ++ ++- if (sam_hdr_change_HD(header, "SO", new_so) != 0) { ++- print_error("sort", ++- "failed to change sort order header to '%s'\n", new_so); +++ if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) +++ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) +++ ) { +++ print_error("sort", "failed to change sort order header to '%s'\n", new_so); ++ goto err; ++ } ++- if (sam_hdr_change_HD(header, "GO", NULL) != 0) { ++- print_error("sort", ++- "failed to delete group order header\n"); +++ +++ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { +++ print_error("sort", "failed to delete group order header\n"); ++ goto err; ++ } ++ ++@@ -2252,7 +2207,7 @@ ++ ++ // write the final output ++ if (n_files == 0 && num_in_mem < 2) { // a single block ++- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { +++ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { ++ print_error_errno("sort", "failed to create \"%s\"", fnout); ++ goto err; ++ } ++@@ -2269,7 +2224,8 @@ ++ } ++ if 
(bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, ++ n_files, fns, num_in_mem, in_mem, buf, ++- n_threads, "sort", in_fmt, out_fmt) < 0) { +++ n_threads, "sort", in_fmt, out_fmt, arg_list, +++ no_pg, write_index) < 0) { ++ // Propagate bam_merge_simple() failure; it has already emitted a ++ // message explaining the failure, so no further message is needed. ++ goto err; ++@@ -2293,7 +2249,7 @@ ++ free(buf); ++ free(bam_mem); ++ free(in_mem); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ if (fp) sam_close(fp); ++ return ret; ++ } ++@@ -2305,7 +2261,7 @@ ++ char *fnout = calloc(strlen(prefix) + 4 + 1, 1); ++ if (!fnout) return -1; ++ sprintf(fnout, "%s.bam", prefix); ++- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); +++ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); ++ free(fnout); ++ return ret; ++ } ++@@ -2320,8 +2276,9 @@ ++ " -n Sort by read name\n" ++ " -t TAG Sort by value of TAG. 
Uses position as secondary index (or read name if -n is set)\n" ++ " -o FILE Write final output to FILE rather than standard output\n" ++-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); ++- sam_global_opt_help(fp, "-.O..@"); +++" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(fp, "-.O..@-."); ++ } ++ ++ static void complain_about_memory_setting(size_t max_mem) { ++@@ -2344,8 +2301,8 @@ ++ int bam_sort(int argc, char *argv[]) ++ { ++ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; ++- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; ++- char* sort_tag = NULL; +++ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; +++ char* sort_tag = NULL, *arg_list = NULL; ++ char *fnout = "-", modeout[12]; ++ kstring_t tmpprefix = { 0, 0, NULL }; ++ struct stat st; ++@@ -2354,6 +2311,7 @@ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), ++ { "threads", required_argument, NULL, '@' }, +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -2361,7 +2319,7 @@ ++ switch (c) { ++ case 'o': fnout = optarg; o_seen = 1; break; ++ case 'n': is_by_qname = 1; break; ++- case 't': sort_tag = strdup(optarg); break; +++ case 't': sort_tag = optarg; break; ++ case 'm': { ++ char *q; ++ max_mem = strtol(optarg, &q, 0); ++@@ -2372,6 +2330,7 @@ ++ } ++ case 'T': kputs(optarg, &tmpprefix); break; ++ case 'l': level = atoi(optarg); break; +++ case 1: no_pg = 1; break; ++ ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++@@ -2395,6 +2354,16 @@ ++ goto sort_end; ++ } ++ +++ if (ga.write_index && (is_by_qname || sort_tag)) { +++ fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); +++ ga.write_index = 0; +++ } +++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("sort", "failed to create 
arg_list"); +++ return 1; +++ } +++ ++ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { ++ complain_about_memory_setting(max_mem); ++ ret = EXIT_FAILURE; ++@@ -2417,7 +2386,7 @@ ++ ++ ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", ++ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, ++- &ga.in, &ga.out); +++ &ga.in, &ga.out, arg_list, no_pg, ga.write_index); ++ if (ret >= 0) ++ ret = EXIT_SUCCESS; ++ else { ++@@ -2432,6 +2401,7 @@ ++ ++ sort_end: ++ free(tmpprefix.s); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ ++ return ret; ++--- python-pysam.orig/samtools/bam_sort.c.pysam.c +++++ python-pysam/samtools/bam_sort.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_sort.c -- sorting and merging. ++ ++- Copyright (C) 2008-2016 Genome Research Ltd. +++ Copyright (C) 2008-2019 Genome Research Ltd. ++ Portions copyright (C) 2009-2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -46,6 +46,7 @@ ++ #include "htslib/klist.h" ++ #include "htslib/kstring.h" ++ #include "htslib/sam.h" +++#include "htslib/hts_endian.h" ++ #include "sam_opts.h" ++ #include "samtools.h" ++ ++@@ -57,7 +58,7 @@ ++ bam1_t *bam_record; ++ union { ++ const uint8_t *tag; ++- uint64_t pos; +++ uint8_t pos_tid[12]; ++ } u; ++ } bam1_tag; ++ ++@@ -124,12 +125,12 @@ ++ return *pa? 1 : *pb? 
-1 : 0; ++ } ++ ++-#define HEAP_EMPTY UINT64_MAX +++#define HEAP_EMPTY (UINT64_MAX >> 1) ++ ++ typedef struct { ++ int i; ++- uint32_t rev; ++- uint64_t pos, idx; +++ uint32_t tid; +++ uint64_t pos:63, rev:1, idx; ++ bam1_tag entry; ++ } heap1_t; ++ ++@@ -155,6 +156,7 @@ ++ fb = b.entry.bam_record->core.flag & 0xc0; ++ if (fa != fb) return fa > fb; ++ } else { +++ if (a.tid != b.tid) return a.tid > b.tid; ++ if (a.pos != b.pos) return a.pos > b.pos; ++ if (a.rev != b.rev) return a.rev > b.rev; ++ } ++@@ -166,8 +168,7 @@ ++ KSORT_INIT(heap, heap1_t, heap_lt) ++ ++ typedef struct merged_header { ++- kstring_t out_hd; ++- kstring_t out_sq; +++ sam_hdr_t *hdr; ++ kstring_t out_rg; ++ kstring_t out_pg; ++ kstring_t out_co; ++@@ -189,80 +190,6 @@ ++ bool lost_coord_sort; ++ } trans_tbl_t; ++ ++-/* Something to look like a regmatch_t */ ++-typedef struct hdr_match { ++- ptrdiff_t rm_so; ++- ptrdiff_t rm_eo; ++-} hdr_match_t; ++- ++-/* ++- * Search for header lines of a particular record type. ++- * ++- * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/ ++- * but is much quicker. The locations found are returned in *matches, ++- * which has a signature the same as that of a regmatch_t. ++- * ++- * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG) ++- * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG) ++- * ++- * The location of the record (if found) is returned in matches[0] ++- * If tag is not NULL, the record is searched for the presence of the ++- * given tag. If found, the location of the value is returned in matches[1]. ++- * If the tag isn't found then the record is ignored and the search resumes ++- * on the next header line. ++- * ++- * For simplicity, some assumptions are made about rec and tag: ++- * rec should include the leading '@' sign and be three characters long. ++- * tag should be exactly two characters long. 
++- * These are always string constants when this is called below, so we don't ++- * bother to check here. ++- * ++- * Returns 0 if a match was found, -1 if not. ++- */ ++- ++- ++-static int hdr_line_match(const char *text, const char *rec, ++- const char *tag, hdr_match_t *matches) { ++- const char *line_start, *line_end = text; ++- const char *tag_start, *tag_end; ++- ++- for (;;) { ++- // Find record, ensure either at start of text or follows '\n' ++- line_start = strstr(line_end, rec); ++- while (line_start && line_start > text && *(line_start - 1) != '\n') { ++- line_start = strstr(line_start + 3, rec); ++- } ++- if (!line_start) return -1; ++- ++- // Find end of header line ++- line_end = strchr(line_start, '\n'); ++- if (!line_end) line_end = line_start + strlen(line_start); ++- ++- matches[0].rm_so = line_start - text; ++- matches[0].rm_eo = line_end - text; ++- if (!tag) return 0; // Match found if not looking for tag. ++- ++- for (tag_start = line_start + 3; tag_start < line_end; tag_start++) { ++- // Find possible tag start. Hacky but quick. ++- while (*tag_start > '\n') tag_start++; ++- ++- // Check it ++- if (tag_start[0] == '\t' ++- && strncmp(tag_start + 1, tag, 2) == 0 ++- && tag_start[3] == ':') { ++- // Found tag, record location and return. ++- tag_end = tag_start + 4; ++- while (*tag_end && *tag_end != '\t' && *tag_end != '\n') ++- ++tag_end; ++- matches[1].rm_so = tag_start - text + 4; ++- matches[1].rm_eo = tag_end - text; ++- return 0; ++- } ++- } ++- // Couldn't find tag, try again from end of current record. 
++- } ++-} ++- ++ static void trans_tbl_destroy(trans_tbl_t *tbl) { ++ khiter_t iter; ++ ++@@ -301,6 +228,9 @@ ++ merged_hdr = calloc(1, sizeof(*merged_hdr)); ++ if (merged_hdr == NULL) return NULL; ++ +++ merged_hdr->hdr = sam_hdr_init(); +++ if (!merged_hdr->hdr) goto fail; +++ ++ merged_hdr->targets_sz = 16; ++ merged_hdr->target_name = malloc(merged_hdr->targets_sz ++ * sizeof(*merged_hdr->target_name)); ++@@ -328,6 +258,7 @@ ++ kh_destroy(c2i, merged_hdr->sq_tids); ++ free(merged_hdr->target_name); ++ free(merged_hdr->target_len); +++ sam_hdr_destroy(merged_hdr->hdr); ++ free(merged_hdr); ++ return NULL; ++ } ++@@ -340,12 +271,6 @@ ++ return kputsn(src + from, to - from, dest) != to - from; ++ } ++ ++-// Append a header line match to kstring ++-static inline int match_to_ks(const char *src, const hdr_match_t *match, ++- kstring_t *dest) { ++- return range_to_ks(src, match->rm_so, match->rm_eo, dest); ++-} ++- ++ // Append a kstring to a kstring ++ static inline int ks_to_ks(kstring_t *src, kstring_t *dest) { ++ return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src); ++@@ -387,48 +312,32 @@ ++ */ ++ ++ static int trans_tbl_add_hd(merged_header_t* merged_hdr, ++- bam_hdr_t *translate) { ++- hdr_match_t match = {0, 0}; +++ sam_hdr_t *translate) { +++ kstring_t hd_line = { 0, 0, NULL }; +++ int res; ++ ++ // TODO: handle case when @HD needs merging. 
++ if (merged_hdr->have_hd) return 0; ++ ++- if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) { ++- return 0; +++ res = sam_hdr_find_hd(translate, &hd_line); +++ if (res < -1) { +++ print_error("merge", "failed to get @HD line from header"); +++ return -1; ++ } ++ ++- if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail; ++- if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail; ++- merged_hdr->have_hd = true; ++- ++- return 0; ++- ++- memfail: ++- perror(__func__); ++- return -1; ++-} +++ if (res < 0) // Not found +++ return 0; ++ ++-static inline int grow_target_list(merged_header_t* merged_hdr) { ++- size_t new_size; ++- char **new_names; ++- uint32_t *new_len; ++- ++- new_size = merged_hdr->targets_sz * 2; ++- new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size); ++- if (!new_names) goto fail; ++- merged_hdr->target_name = new_names; ++- ++- new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size); ++- if (!new_len) goto fail; ++- merged_hdr->target_len = new_len; +++ if (sam_hdr_add_lines(merged_hdr->hdr, hd_line.s, hd_line.l) < 0) { +++ print_error("merge", "failed to add @HD line to new header"); +++ free(hd_line.s); +++ return -1; +++ } ++ ++- merged_hdr->targets_sz = new_size; +++ free(hd_line.s); +++ merged_hdr->have_hd = true; ++ ++ return 0; ++- ++- fail: ++- perror(__func__); ++- return -1; ++ } ++ ++ /* ++@@ -446,54 +355,48 @@ ++ * Returns 0 on success, -1 on failure. 
++ */ ++ ++-static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, +++static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, ++ trans_tbl_t* tbl) { ++- ++- kstring_t *out_text = &merged_hdr->out_sq; ++- khash_t(c2i)* sq_tids = merged_hdr->sq_tids; ++- hdr_match_t *new_sq_matches = NULL; ++- char *text; ++- hdr_match_t matches[2]; ++ int32_t i; ++- int32_t old_n_targets = merged_hdr->n_targets; ++- khiter_t iter; ++- int min_tid = -1; +++ int min_tid = -1, res; +++ kstring_t sq_line = { 0, 0, NULL }, sq_sn = { 0, 0, NULL }; ++ ++ // Fill in the tid part of the translation table, adding new targets ++ // to the merged header as we go. ++ ++- for (i = 0; i < translate->n_targets; ++i) { +++ for (i = 0; i < sam_hdr_nref(translate); ++i) { +++ int trans_tid; +++ sq_sn.l = 0; +++ res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); +++ if (res < 0) { +++ print_error("merge", "failed to get @SQ SN #%d from header", i + 1); +++ goto fail; +++ } ++ ++- // Check if it's a new target. 
++- iter = kh_get(c2i, sq_tids, translate->target_name[i]); +++ trans_tid = sam_hdr_name2tid(merged_hdr->hdr, sq_sn.s); +++ if (trans_tid < -1) { +++ print_error("merge", "failed to lookup ref"); +++ goto fail; +++ } ++ ++- if (iter == kh_end(sq_tids)) { ++- int ret; +++ if (trans_tid < 0) { ++ // Append missing entries to out_hdr ++- ++- if (merged_hdr->n_targets == merged_hdr->targets_sz) { ++- if (grow_target_list(merged_hdr)) goto fail; +++ sq_line.l = 0; +++ res = sam_hdr_find_line_id(translate, "SQ", "SN", sq_sn.s, &sq_line); +++ if (res < 0) { +++ print_error("merge", "failed to get @SQ SN:%s from header", sq_sn.s); +++ goto fail; ++ } ++ ++- merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]); ++- if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail; ++- merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i]; ++- ++- // Record the new identifier for reference below, ++- // and when building the ttable for other inputs. ++- iter = kh_put(c2i, sq_tids, ++- merged_hdr->target_name[merged_hdr->n_targets], &ret); ++- if (ret < 0) { ++- free(merged_hdr->target_name[merged_hdr->n_targets]); ++- goto memfail; ++- } ++- assert(ret > 0); // Should not be in hash already. +++ trans_tid = sam_hdr_nref(merged_hdr->hdr); ++ ++- kh_value(sq_tids, iter) = merged_hdr->n_targets; ++- tbl->tid_trans[i] = merged_hdr->n_targets++; ++- } else { ++- tbl->tid_trans[i] = kh_value(sq_tids, iter); +++ res = sam_hdr_add_lines(merged_hdr->hdr, sq_line.s, sq_line.l); +++ if (res < 0) { +++ print_error("merge", "failed to add @SQ SN:%s to new header", sq_sn.s); +++ goto fail; +++ } ++ } +++ tbl->tid_trans[i] = trans_tid; ++ ++ if (tbl->tid_trans[i] > min_tid) { ++ min_tid = tbl->tid_trans[i]; ++@@ -502,78 +405,14 @@ ++ } ++ } ++ ++- if (merged_hdr->n_targets == old_n_targets) ++- return 0; // Everything done if no new targets. ++- ++- // Otherwise, find @SQ lines in translate->text for all newly added targets. 
++- ++- new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets) ++- * sizeof(*new_sq_matches)); ++- if (new_sq_matches == NULL) goto memfail; ++- ++- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { ++- new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1; ++- } ++- ++- text = translate->text; ++- while (hdr_line_match(text, "@SQ", "SN", matches) == 0) { ++- // matches[0] is whole line, matches[1] is SN value. ++- ++- // This is a bit disgusting, but avoids a copy... ++- char c = text[matches[1].rm_eo]; ++- int idx; ++- ++- text[matches[1].rm_eo] = '\0'; ++- ++- // Look up the SN value in the sq_tids hash. ++- iter = kh_get(c2i, sq_tids, text + matches[1].rm_so); ++- text[matches[1].rm_eo] = c; // restore text ++- ++- if (iter == kh_end(sq_tids)) { ++- // Warn about this, but it's not really fatal. ++- fprintf(samtools_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", ++- __func__, ++- (int) (matches[1].rm_eo - matches[1].rm_so), ++- text + matches[1].rm_so); ++- text += matches[0].rm_eo; ++- continue; // Skip to next ++- } ++- ++- idx = kh_value(sq_tids, iter); ++- if (idx >= old_n_targets) { ++- // is a new SQ, so record position so we can add it to out_text. ++- assert(idx < merged_hdr->n_targets); ++- ptrdiff_t off = text - translate->text; ++- new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off; ++- new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off; ++- } ++- ++- // Carry on searching from end of current match ++- text += matches[0].rm_eo; ++- } ++- ++- // Copy the @SQ headers found and recreate any missing from binary header. 
++- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { ++- if (new_sq_matches[i].rm_so >= 0) { ++- if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) ++- goto memfail; ++- if (kputc('\n', out_text) == EOF) goto memfail; ++- } else { ++- if (kputs("@SQ\tSN:", out_text) == EOF || ++- kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || ++- kputs("\tLN:", out_text) == EOF || ++- kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || ++- kputc('\n', out_text) == EOF) goto memfail; ++- } ++- } +++ free(sq_line.s); +++ free(sq_sn.s); ++ ++- free(new_sq_matches); ++ return 0; ++ ++- memfail: ++- perror(__func__); ++ fail: ++- free(new_sq_matches); +++ free(sq_line.s); +++ free(sq_sn.s); ++ return -1; ++ } ++ ++@@ -594,29 +433,30 @@ ++ * ++ */ ++ ++-static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate, +++static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, ++ bool merge, khash_t(cset)* known_ids, ++ khash_t(c2c)* id_map, char *override) { ++- hdr_match_t matches[2]; ++ khiter_t iter; ++- const char *text = translate->text; ++- const char *rec_type = is_rg ? "@RG" : "@PG"; +++ int num_ids, i; +++ const char *rec_type = is_rg ? "RG" : "PG"; ++ klist_t(hdrln) *hdr_lines; ++ ++ hdr_lines = kl_init(hdrln); ++ ++ // Search through translate's header ++- while (hdr_line_match(text, rec_type, "ID", matches) == 0) { ++- // matches[0] is the whole @RG/PG line; matches[1] is the ID field value +++ num_ids = sam_hdr_count_lines(translate, rec_type); +++ if (num_ids < 0) +++ goto fail; ++ +++ for (i = 0; i < num_ids; i++) { ++ kstring_t orig_id = { 0, 0, NULL }; // ID in original header ++ kstring_t transformed_id = { 0, 0, NULL }; // ID in output header ++ char *map_value; // Value to store in id_map ++ bool id_changed; // Have we changed the ID? ++ bool not_found_in_output; // ID isn't in the output header (yet) ++ ++- // Take a copy of the ID as we'll need it for a hash key. 
++- if (match_to_ks(text, &matches[1], &orig_id)) goto memfail; +++ if (sam_hdr_find_tag_pos(translate, rec_type, i, "ID", &orig_id) < 0) +++ goto fail; ++ ++ // is our matched ID in our output ID set already? ++ iter = kh_get(cset, known_ids, ks_str(&orig_id)); ++@@ -653,18 +493,38 @@ ++ ++ // Does this line need to go into our output header? ++ if (not_found_in_output) { ++- ++ // Take matched line and replace ID with transformed_id ++ kstring_t new_hdr_line = { 0, 0, NULL }; +++ if (sam_hdr_find_line_id(translate, rec_type, +++ "ID", ks_str(&orig_id), &new_hdr_line) < 0){ +++ goto fail; +++ } +++ +++ if (id_changed) { +++ char *idp = strstr(ks_str(&new_hdr_line), "\tID:"), *id_end; +++ ptrdiff_t id_offset, id_len; +++ if (!idp) { +++ print_error("merge", "failed to find ID in \"%s\"\n", +++ ks_str(&new_hdr_line)); +++ goto fail; +++ } +++ idp += 4; +++ for (id_end = idp; *id_end >= '\n'; id_end++) {} +++ +++ id_offset = idp - new_hdr_line.s; +++ id_len = id_end - idp; ++ ++- if (!id_changed) { // Can just copy ++- if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail; ++- } else { // Substitute new name for original ++- if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so, ++- &new_hdr_line)) goto memfail; ++- if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail; ++- if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo, ++- &new_hdr_line)) goto memfail; +++ if (id_len < transformed_id.l) { +++ if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) +++ goto fail; +++ } +++ if (id_len != transformed_id.l) { +++ memmove(new_hdr_line.s + id_offset + transformed_id.l, +++ new_hdr_line.s + id_offset + id_len, +++ new_hdr_line.l - id_offset - id_len + 1); +++ } +++ memcpy(new_hdr_line.s + id_offset, transformed_id.s, +++ transformed_id.l); ++ } ++ ++ // append line to output linked list ++@@ -688,8 +548,6 @@ ++ int in_there = 0; ++ iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there); ++ kh_value(id_map, iter) = 
map_value; ++- ++- text += matches[0].rm_eo; // next! ++ } ++ ++ // If there are no RG lines in the file and we are overriding add one ++@@ -726,6 +584,7 @@ ++ ++ memfail: ++ perror(__func__); +++ fail: ++ if (hdr_lines) kl_destroy(hdrln, hdr_lines); ++ return NULL; ++ } ++@@ -823,16 +682,18 @@ ++ * Returns 0 on success, -1 on failure. ++ */ ++ ++-static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, +++static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, ++ trans_tbl_t* tbl, bool merge_rg, bool merge_pg, ++ bool copy_co, char* rg_override) ++ { +++ kstring_t lines = { 0, 0, NULL }; ++ klist_t(hdrln) *rg_list = NULL; ++ klist_t(hdrln) *pg_list = NULL; ++ ++- tbl->n_targets = translate->n_targets; +++ tbl->n_targets = sam_hdr_nref(translate); ++ tbl->rg_trans = tbl->pg_trans = NULL; ++- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); +++ tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, +++ sizeof(int)); ++ if (tbl->tid_trans == NULL) goto memfail; ++ tbl->rg_trans = kh_init(c2c); ++ if (tbl->rg_trans == NULL) goto memfail; ++@@ -861,6 +722,7 @@ ++ goto fail; ++ ++ // Fix-up PP: tags in the new @PG records and add to output +++ lines.l = 0; ++ if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg)) ++ goto fail; ++ ++@@ -869,22 +731,22 @@ ++ ++ if (copy_co) { ++ // Just append @CO headers without translation ++- const char *line, *end_pointer; ++- for (line = translate->text; *line; line = end_pointer + 1) { ++- end_pointer = strchr(line, '\n'); ++- if (strncmp(line, "@CO", 3) == 0) { ++- if (end_pointer) { ++- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) ++- goto memfail; ++- } else { // Last line with no trailing '\n' ++- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; ++- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; ++- } ++- } ++- if (end_pointer == NULL) break; +++ int num_co = sam_hdr_count_lines(translate, "CO"), i; 
+++ if (num_co < 0) +++ goto fail; +++ +++ for (i = 0; i < num_co; i++) { +++ if (sam_hdr_find_line_pos(translate, "CO", i, &lines) < 0) +++ goto fail; +++ if (ks_to_ks(&lines, &merged_hdr->out_co)) +++ goto fail; +++ if (kputc('\n', &merged_hdr->out_co) < 0) +++ goto fail; ++ } ++ } ++ +++ free(lines.s); +++ ++ return 0; ++ ++ memfail: ++@@ -893,80 +755,22 @@ ++ trans_tbl_destroy(tbl); ++ if (rg_list) kl_destroy(hdrln, rg_list); ++ if (pg_list) kl_destroy(hdrln, pg_list); +++ free(lines.s); ++ return -1; ++ } ++ ++-static inline void move_kstr_to_text(char **text, kstring_t *ks) { ++- memcpy(*text, ks_str(ks), ks_len(ks)); ++- *text += ks_len(ks); ++- **text = '\0'; ++- free(ks_release(ks)); ++-} ++- ++-/* ++- * Populate a bam_hdr_t struct from data in a merged_header_t. ++- */ ++- ++-static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { ++- size_t txt_sz; ++- char *text; ++- bam_hdr_t *hdr; ++- ++- // Check output text size ++- txt_sz = (ks_len(&merged_hdr->out_hd) ++- + ks_len(&merged_hdr->out_sq) ++- + ks_len(&merged_hdr->out_rg) ++- + ks_len(&merged_hdr->out_pg) ++- + ks_len(&merged_hdr->out_co)); ++- if (txt_sz >= INT32_MAX) { ++- fprintf(samtools_stderr, "[%s] Output header text too long\n", __func__); ++- return NULL; ++- } ++- ++- // Allocate new header ++- hdr = bam_hdr_init(); ++- if (hdr == NULL) goto memfail; ++- ++- // Transfer targets arrays to new header ++- hdr->n_targets = merged_hdr->n_targets; ++- if (hdr->n_targets > 0) { ++- // Try to shrink targets arrays to correct size ++- hdr->target_name = realloc(merged_hdr->target_name, ++- hdr->n_targets * sizeof(char*)); ++- if (!hdr->target_name) hdr->target_name = merged_hdr->target_name; ++- ++- hdr->target_len = realloc(merged_hdr->target_len, ++- hdr->n_targets * sizeof(uint32_t)); ++- if (!hdr->target_len) hdr->target_len = merged_hdr->target_len; ++- ++- // These have either been freed by realloc() or, in the unlikely ++- // event that failed, have had their ownership 
transferred to hdr ++- merged_hdr->target_name = NULL; ++- merged_hdr->target_len = NULL; ++- } ++- else { ++- hdr->target_name = NULL; ++- hdr->target_len = NULL; ++- } ++- ++- // Allocate text ++- text = hdr->text = malloc(txt_sz + 1); ++- if (!text) goto memfail; ++- ++- // Put header text in order @HD, @SQ, @RG, @PG, @CO ++- move_kstr_to_text(&text, &merged_hdr->out_hd); ++- move_kstr_to_text(&text, &merged_hdr->out_sq); ++- move_kstr_to_text(&text, &merged_hdr->out_rg); ++- move_kstr_to_text(&text, &merged_hdr->out_pg); ++- move_kstr_to_text(&text, &merged_hdr->out_co); ++- hdr->l_text = txt_sz; ++- ++- return hdr; +++static int finish_merged_header(merged_header_t *merged_hdr) { +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), +++ ks_len(&merged_hdr->out_rg)) < 0) +++ return -1; +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_pg), +++ ks_len(&merged_hdr->out_pg)) < 0) +++ return -1; +++ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_co), +++ ks_len(&merged_hdr->out_co)) < 0) +++ return -1; ++ ++- memfail: ++- perror(__func__); ++- bam_hdr_destroy(hdr); ++- return NULL; +++ return 0; ++ } ++ ++ /* ++@@ -981,8 +785,6 @@ ++ size_t i; ++ khiter_t iter; ++ if (!merged_hdr) return; ++- free(ks_release(&merged_hdr->out_hd)); ++- free(ks_release(&merged_hdr->out_sq)); ++ free(ks_release(&merged_hdr->out_rg)); ++ free(ks_release(&merged_hdr->out_pg)); ++ free(ks_release(&merged_hdr->out_co)); ++@@ -1149,25 +951,30 @@ ++ @param cmd command name (used in print_error() etc) ++ @param in_fmt format options for input files ++ @param out_fmt output file format and options +++ @param write_index create the index, together with the output file +++ @param arg_list command string for PG line +++ @param no_pg if 1, do not add a new PG line ++ @discussion Padding information may NOT correctly maintained. This ++ function is NOT thread safe. 
++ */ ++ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, ++- const char *headers, int n, char * const *fn, int flag, ++- const char *reg, int n_threads, const char *cmd, ++- const htsFormat *in_fmt, const htsFormat *out_fmt) +++ const char *headers, int n, char * const *fn, char * const *fn_idx, +++ int flag, const char *reg, int n_threads, const char *cmd, +++ const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, +++ char *arg_list, int no_pg) ++ { ++ samFile *fpout, **fp = NULL; ++ heap1_t *heap = NULL; ++- bam_hdr_t *hout = NULL; ++- bam_hdr_t *hin = NULL; +++ sam_hdr_t *hout = NULL; +++ sam_hdr_t *hin = NULL; ++ int i, j, *RG_len = NULL; ++ uint64_t idx = 0; ++ char **RG = NULL; ++ hts_itr_t **iter = NULL; ++- bam_hdr_t **hdr = NULL; +++ sam_hdr_t **hdr = NULL; ++ trans_tbl_t *translation_tbl = NULL; ++ int *rtrans = NULL; +++ char *out_idx_fn = NULL; ++ merged_header_t *merged_hdr = init_merged_header(); ++ if (!merged_hdr) return -1; ++ ++@@ -1190,7 +997,7 @@ ++ if (sort_tag) { ++ g_is_by_tag = 1; ++ g_sort_tag[0] = sort_tag[0]; ++- g_sort_tag[1] = sort_tag[1]; +++ g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; ++ } ++ ++ fp = (samFile**)calloc(n, sizeof(samFile*)); ++@@ -1199,7 +1006,7 @@ ++ if (!heap) goto mem_fail; ++ iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); ++ if (!iter) goto mem_fail; ++- hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); +++ hdr = (sam_hdr_t**)calloc(n, sizeof(sam_hdr_t*)); ++ if (!hdr) goto mem_fail; ++ translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); ++ if (!translation_tbl) goto mem_fail; ++@@ -1236,7 +1043,7 @@ ++ ++ // open and read the header from each file ++ for (i = 0; i < n; ++i) { ++- bam_hdr_t *hin; +++ sam_hdr_t *hin; ++ fp[i] = sam_open_format(fn[i], "r", in_fmt); ++ if (fp[i] == NULL) { ++ print_error_errno(cmd, "fail to open \"%s\"", fn[i]); ++@@ -1257,7 +1064,7 @@ ++ // TODO sam_itr_next() doesn't yet work for SAM files, ++ // so for those keep the headers around for use with sam_read1() ++ if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; ++- else { bam_hdr_destroy(hin); hdr[i] = NULL; } +++ else { sam_hdr_destroy(hin); hdr[i] = NULL; } ++ ++ if ((translation_tbl+i)->lost_coord_sort && !by_qname) { ++ fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); ++@@ -1286,41 +1093,34 @@ ++ } ++ ++ // Transform the header into standard form ++- hout = finish_merged_header(merged_hdr); +++ if (finish_merged_header(merged_hdr) < 0) +++ goto fail; +++ +++ hout = merged_hdr->hdr; ++ if (!hout) return -1; // FIXME: memory leak ++ ++ // If we're only merging a specified region move our iters to start at that point ++ if (reg) { ++- int tid, beg, end; ++- const char *name_lim; +++ int tid; +++ hts_pos_t beg, end; ++ ++- rtrans = rtrans_build(n, hout->n_targets, translation_tbl); +++ rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); ++ if (!rtrans) goto mem_fail; ++ ++- name_lim = hts_parse_reg(reg, &beg, &end); ++- if (name_lim) { ++- char *name = malloc(name_lim - reg + 1); ++- if (!name) goto mem_fail; ++- memcpy(name, 
reg, name_lim - reg); ++- name[name_lim - reg] = '\0'; ++- tid = bam_name2id(hout, name); ++- free(name); ++- } ++- else { ++- // not parsable as a region, but possibly a sequence named "foo:a" ++- tid = bam_name2id(hout, reg); ++- beg = 0; ++- end = INT_MAX; ++- } ++- if (tid < 0) { ++- if (name_lim) fprintf(samtools_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); ++- else fprintf(samtools_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); +++ if (!sam_parse_region(hout, reg, &tid, &beg, &end, 0)) { +++ fprintf(samtools_stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); ++ goto fail; ++ } ++ for (i = 0; i < n; ++i) { ++- hts_idx_t *idx = sam_index_load(fp[i], fn[i]); +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx != NULL) { +++ idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); +++ } else { +++ idx = sam_index_load(fp[i], fn[i]); +++ } ++ // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space ++- int mapped_tid = rtrans[i*hout->n_targets+tid]; +++ int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; ++ if (idx == NULL) { ++ fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", ++ __func__, fn[i]); ++@@ -1336,7 +1136,7 @@ ++ if (mapped_tid != INT32_MIN) { ++ fprintf(samtools_stderr, ++ "[%s] failed to get iterator over " ++- "{%s, %d, %d, %d}\n", +++ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", ++ __func__, fn[i], mapped_tid, beg, end); ++ } else { ++ fprintf(samtools_stderr, ++@@ -1373,7 +1173,8 @@ ++ res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); ++ if (res >= 0) { ++ bam_translate(h->entry.bam_record, translation_tbl + i); ++- h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1); +++ h->tid = h->entry.bam_record->core.tid; +++ h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); ++ h->rev = bam_is_rev(h->entry.bam_record); ++ h->idx = idx++; ++ if (g_is_by_tag) { ++@@ -1398,11 +1199,26 @@ ++ print_error_errno(cmd, "failed to create \"%s\"", out); ++ return -1; ++ } +++ if (!no_pg && sam_hdr_add_pg(hout, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) { +++ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); +++ sam_close(fpout); +++ return -1; +++ } ++ if (sam_hdr_write(fpout, hout) != 0) { ++ print_error_errno(cmd, "failed to write header to \"%s\"", out); ++ sam_close(fpout); ++ return -1; ++ } +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fpout, out, hout))){ +++ sam_close(fpout); +++ return -1; +++ } +++ } ++ if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); ++ ++ // Begin the actual merge ++@@ -1417,11 +1233,13 @@ ++ if (sam_write1(fpout, hout, b) < 0) { ++ print_error_errno(cmd, "failed writing to \"%s\"", out); ++ sam_close(fpout); +++ free(out_idx_fn); ++ return -1; ++ } ++ if ((j = (iter[heap->i]? 
sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { ++ bam_translate(b, translation_tbl + heap->i); ++- heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1); +++ heap->tid = b->core.tid; +++ heap->pos = (uint64_t)(b->core.pos + 1); ++ heap->rev = bam_is_rev(b); ++ heap->idx = idx++; ++ if (g_is_by_tag) { ++@@ -1441,6 +1259,14 @@ ++ ks_heapadjust(heap, 0, n, heap); ++ } ++ +++ if (write_index) { +++ if (sam_idx_save(fpout) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ } +++ free(out_idx_fn); +++ ++ // Clean up and close ++ if (flag & MERGE_RG) { ++ for (i = 0; i != n; ++i) free(RG[i]); ++@@ -1449,11 +1275,11 @@ ++ for (i = 0; i < n; ++i) { ++ trans_tbl_destroy(translation_tbl + i); ++ hts_itr_destroy(iter[i]); ++- bam_hdr_destroy(hdr[i]); +++ sam_hdr_destroy(hdr[i]); ++ sam_close(fp[i]); ++ } ++- bam_hdr_destroy(hin); ++- bam_hdr_destroy(hout); +++ sam_hdr_destroy(hin); +++ sam_hdr_destroy(hout); ++ free_merged_header(merged_hdr); ++ free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); ++ if (sam_close(fpout) < 0) { ++@@ -1475,11 +1301,11 @@ ++ for (i = 0; i < n; ++i) { ++ if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); ++ if (iter && iter[i]) hts_itr_destroy(iter[i]); ++- if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); +++ if (hdr && hdr[i]) sam_hdr_destroy(hdr[i]); ++ if (fp && fp[i]) sam_close(fp[i]); ++ if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); ++ } ++- if (hout) bam_hdr_destroy(hout); +++ if (hout) sam_hdr_destroy(hout); ++ free(RG); ++ free(translation_tbl); ++ free(hdr); ++@@ -1487,6 +1313,7 @@ ++ free(heap); ++ free(fp); ++ free(rtrans); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++@@ -1497,7 +1324,7 @@ ++ strcpy(mode, "wb"); ++ if (flag & MERGE_UNCOMP) strcat(mode, "0"); ++ else if (flag & MERGE_LEVEL1) strcat(mode, "1"); ++- return 
bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL); +++ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); ++ } ++ ++ static void merge_usage(FILE *to) ++@@ -1518,23 +1345,27 @@ ++ " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" ++ " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" ++ " -s VALUE Override random seed\n" ++-" -b FILE List of input BAM filenames, one per line [null]\n"); ++- sam_global_opt_help(to, "-.O..@"); +++" -b FILE List of input BAM filenames, one per line [null]\n" +++" -X Use customized index files\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(to, "-.O..@.."); ++ } ++ ++ int bam_merge(int argc, char *argv[]) ++ { ++- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; +++ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; ++ char *fn_headers = NULL, *reg = NULL, mode[12]; ++- char *sort_tag = NULL; +++ char *sort_tag = NULL, *arg_list = NULL; ++ long random_seed = (long)time(NULL); ++ char** fn = NULL; ++- int fn_size = 0; +++ char** fn_idx = NULL; +++ int fn_size = 0, no_pg = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), ++ { "threads", required_argument, NULL, '@' }, +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -1543,13 +1374,13 @@ ++ return 0; ++ } ++ ++- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'r': flag |= MERGE_RG; break; ++ case 'f': flag |= MERGE_FORCE; break; ++- case 'h': fn_headers = strdup(optarg); break; +++ case 'h': fn_headers = optarg; break; ++ case 'n': is_by_qname = 1; break; ++- case 't': sort_tag = strdup(optarg); break; +++ case 't': 
sort_tag = optarg; break; ++ case '1': flag |= MERGE_LEVEL1; level = 1; break; ++ case 'u': flag |= MERGE_UNCOMP; level = 0; break; ++ case 'R': reg = strdup(optarg); break; ++@@ -1557,8 +1388,13 @@ ++ case 'c': flag |= MERGE_COMBINE_RG; break; ++ case 'p': flag |= MERGE_COMBINE_PG; break; ++ case 's': random_seed = atol(optarg); break; +++ case 'X': has_index_file = 1; break; // -X flag for index filename ++ case 'b': { ++ // load the list of files to read +++ if (has_index_file) { +++ fprintf(samtools_stderr,"Error: The -b option cannot be combined with -X\n"); +++ ret = 1; goto end; +++ } ++ int nfiles; ++ char **fn_read = hts_readlines(optarg, &nfiles); ++ if (fn_read) { ++@@ -1575,7 +1411,7 @@ ++ } ++ break; ++ } ++- +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': merge_usage(samtools_stderr); return 1; ++@@ -1587,6 +1423,11 @@ ++ return 1; ++ } ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("merge", "failed to create arg_list"); +++ return 1; +++ } +++ ++ srand48(random_seed); ++ if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { ++ FILE *fp = fopen(argv[optind], "rb"); ++@@ -1597,24 +1438,41 @@ ++ } ++ } ++ ++- int nargcfiles = argc - (optind+1); +++ int nargcfiles = 0; +++ if (has_index_file) { // Calculate # of input BAM files +++ if ((argc - optind - 1) % 2 != 0) { +++ fprintf(samtools_stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); +++ return 1; +++ } +++ nargcfiles = (argc - optind - 1) / 2; +++ } else { +++ nargcfiles = argc - optind - 1; +++ } +++ ++ if (nargcfiles > 0) { ++ // Add argc files to end of array ++ fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); ++ if (fn == NULL) { ret = 1; goto end; } ++ memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); +++ +++ if(has_index_file) { +++ fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); +++ if (fn_idx == NULL) { ret = 1; goto end; } +++ memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); +++ } ++ } ++ if (fn_size+nargcfiles < 1) { ++ print_error("merge", "You must specify at least one (and usually two or more) input files"); ++ merge_usage(samtools_stderr); +++ free(fn_idx); ++ return 1; ++ } ++ strcpy(mode, "wb"); ++ sam_open_mode(mode+1, argv[optind], NULL); ++ if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); ++ if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, ++- fn_size+nargcfiles, fn, flag, reg, ga.nthreads, ++- "merge", &ga.in, &ga.out) < 0) +++ fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, +++ "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ++ ret = 1; ++ ++ end: ++@@ -1623,8 +1481,9 @@ ++ for (i=0; ii, res; ++ if (i < nfiles) { // read from file ++ res = sam_read1(fp[i], hout, heap->entry.bam_record); ++@@ -1657,8 +1516,8 @@ ++ } ++ } ++ if (res >= 0) { ++- heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) ++- | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)); +++ heap->tid = heap->entry.bam_record->core.tid; +++ heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); ++ heap->rev = bam_is_rev(heap->entry.bam_record); ++ heap->idx = (*idx)++; ++ if (g_is_by_tag) { ++@@ -1678,21 +1537,23 @@ ++ } ++ ++ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ++- const char *mode, bam_hdr_t *hout, +++ const char 
*mode, sam_hdr_t *hout, ++ int n, char * const *fn, int num_in_mem, ++ buf_region *in_mem, bam1_tag *buf, int n_threads, ++ const char *cmd, const htsFormat *in_fmt, ++- const htsFormat *out_fmt) { +++ const htsFormat *out_fmt, char *arg_list, int no_pg, +++ int write_index) { ++ samFile *fpout = NULL, **fp = NULL; ++ heap1_t *heap = NULL; ++ uint64_t idx = 0; ++ int i, heap_size = n + num_in_mem; +++ char *out_idx_fn = NULL; ++ ++ g_is_by_qname = by_qname; ++ if (sort_tag) { ++ g_is_by_tag = 1; ++ g_sort_tag[0] = sort_tag[0]; ++- g_sort_tag[1] = sort_tag[1]; +++ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0'; ++ } ++ if (n > 0) { ++ fp = (samFile**)calloc(n, sizeof(samFile*)); ++@@ -1703,7 +1564,7 @@ ++ ++ // Open each file, read the header and put the first read into the heap ++ for (i = 0; i < heap_size; i++) { ++- bam_hdr_t *hin; +++ sam_hdr_t *hin; ++ heap1_t *h = &heap[i]; ++ ++ if (i < n) { ++@@ -1720,7 +1581,7 @@ ++ goto fail; ++ } ++ // ... and throw it away as we don't really need it ++- bam_hdr_destroy(hin); +++ sam_hdr_destroy(hin); ++ } ++ ++ // Get a read into the heap ++@@ -1743,6 +1604,16 @@ ++ return -1; ++ } ++ +++ if (!no_pg && sam_hdr_add_pg(hout, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); +++ sam_close(fpout); +++ return -1; +++ } +++ ++ if (n_threads > 1) hts_set_threads(fpout, n_threads); ++ ++ if (sam_hdr_write(fpout, hout) != 0) { ++@@ -1751,14 +1622,20 @@ ++ return -1; ++ } ++ +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fpout, out, hout))){ +++ sam_close(fpout); +++ return -1; +++ } +++ } +++ ++ // Now do the merge ++ ks_heapmake(heap, heap_size, heap); ++ while (heap->pos != HEAP_EMPTY) { ++ bam1_t *b = heap->entry.bam_record; ++ if (sam_write1(fpout, hout, b) < 0) { ++ print_error_errno(cmd, "failed writing to \"%s\"", out); ++- sam_close(fpout); ++- return -1; +++ goto fail; ++ } ++ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { ++ assert(heap->i < n); ++@@ -1777,6 +1654,15 @@ ++ } ++ free(fp); ++ free(heap); +++ +++ if (write_index) { +++ if (sam_idx_save(fpout) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ free(out_idx_fn); +++ } +++ ++ if (sam_close(fpout) < 0) { ++ print_error(cmd, "error closing output file"); ++ return -1; ++@@ -1788,11 +1674,15 @@ ++ fail: ++ for (i = 0; i < n; i++) { ++ if (fp && fp[i]) sam_close(fp[i]); ++- if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); +++ } +++ for (i = 0; i < heap_size; i++) { +++ if (heap && heap[i].i < n && heap[i].entry.bam_record) +++ bam_destroy1(heap[i].entry.bam_record); ++ } ++ free(fp); ++ free(heap); ++ if (fpout) sam_close(fpout); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++@@ -1813,8 +1703,13 @@ ++ if (t != 0) return t; ++ return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); ++ } else { ++- pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1); ++- pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1); +++ pa = a.bam_record->core.tid; +++ pb = b.bam_record->core.tid; +++ +++ if (pa == pb) { +++ pa = 
(uint64_t)(a.bam_record->core.pos+1); +++ pb = (uint64_t)(b.bam_record->core.pos+1); +++ } ++ ++ if (pa == pb) { ++ pa = bam_is_rev(a.bam_record); ++@@ -1915,7 +1810,7 @@ ++ size_t buf_len; ++ const char *prefix; ++ bam1_tag *buf; ++- const bam_hdr_t *h; +++ const sam_hdr_t *h; ++ int index; ++ int error; ++ int no_save; ++@@ -1923,45 +1818,99 @@ ++ ++ // Returns 0 for success ++ // -1 for failure ++-static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) +++static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, +++ const sam_hdr_t *h, int n_threads, const htsFormat *fmt, +++ char *arg_list, int no_pg, int write_index) ++ { ++ size_t i; ++ samFile* fp; +++ char *out_idx_fn = NULL; +++ ++ fp = sam_open_format(fn, mode, fmt); ++ if (fp == NULL) return -1; ++- if (sam_hdr_write(fp, h) != 0) goto fail; +++ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ goto fail; +++ } +++ if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; +++ +++ if (write_index) { +++ if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; +++ } +++ ++ if (n_threads > 1) hts_set_threads(fp, n_threads); ++ for (i = 0; i < l; ++i) { ++- if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; +++ if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; ++ } +++ +++ if (write_index) { +++ if (sam_idx_save(fp) < 0) { +++ print_error_errno("merge", "writing index failed"); +++ goto fail; +++ } +++ free(out_idx_fn); +++ } +++ +++ ++ if (sam_close(fp) < 0) return -1; ++ return 0; ++ fail: ++ sam_close(fp); +++ free(out_idx_fn); ++ return -1; ++ } ++ ++ #define NUMBASE 256 ++-#define STEP 8 ++ ++-static int ks_radixsort(size_t n, bam1_tag *buf, const bam_hdr_t *h) +++static int ks_radixsort(size_t n, bam1_tag *buf, const sam_hdr_t *h) ++ { ++ int curr = 0, ret = -1; ++ ssize_t i; ++ bam1_tag *buf_ar2[2], *bam_a, *bam_b; ++- uint64_t max_pos = 0, max_digit = 0, shift = 0; ++- +++ uint64_t max_pos = 1; +++ uint32_t max_tid = 1, tid_bytes = 0, pos_bytes = 0, byte = 0; +++ uint32_t tid_shift_l, tid_shift_r; +++ int nref = sam_hdr_nref(h); +++ +++ // Count number of bytes needed for biggest tid and pos +++ // Notes: Add 1 to core.pos so always positive. +++ // Convert unmapped tid (-1) to number of references so unmapped +++ // sort to the end. ++ for (i = 0; i < n; i++) { ++ bam1_t *b = buf[i].bam_record; ++- int32_t tid = b->core.tid == -1 ? h->n_targets : b->core.tid; ++- buf[i].u.pos = (uint64_t)tid<<32 | (b->core.pos+1)<<1 | bam_is_rev(b); ++- if (max_pos < buf[i].u.pos) ++- max_pos = buf[i].u.pos; ++- } ++- ++- while (max_pos) { ++- ++max_digit; ++- max_pos = max_pos >> 1; +++ uint32_t tid = b->core.tid == -1 ? 
nref : b->core.tid; +++ uint64_t pos = ((uint64_t)(b->core.pos + 1) << 1) | bam_is_rev(b); +++ if (max_tid < tid) +++ max_tid = tid; +++ if (max_pos < pos) +++ max_pos = pos; +++ } +++ +++ for (; max_pos > 0; max_pos >>= 8) pos_bytes++; +++ for (; max_tid > 0; max_tid >>= 8) tid_bytes++; +++ assert(pos_bytes + tid_bytes < sizeof(buf[0].u.pos_tid)); +++ +++ tid_shift_l = pos_bytes * 8; +++ tid_shift_r = 64 - tid_shift_l; +++ +++ // Write position and tid into bam1_tag::u::pos_tid using minimum number +++ // of bytes required. Values are stored little-endian so that we +++ // get a least-significant digit (byte) radix sort. +++ for (i = 0; i < n; i++) { +++ bam1_t *b = buf[i].bam_record; +++ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; +++ // 'pos' here includes as many bytes of tid as will fit +++ // in the space remaining above pos_bytes. The rest of tid +++ // is written out separately. +++ uint64_t pos = (bam_is_rev(b) | +++ ((uint64_t)(b->core.pos + 1) << 1) | +++ (tid_shift_l < 64 ? (uint64_t) tid << tid_shift_l : 0)); +++ u64_to_le(pos, buf[i].u.pos_tid); +++ u32_to_le(tid_shift_r < 32 ? 
tid >> tid_shift_r : 0, +++ &buf[i].u.pos_tid[8]); ++ } ++ ++ buf_ar2[0] = buf; ++@@ -1971,18 +1920,18 @@ ++ goto err; ++ } ++ ++- while (shift < max_digit){ +++ // Least-significant digit radix sort (where "digits" are bytes) +++ for (byte = 0; byte < pos_bytes + tid_bytes; byte++) { ++ size_t remainders[NUMBASE] = { 0 }; ++ bam_a = buf_ar2[curr]; bam_b = buf_ar2[1-curr]; ++ for (i = 0; i < n; ++i) ++- remainders[(bam_a[i].u.pos >> shift) % NUMBASE]++; +++ remainders[bam_a[i].u.pos_tid[byte]]++; ++ for (i = 1; i < NUMBASE; ++i) ++ remainders[i] += remainders[i - 1]; ++ for (i = n - 1; i >= 0; i--) { ++- size_t j = --remainders[(bam_a[i].u.pos >> shift) % NUMBASE]; +++ size_t j = --remainders[bam_a[i].u.pos_tid[byte]]; ++ bam_b[j] = bam_a[i]; ++ } ++- shift += STEP; ++ curr = 1 - curr; ++ } ++ if (curr == 1) { ++@@ -2036,10 +1985,10 @@ ++ return 0; ++ } ++ ++- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) +++ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) ++ w->error = errno; ++ } else { ++- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) +++ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) ++ w->error = errno; ++ } ++ ++@@ -2048,7 +1997,7 @@ ++ } ++ ++ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, ++- const bam_hdr_t *h, int n_threads, buf_region *in_mem) +++ const sam_hdr_t *h, int n_threads, buf_region *in_mem) ++ { ++ int i; ++ size_t pos, rest; ++@@ -2109,6 +2058,9 @@ ++ @param max_mem approxiate maximum memory (very inaccurate) ++ @param in_fmt input file format options ++ @param out_fmt output file format and options +++ @param arg_list command string for PG line +++ @param no_pg if 1, do not add a new PG line +++ @paran write_index create index for the output file ++ @return 0 for successful sorting, negative on errors ++ ++ @discussion It may create multiple temporary subalignment files ++@@ -2118,11 
+2070,12 @@ ++ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, ++ const char *fnout, const char *modeout, ++ size_t _max_mem, int n_threads, ++- const htsFormat *in_fmt, const htsFormat *out_fmt) +++ const htsFormat *in_fmt, const htsFormat *out_fmt, +++ char *arg_list, int no_pg, int write_index) ++ { ++ int ret = -1, res, i, n_files = 0; ++ size_t max_k, k, max_mem, bam_mem_offset; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ samFile *fp; ++ bam1_tag *buf = NULL; ++ bam1_t *b = bam_init1(); ++@@ -2141,7 +2094,8 @@ ++ g_is_by_qname = is_by_qname; ++ if (sort_by_tag) { ++ g_is_by_tag = 1; ++- strncpy(g_sort_tag, sort_by_tag, 2); +++ g_sort_tag[0] = sort_by_tag[0]; +++ g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; ++ } ++ ++ max_mem = _max_mem * n_threads; ++@@ -2164,14 +2118,15 @@ ++ else ++ new_so = "coordinate"; ++ ++- if (sam_hdr_change_HD(header, "SO", new_so) != 0) { ++- print_error("sort", ++- "failed to change sort order header to '%s'\n", new_so); +++ if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) +++ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) +++ ) { +++ print_error("sort", "failed to change sort order header to '%s'\n", new_so); ++ goto err; ++ } ++- if (sam_hdr_change_HD(header, "GO", NULL) != 0) { ++- print_error("sort", ++- "failed to delete group order header\n"); +++ +++ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { +++ print_error("sort", "failed to delete group order header\n"); ++ goto err; ++ } ++ ++@@ -2254,7 +2209,7 @@ ++ ++ // write the final output ++ if (n_files == 0 && num_in_mem < 2) { // a single block ++- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { +++ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { ++ print_error_errno("sort", "failed to create \"%s\"", fnout); ++ goto err; ++ } ++@@ -2271,7 +2226,8 @@ ++ } ++ if 
(bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, ++ n_files, fns, num_in_mem, in_mem, buf, ++- n_threads, "sort", in_fmt, out_fmt) < 0) { +++ n_threads, "sort", in_fmt, out_fmt, arg_list, +++ no_pg, write_index) < 0) { ++ // Propagate bam_merge_simple() failure; it has already emitted a ++ // message explaining the failure, so no further message is needed. ++ goto err; ++@@ -2295,7 +2251,7 @@ ++ free(buf); ++ free(bam_mem); ++ free(in_mem); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ if (fp) sam_close(fp); ++ return ret; ++ } ++@@ -2307,7 +2263,7 @@ ++ char *fnout = calloc(strlen(prefix) + 4 + 1, 1); ++ if (!fnout) return -1; ++ sprintf(fnout, "%s.bam", prefix); ++- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); +++ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); ++ free(fnout); ++ return ret; ++ } ++@@ -2322,8 +2278,9 @@ ++ " -n Sort by read name\n" ++ " -t TAG Sort by value of TAG. 
Uses position as secondary index (or read name if -n is set)\n" ++ " -o FILE Write final output to FILE rather than standard output\n" ++-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); ++- sam_global_opt_help(fp, "-.O..@"); +++" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(fp, "-.O..@-."); ++ } ++ ++ static void complain_about_memory_setting(size_t max_mem) { ++@@ -2346,8 +2303,8 @@ ++ int bam_sort(int argc, char *argv[]) ++ { ++ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; ++- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; ++- char* sort_tag = NULL; +++ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; +++ char* sort_tag = NULL, *arg_list = NULL; ++ char *fnout = "-", modeout[12]; ++ kstring_t tmpprefix = { 0, 0, NULL }; ++ struct stat st; ++@@ -2356,6 +2313,7 @@ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), ++ { "threads", required_argument, NULL, '@' }, +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -2363,7 +2321,7 @@ ++ switch (c) { ++ case 'o': fnout = optarg; o_seen = 1; break; ++ case 'n': is_by_qname = 1; break; ++- case 't': sort_tag = strdup(optarg); break; +++ case 't': sort_tag = optarg; break; ++ case 'm': { ++ char *q; ++ max_mem = strtol(optarg, &q, 0); ++@@ -2374,6 +2332,7 @@ ++ } ++ case 'T': kputs(optarg, &tmpprefix); break; ++ case 'l': level = atoi(optarg); break; +++ case 1: no_pg = 1; break; ++ ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++@@ -2397,6 +2356,16 @@ ++ goto sort_end; ++ } ++ +++ if (ga.write_index && (is_by_qname || sort_tag)) { +++ fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); +++ ga.write_index = 0; +++ } +++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("sort", "failed to 
create arg_list"); +++ return 1; +++ } +++ ++ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { ++ complain_about_memory_setting(max_mem); ++ ret = EXIT_FAILURE; ++@@ -2419,7 +2388,7 @@ ++ ++ ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", ++ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, ++- &ga.in, &ga.out); +++ &ga.in, &ga.out, arg_list, no_pg, ga.write_index); ++ if (ret >= 0) ++ ret = EXIT_SUCCESS; ++ else { ++@@ -2434,6 +2403,7 @@ ++ ++ sort_end: ++ free(tmpprefix.s); +++ free(arg_list); ++ sam_global_args_free(&ga); ++ ++ return ret; ++--- python-pysam.orig/samtools/bam_split.c +++++ python-pysam/samtools/bam_split.c ++@@ -1,6 +1,6 @@ ++ /* bam_split.c -- split subcommand. ++ ++- Copyright (C) 2013-2016 Genome Research Ltd. +++ Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. ++ ++ Author: Martin Pollard ++ ++@@ -24,7 +24,6 @@ ++ ++ #include ++ ++-#include ++ #include ++ #include ++ #include ++@@ -32,6 +31,8 @@ ++ #include ++ #include ++ #include +++#include +++#include ++ #include ++ #include ++ #include ++@@ -43,11 +44,12 @@ ++ KHASH_MAP_INIT_STR(c2i, int) ++ ++ struct parsed_opts { ++- char* merged_input_name; ++- char* unaccounted_header_name; ++- char* unaccounted_name; ++- char* output_format_string; +++ const char *merged_input_name; +++ const char *unaccounted_header_name; +++ const char *unaccounted_name; +++ const char *output_format_string; ++ bool verbose; +++ int no_pg; ++ sam_global_args ga; ++ }; ++ ++@@ -55,16 +57,18 @@ ++ ++ struct state { ++ samFile* merged_input_file; ++- bam_hdr_t* merged_input_header; +++ sam_hdr_t* merged_input_header; ++ samFile* unaccounted_file; ++- bam_hdr_t* unaccounted_header; +++ sam_hdr_t* unaccounted_header; ++ size_t output_count; ++ char** rg_id; +++ char **rg_index_file_name; ++ char **rg_output_file_name; ++ samFile** rg_output_file; ++- bam_hdr_t** rg_output_header; +++ sam_hdr_t** rg_output_header; ++ kh_c2i_t* rg_hash; ++ htsThreadPool p; +++ int write_index; ++ 
}; ++ ++ typedef struct state state_t; ++@@ -75,14 +79,15 @@ ++ static void usage(FILE *write_to) ++ { ++ fprintf(write_to, ++-"Usage: samtools split [-u [:]]\n" +++"Usage: samtools split [-u ] [-h ]\n" ++ " [-f ] [-v] \n" ++ "Options:\n" ++ " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" ++ " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" ++-" -u FILE1:FILE2 ...and override the header with FILE2\n" ++-" -v verbose output\n"); ++- sam_global_opt_help(write_to, "-....@"); +++" -h FILE2 ... and override the header with FILE2 (-u file only)\n" +++" -v verbose output\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(write_to, "-....@.."); ++ fprintf(write_to, ++ "\n" ++ "Format string expansions:\n" ++@@ -99,11 +104,11 @@ ++ { ++ if (argc == 1) { usage(stdout); return NULL; } ++ ++- const char* optstring = "vf:u:@:"; ++- char* delim; +++ const char *optstring = "vf:h:u:@:"; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -116,20 +121,19 @@ ++ while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) { ++ switch (opt) { ++ case 'f': ++- retval->output_format_string = strdup(optarg); ++- if (! retval->output_format_string ) { perror("cannot allocate output format string memory"); return NULL; } +++ retval->output_format_string = optarg; +++ break; +++ case 'h': +++ retval->unaccounted_header_name = optarg; ++ break; ++ case 'v': ++ retval->verbose = true; ++ break; ++ case 'u': ++- retval->unaccounted_name = strdup(optarg); ++- if (! retval->unaccounted_name ) { perror("cannot allocate string memory"); return NULL; } ++- if ((delim = strchr(retval->unaccounted_name, ':')) != NULL) { ++- *delim = '\0'; ++- retval->unaccounted_header_name = strdup(delim+1); ++- if (! 
retval->unaccounted_header_name ) { perror("cannot allocate string memory"); return NULL; } ++- } +++ retval->unaccounted_name = optarg; +++ break; +++ case 1: +++ retval->no_pg = 1; ++ break; ++ default: ++ if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; ++@@ -141,7 +145,7 @@ ++ } ++ } ++ ++- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%."); +++ if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%."; ++ ++ argc -= optind; ++ argv += optind; ++@@ -153,8 +157,7 @@ ++ return NULL; ++ } ++ ++- retval->merged_input_name = strdup(argv[0]); ++- if (! retval->merged_input_name ) { perror("cannot allocate string memory"); return NULL; } +++ retval->merged_input_name = argv[0]; ++ ++ return retval; ++ } ++@@ -166,176 +169,110 @@ ++ const char* pointer = format_string; ++ const char* next; ++ while ((next = strchr(pointer, '%')) != NULL) { ++- kputsn(pointer, next-pointer, &str); +++ if (kputsn(pointer, next-pointer, &str) < 0) goto memfail; ++ ++next; ++ switch (*next) { ++ case '%': ++- kputc('%', &str); +++ if (kputc('%', &str) < 0) goto memfail; ++ break; ++ case '*': ++- kputs(basename, &str); +++ if (kputs(basename, &str) < 0) goto memfail; ++ break; ++ case '#': ++- kputl(rg_idx, &str); +++ if (kputl(rg_idx, &str) < 0) goto memfail; ++ break; ++ case '!': ++- kputs(rg_id, &str); +++ if (kputs(rg_id, &str) < 0) goto memfail; ++ break; ++ case '.': ++ // Only really need to cope with sam, bam, cram ++- if (format->format != unknown_format) ++- kputs(hts_format_file_extension(format), &str); ++- else ++- kputs("bam", &str); +++ if (format->format != unknown_format) { +++ if (kputs(hts_format_file_extension(format), &str) < 0) +++ goto memfail; +++ } else { +++ if (kputs("bam", &str) < 0) goto memfail; +++ } ++ break; ++ case '\0': ++- // Error is: fprintf(stderr, "bad format string, trailing %%\n"); ++- free(str.s); ++- return NULL; +++ print_error("split", "Trailing %% in 
filename format string"); +++ goto fail; ++ default: ++ // Error is: fprintf(stderr, "bad format string, unknown format specifier\n"); ++- free(str.s); ++- return NULL; +++ print_error("split", "Unknown specifier %%%c in filename format string", *next); +++ goto fail; ++ } ++ pointer = next + 1; ++ } ++- kputs(pointer, &str); +++ if (kputs(pointer, &str) < 0) goto memfail; ++ return ks_release(&str); +++ +++ memfail: +++ print_error_errno("split", "Couldn't build output filename"); +++ fail: +++ free(str.s); +++ return NULL; ++ } ++ ++ // Parse the header, count the number of RG tags and return a list of their names ++-static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) +++static bool count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name) ++ { ++- if (hdr->l_text < 3 ) { +++ char **names = NULL; +++ kstring_t id_val = KS_INITIALIZE; +++ int i, n_rg = sam_hdr_count_lines(hdr, "RG"); +++ +++ if (n_rg < 0) { +++ print_error("split", "Failed to get @RG IDs"); ++ *count = 0; ++ *output_name = NULL; ++- return true; +++ return false; ++ } ++- kstring_t input = { 0, 0, NULL }; ++- kputsn(hdr->text, hdr->l_text, &input); ++ ++- ////////////////////////////////////////// ++- // First stage count number of @RG tags // ++- ////////////////////////////////////////// ++- char* pointer = ks_str(&input); ++- size_t n_rg = 0; ++- // Guard against rare case where @RG is first header line ++- // This shouldn't happen but could where @HD is omitted ++- if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) { ++- ++n_rg; ++- pointer += 3; ++- } ++- char* line; ++- while ((line = strstr(pointer, "\n@RG")) != NULL) { ++- ++n_rg; ++- pointer = line + 1; ++- } ++- ++- ////////////////////////////////// ++- // Second stage locate @RG ID's // ++- ////////////////////////////////// ++- char** names = (char**)calloc(sizeof(char*), n_rg); ++- size_t next = 0; ++- ++- regex_t rg_finder; ++- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", 
REG_EXTENDED|REG_NEWLINE) != 0) { ++- free(input.s); ++- free(names); ++- return false; +++ if (n_rg == 0) { +++ *count = 0; +++ *output_name = NULL; +++ return true; ++ } ++- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); ++- int error; ++- char* begin = ks_str(&input); ++- ++- while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) { ++- kstring_t str = { 0, 0, NULL }; ++- kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str); ++- names[next++] = ks_release(&str); ++- begin += matches[0].rm_eo; ++- } ++- ++- if (error != REG_NOMATCH) { ++- // cleanup ++- regfree(&rg_finder); ++- free(matches); ++- free(names); ++- free(input.s); ++- return false; +++ +++ names = calloc(n_rg, sizeof(names[0])); +++ if (!names) goto memfail; +++ +++ for (i = 0; i < n_rg; i++) { +++ if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail; +++ names[i] = ks_release(&id_val); ++ } ++- free(matches); ++ ++- // return results ++ *count = n_rg; ++ *output_name = names; ++- regfree(&rg_finder); ++- free(input.s); ++ return true; +++ +++ memfail: +++ print_error_errno("split", "Failed to get @RG IDs"); +++ *count = 0; +++ *output_name = NULL; +++ ks_free(&id_val); +++ free(names); +++ return false; ++ } ++ ++-// Filters a header of @RG lines where ID != id_keep ++-// TODO: strip @PG's descended from other RGs and their descendants ++-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) +++static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) ++ { ++- kstring_t str = {0, 0, NULL}; ++- ++- regex_t rg_finder; ++- ++- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { ++- return false; +++ size_t n; +++ if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) { +++ print_error("split", +++ "Unaccounted header contains wrong number of references"); +++ return -1; ++ } ++- ++- // regex vars ++- char* header = hdr->text; ++- regmatch_t* matches = 
(regmatch_t*)calloc(sizeof(regmatch_t),2); ++- kstring_t found_id = { 0, 0, NULL }; ++- int error; ++- ++- while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) { ++- kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line ++- ++- found_id.l = 0; ++- kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID ++- // if it matches keep keep it, else we can just ignore it ++- if (strcmp(ks_str(&found_id), id_keep) == 0) { ++- kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str); ++- } ++- // move pointer forward ++- header += matches[0].rm_eo+1; ++- } ++- // cleanup ++- free(found_id.s); ++- free(matches); ++- regfree(&rg_finder); ++- // Did we leave loop because of an error? ++- if (error != REG_NOMATCH) { ++- return false; +++ for (n = 0; n < sam_hdr_nref(hdr1); n++) { +++ hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n); +++ hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n); +++ if (h1_len != h2_len) { +++ print_error("split", +++ "Unaccounted header reference %zu \"%s\" is not the same length as in the input file", +++ n + 1, sam_hdr_tid2name(hdr2, n)); +++ return -1; +++ } ++ } ++- ++- // Write remainder of string ++- kputs(header, &str); ++- ++- // Modify header ++- hdr->l_text = ks_len(&str); ++- free(hdr->text); ++- hdr->text = ks_release(&str); ++- ++- // Add the PG line ++- SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); ++- if (sam_hdr_add_PG(sh, "samtools", ++- "VN", samtools_version(), ++- arg_list ? "CL": NULL, ++- arg_list ? 
arg_list : NULL, ++- NULL) != 0) ++- return -1; ++- ++- free(hdr->text); ++- hdr->text = strdup(sam_hdr_str(sh)); ++- hdr->l_text = sam_hdr_length(sh); ++- if (!hdr->text) ++- return false; ++- sam_hdr_free(sh); ++- ++- return true; +++ return 0; ++ } ++ ++ // Set the initial state ++@@ -350,6 +287,7 @@ ++ if (opts->ga.nthreads > 0) { ++ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { ++ fprintf(stderr, "Error creating thread pool\n"); +++ cleanup_state(retval, false); ++ return NULL; ++ } ++ } ++@@ -357,7 +295,7 @@ ++ retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); ++ if (!retval->merged_input_file) { ++ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); ++- free(retval); +++ cleanup_state(retval, false); ++ return NULL; ++ } ++ if (retval->p.pool) ++@@ -381,11 +319,26 @@ ++ if (retval->unaccounted_header == NULL) { ++ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); ++ cleanup_state(retval, false); +++ sam_close(hdr_load); ++ return NULL; ++ } ++ sam_close(hdr_load); +++ if (header_compatible(retval->merged_input_header, +++ retval->unaccounted_header) != 0) { +++ cleanup_state(retval, false); +++ return NULL; +++ } ++ } else { ++- retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header); +++ retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header); +++ if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name); +++ cleanup_state(retval, false); +++ return NULL; +++ } ++ } ++ ++ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); ++@@ -401,12 +354,15 @@ ++ // Open output files for RGs ++ if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; ++ if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count); ++- ++- retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); ++- retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); ++- retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); +++ // Prevent calloc(0, size); +++ size_t num = retval->output_count ? retval->output_count : 1; +++ retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); +++ retval->rg_output_file_name = (char **)calloc(num, sizeof(char *)); +++ retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*)); +++ retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); ++ retval->rg_hash = kh_init_c2i(); ++- if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { +++ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || +++ !retval->rg_hash || !retval->rg_index_file_name) { ++ print_error_errno("split", "Could not initialise output file array"); ++ cleanup_state(retval, false); ++ return NULL; ++@@ -432,7 +388,6 @@ ++ &opts->ga.out); ++ ++ if ( output_filename == NULL ) { ++- print_error("split", "Error expanding output filename format string"); ++ cleanup_state(retval, false); ++ free(input_base_name); ++ return NULL; ++@@ -452,11 +407,23 @@ ++ // Record index in hash ++ int ret; ++ khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); +++ if (ret < 0) { +++ 
print_error_errno("split", "Couldn't add @RG ID to look-up table"); +++ cleanup_state(retval, false); +++ free(input_base_name); +++ return NULL; +++ } ++ kh_val(retval->rg_hash,iter) = i; ++ ++ // Set and edit header ++- retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); ++- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { +++ retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); +++ if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || +++ (!opts->no_pg && +++ sam_hdr_add_pg(retval->rg_output_header[i], "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL))) { ++ print_error("split", "Could not rewrite header for \"%s\"", output_filename); ++ cleanup_state(retval, false); ++ free(input_base_name); ++@@ -465,6 +432,7 @@ ++ } ++ ++ free(input_base_name); +++ retval->write_index = opts->ga.write_index; ++ ++ return retval; ++ } ++@@ -481,6 +449,15 @@ ++ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); ++ return false; ++ } +++ if (state->write_index) { +++ state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], +++ state->rg_output_file_name[i], +++ state->rg_output_header[i]); +++ if (!state->rg_index_file_name[i]) { +++ print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); +++ return false; +++ } +++ } ++ } ++ ++ bam1_t* file_read = bam_init1(); ++@@ -547,6 +524,16 @@ ++ } ++ } ++ +++ if (state->write_index) { +++ for (i = 0; i < state->output_count; i++) { +++ if (sam_idx_save(state->rg_output_file[i]) < 0) { +++ print_error_errno("split", "writing index failed"); +++ return false; +++ } +++ free(state->rg_index_file_name[i]); +++ } +++ } +++ ++ return true; ++ } ++ ++@@ -555,7 +542,7 @@ ++ int ret = 0; ++ ++ if (!status) return 0; ++- if (status->unaccounted_header) 
bam_hdr_destroy(status->unaccounted_header); +++ if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header); ++ if (status->unaccounted_file) { ++ if (sam_close(status->unaccounted_file) < 0 && check_close) { ++ print_error("split", "Error on closing unaccounted file"); ++@@ -566,7 +553,7 @@ ++ size_t i; ++ for (i = 0; i < status->output_count; i++) { ++ if (status->rg_output_header && status->rg_output_header[i]) ++- bam_hdr_destroy(status->rg_output_header[i]); +++ sam_hdr_destroy(status->rg_output_header[i]); ++ if (status->rg_output_file && status->rg_output_file[i]) { ++ if (sam_close(status->rg_output_file[i]) < 0 && check_close) { ++ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); ++@@ -577,16 +564,16 @@ ++ if (status->rg_output_file_name) free(status->rg_output_file_name[i]); ++ } ++ if (status->merged_input_header) ++- bam_hdr_destroy(status->merged_input_header); +++ sam_hdr_destroy(status->merged_input_header); ++ free(status->rg_output_header); ++ free(status->rg_output_file); ++ free(status->rg_output_file_name); +++ free(status->rg_index_file_name); ++ kh_destroy_c2i(status->rg_hash); ++ free(status->rg_id); ++- free(status); ++- ++ if (status->p.pool) ++ hts_tpool_destroy(status->p.pool); +++ free(status); ++ ++ return ret; ++ } ++@@ -594,10 +581,6 @@ ++ static void cleanup_opts(parsed_opts_t* opts) ++ { ++ if (!opts) return; ++- free(opts->merged_input_name); ++- free(opts->unaccounted_header_name); ++- free(opts->unaccounted_name); ++- free(opts->output_format_string); ++ sam_global_args_free(&opts->ga); ++ free(opts); ++ } ++@@ -605,9 +588,11 @@ ++ int main_split(int argc, char** argv) ++ { ++ int ret = 1; ++- char *arg_list = stringify_argv(argc+1, argv-1); +++ char *arg_list = NULL; ++ parsed_opts_t* opts = parse_args(argc, argv); ++ if (!opts) goto cleanup_opts; +++ if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) +++ goto cleanup_opts; ++ state_t* status = 
init(opts, arg_list); ++ if (!status) goto cleanup_opts; ++ ++--- python-pysam.orig/samtools/bam_split.c.pysam.c +++++ python-pysam/samtools/bam_split.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_split.c -- split subcommand. ++ ++- Copyright (C) 2013-2016 Genome Research Ltd. +++ Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. ++ ++ Author: Martin Pollard ++ ++@@ -26,7 +26,6 @@ ++ ++ #include ++ ++-#include ++ #include ++ #include ++ #include ++@@ -34,6 +33,8 @@ ++ #include ++ #include ++ #include +++#include +++#include ++ #include ++ #include ++ #include ++@@ -45,11 +46,12 @@ ++ KHASH_MAP_INIT_STR(c2i, int) ++ ++ struct parsed_opts { ++- char* merged_input_name; ++- char* unaccounted_header_name; ++- char* unaccounted_name; ++- char* output_format_string; +++ const char *merged_input_name; +++ const char *unaccounted_header_name; +++ const char *unaccounted_name; +++ const char *output_format_string; ++ bool verbose; +++ int no_pg; ++ sam_global_args ga; ++ }; ++ ++@@ -57,16 +59,18 @@ ++ ++ struct state { ++ samFile* merged_input_file; ++- bam_hdr_t* merged_input_header; +++ sam_hdr_t* merged_input_header; ++ samFile* unaccounted_file; ++- bam_hdr_t* unaccounted_header; +++ sam_hdr_t* unaccounted_header; ++ size_t output_count; ++ char** rg_id; +++ char **rg_index_file_name; ++ char **rg_output_file_name; ++ samFile** rg_output_file; ++- bam_hdr_t** rg_output_header; +++ sam_hdr_t** rg_output_header; ++ kh_c2i_t* rg_hash; ++ htsThreadPool p; +++ int write_index; ++ }; ++ ++ typedef struct state state_t; ++@@ -77,14 +81,15 @@ ++ static void usage(FILE *write_to) ++ { ++ fprintf(write_to, ++-"Usage: samtools split [-u [:]]\n" +++"Usage: samtools split [-u ] [-h ]\n" ++ " [-f ] [-v] \n" ++ "Options:\n" ++ " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" ++ " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" ++-" -u FILE1:FILE2 ...and override the header with FILE2\n" ++-" -v verbose output\n"); ++- sam_global_opt_help(write_to, 
"-....@"); +++" -h FILE2 ... and override the header with FILE2 (-u file only)\n" +++" -v verbose output\n" +++" --no-PG do not add a PG line\n"); +++ sam_global_opt_help(write_to, "-....@.."); ++ fprintf(write_to, ++ "\n" ++ "Format string expansions:\n" ++@@ -101,11 +106,11 @@ ++ { ++ if (argc == 1) { usage(samtools_stdout); return NULL; } ++ ++- const char* optstring = "vf:u:@:"; ++- char* delim; +++ const char *optstring = "vf:h:u:@:"; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -118,20 +123,19 @@ ++ while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) { ++ switch (opt) { ++ case 'f': ++- retval->output_format_string = strdup(optarg); ++- if (! retval->output_format_string ) { perror("cannot allocate output format string memory"); return NULL; } +++ retval->output_format_string = optarg; +++ break; +++ case 'h': +++ retval->unaccounted_header_name = optarg; ++ break; ++ case 'v': ++ retval->verbose = true; ++ break; ++ case 'u': ++- retval->unaccounted_name = strdup(optarg); ++- if (! retval->unaccounted_name ) { perror("cannot allocate string memory"); return NULL; } ++- if ((delim = strchr(retval->unaccounted_name, ':')) != NULL) { ++- *delim = '\0'; ++- retval->unaccounted_header_name = strdup(delim+1); ++- if (! 
retval->unaccounted_header_name ) { perror("cannot allocate string memory"); return NULL; } ++- } +++ retval->unaccounted_name = optarg; +++ break; +++ case 1: +++ retval->no_pg = 1; ++ break; ++ default: ++ if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; ++@@ -143,7 +147,7 @@ ++ } ++ } ++ ++- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%."); +++ if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%."; ++ ++ argc -= optind; ++ argv += optind; ++@@ -155,8 +159,7 @@ ++ return NULL; ++ } ++ ++- retval->merged_input_name = strdup(argv[0]); ++- if (! retval->merged_input_name ) { perror("cannot allocate string memory"); return NULL; } +++ retval->merged_input_name = argv[0]; ++ ++ return retval; ++ } ++@@ -168,176 +171,110 @@ ++ const char* pointer = format_string; ++ const char* next; ++ while ((next = strchr(pointer, '%')) != NULL) { ++- kputsn(pointer, next-pointer, &str); +++ if (kputsn(pointer, next-pointer, &str) < 0) goto memfail; ++ ++next; ++ switch (*next) { ++ case '%': ++- kputc('%', &str); +++ if (kputc('%', &str) < 0) goto memfail; ++ break; ++ case '*': ++- kputs(basename, &str); +++ if (kputs(basename, &str) < 0) goto memfail; ++ break; ++ case '#': ++- kputl(rg_idx, &str); +++ if (kputl(rg_idx, &str) < 0) goto memfail; ++ break; ++ case '!': ++- kputs(rg_id, &str); +++ if (kputs(rg_id, &str) < 0) goto memfail; ++ break; ++ case '.': ++ // Only really need to cope with sam, bam, cram ++- if (format->format != unknown_format) ++- kputs(hts_format_file_extension(format), &str); ++- else ++- kputs("bam", &str); +++ if (format->format != unknown_format) { +++ if (kputs(hts_format_file_extension(format), &str) < 0) +++ goto memfail; +++ } else { +++ if (kputs("bam", &str) < 0) goto memfail; +++ } ++ break; ++ case '\0': ++- // Error is: fprintf(samtools_stderr, "bad format string, trailing %%\n"); ++- free(str.s); ++- return NULL; +++ print_error("split", "Trailing 
%% in filename format string"); +++ goto fail; ++ default: ++ // Error is: fprintf(samtools_stderr, "bad format string, unknown format specifier\n"); ++- free(str.s); ++- return NULL; +++ print_error("split", "Unknown specifier %%%c in filename format string", *next); +++ goto fail; ++ } ++ pointer = next + 1; ++ } ++- kputs(pointer, &str); +++ if (kputs(pointer, &str) < 0) goto memfail; ++ return ks_release(&str); +++ +++ memfail: +++ print_error_errno("split", "Couldn't build output filename"); +++ fail: +++ free(str.s); +++ return NULL; ++ } ++ ++ // Parse the header, count the number of RG tags and return a list of their names ++-static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) +++static bool count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name) ++ { ++- if (hdr->l_text < 3 ) { +++ char **names = NULL; +++ kstring_t id_val = KS_INITIALIZE; +++ int i, n_rg = sam_hdr_count_lines(hdr, "RG"); +++ +++ if (n_rg < 0) { +++ print_error("split", "Failed to get @RG IDs"); ++ *count = 0; ++ *output_name = NULL; ++- return true; +++ return false; ++ } ++- kstring_t input = { 0, 0, NULL }; ++- kputsn(hdr->text, hdr->l_text, &input); ++ ++- ////////////////////////////////////////// ++- // First stage count number of @RG tags // ++- ////////////////////////////////////////// ++- char* pointer = ks_str(&input); ++- size_t n_rg = 0; ++- // Guard against rare case where @RG is first header line ++- // This shouldn't happen but could where @HD is omitted ++- if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) { ++- ++n_rg; ++- pointer += 3; ++- } ++- char* line; ++- while ((line = strstr(pointer, "\n@RG")) != NULL) { ++- ++n_rg; ++- pointer = line + 1; ++- } ++- ++- ////////////////////////////////// ++- // Second stage locate @RG ID's // ++- ////////////////////////////////// ++- char** names = (char**)calloc(sizeof(char*), n_rg); ++- size_t next = 0; ++- ++- regex_t rg_finder; ++- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ 
!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { ++- free(input.s); ++- free(names); ++- return false; +++ if (n_rg == 0) { +++ *count = 0; +++ *output_name = NULL; +++ return true; ++ } ++- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); ++- int error; ++- char* begin = ks_str(&input); ++- ++- while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) { ++- kstring_t str = { 0, 0, NULL }; ++- kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str); ++- names[next++] = ks_release(&str); ++- begin += matches[0].rm_eo; ++- } ++- ++- if (error != REG_NOMATCH) { ++- // cleanup ++- regfree(&rg_finder); ++- free(matches); ++- free(names); ++- free(input.s); ++- return false; +++ +++ names = calloc(n_rg, sizeof(names[0])); +++ if (!names) goto memfail; +++ +++ for (i = 0; i < n_rg; i++) { +++ if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail; +++ names[i] = ks_release(&id_val); ++ } ++- free(matches); ++ ++- // return results ++ *count = n_rg; ++ *output_name = names; ++- regfree(&rg_finder); ++- free(input.s); ++ return true; +++ +++ memfail: +++ print_error_errno("split", "Failed to get @RG IDs"); +++ *count = 0; +++ *output_name = NULL; +++ ks_free(&id_val); +++ free(names); +++ return false; ++ } ++ ++-// Filters a header of @RG lines where ID != id_keep ++-// TODO: strip @PG's descended from other RGs and their descendants ++-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) +++static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) ++ { ++- kstring_t str = {0, 0, NULL}; ++- ++- regex_t rg_finder; ++- ++- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { ++- return false; +++ size_t n; +++ if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) { +++ print_error("split", +++ "Unaccounted header contains wrong number of references"); +++ return -1; ++ } ++- ++- // regex vars ++- char* header = hdr->text; ++- regmatch_t* 
matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); ++- kstring_t found_id = { 0, 0, NULL }; ++- int error; ++- ++- while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) { ++- kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line ++- ++- found_id.l = 0; ++- kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID ++- // if it matches keep keep it, else we can just ignore it ++- if (strcmp(ks_str(&found_id), id_keep) == 0) { ++- kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str); ++- } ++- // move pointer forward ++- header += matches[0].rm_eo+1; ++- } ++- // cleanup ++- free(found_id.s); ++- free(matches); ++- regfree(&rg_finder); ++- // Did we leave loop because of an error? ++- if (error != REG_NOMATCH) { ++- return false; +++ for (n = 0; n < sam_hdr_nref(hdr1); n++) { +++ hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n); +++ hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n); +++ if (h1_len != h2_len) { +++ print_error("split", +++ "Unaccounted header reference %zu \"%s\" is not the same length as in the input file", +++ n + 1, sam_hdr_tid2name(hdr2, n)); +++ return -1; +++ } ++ } ++- ++- // Write remainder of string ++- kputs(header, &str); ++- ++- // Modify header ++- hdr->l_text = ks_len(&str); ++- free(hdr->text); ++- hdr->text = ks_release(&str); ++- ++- // Add the PG line ++- SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); ++- if (sam_hdr_add_PG(sh, "samtools", ++- "VN", samtools_version(), ++- arg_list ? "CL": NULL, ++- arg_list ? 
arg_list : NULL, ++- NULL) != 0) ++- return -1; ++- ++- free(hdr->text); ++- hdr->text = strdup(sam_hdr_str(sh)); ++- hdr->l_text = sam_hdr_length(sh); ++- if (!hdr->text) ++- return false; ++- sam_hdr_free(sh); ++- ++- return true; +++ return 0; ++ } ++ ++ // Set the initial state ++@@ -352,6 +289,7 @@ ++ if (opts->ga.nthreads > 0) { ++ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { ++ fprintf(samtools_stderr, "Error creating thread pool\n"); +++ cleanup_state(retval, false); ++ return NULL; ++ } ++ } ++@@ -359,7 +297,7 @@ ++ retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); ++ if (!retval->merged_input_file) { ++ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); ++- free(retval); +++ cleanup_state(retval, false); ++ return NULL; ++ } ++ if (retval->p.pool) ++@@ -383,11 +321,26 @@ ++ if (retval->unaccounted_header == NULL) { ++ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); ++ cleanup_state(retval, false); +++ sam_close(hdr_load); ++ return NULL; ++ } ++ sam_close(hdr_load); +++ if (header_compatible(retval->merged_input_header, +++ retval->unaccounted_header) != 0) { +++ cleanup_state(retval, false); +++ return NULL; +++ } ++ } else { ++- retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header); +++ retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header); +++ if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name); +++ cleanup_state(retval, false); +++ return NULL; +++ } ++ } ++ ++ retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); ++@@ -403,12 +356,15 @@ ++ // Open output files for RGs ++ if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; ++ if (opts->verbose) fprintf(samtools_stderr, "@RG's found %zu\n",retval->output_count); ++- ++- retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); ++- retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); ++- retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); +++ // Prevent calloc(0, size); +++ size_t num = retval->output_count ? retval->output_count : 1; +++ retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); +++ retval->rg_output_file_name = (char **)calloc(num, sizeof(char *)); +++ retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*)); +++ retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); ++ retval->rg_hash = kh_init_c2i(); ++- if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { +++ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || +++ !retval->rg_hash || !retval->rg_index_file_name) { ++ print_error_errno("split", "Could not initialise output file array"); ++ cleanup_state(retval, false); ++ return NULL; ++@@ -434,7 +390,6 @@ ++ &opts->ga.out); ++ ++ if ( output_filename == NULL ) { ++- print_error("split", "Error expanding output filename format string"); ++ cleanup_state(retval, false); ++ free(input_base_name); ++ return NULL; ++@@ -454,11 +409,23 @@ ++ // Record index in hash ++ int ret; ++ khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); +++ if (ret < 0) { +++ 
print_error_errno("split", "Couldn't add @RG ID to look-up table"); +++ cleanup_state(retval, false); +++ free(input_base_name); +++ return NULL; +++ } ++ kh_val(retval->rg_hash,iter) = i; ++ ++ // Set and edit header ++- retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); ++- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { +++ retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); +++ if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || +++ (!opts->no_pg && +++ sam_hdr_add_pg(retval->rg_output_header[i], "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL))) { ++ print_error("split", "Could not rewrite header for \"%s\"", output_filename); ++ cleanup_state(retval, false); ++ free(input_base_name); ++@@ -467,6 +434,7 @@ ++ } ++ ++ free(input_base_name); +++ retval->write_index = opts->ga.write_index; ++ ++ return retval; ++ } ++@@ -483,6 +451,15 @@ ++ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); ++ return false; ++ } +++ if (state->write_index) { +++ state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], +++ state->rg_output_file_name[i], +++ state->rg_output_header[i]); +++ if (!state->rg_index_file_name[i]) { +++ print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); +++ return false; +++ } +++ } ++ } ++ ++ bam1_t* file_read = bam_init1(); ++@@ -549,6 +526,16 @@ ++ } ++ } ++ +++ if (state->write_index) { +++ for (i = 0; i < state->output_count; i++) { +++ if (sam_idx_save(state->rg_output_file[i]) < 0) { +++ print_error_errno("split", "writing index failed"); +++ return false; +++ } +++ free(state->rg_index_file_name[i]); +++ } +++ } +++ ++ return true; ++ } ++ ++@@ -557,7 +544,7 @@ ++ int ret = 0; ++ ++ if (!status) return 0; ++- if (status->unaccounted_header) 
bam_hdr_destroy(status->unaccounted_header); +++ if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header); ++ if (status->unaccounted_file) { ++ if (sam_close(status->unaccounted_file) < 0 && check_close) { ++ print_error("split", "Error on closing unaccounted file"); ++@@ -568,7 +555,7 @@ ++ size_t i; ++ for (i = 0; i < status->output_count; i++) { ++ if (status->rg_output_header && status->rg_output_header[i]) ++- bam_hdr_destroy(status->rg_output_header[i]); +++ sam_hdr_destroy(status->rg_output_header[i]); ++ if (status->rg_output_file && status->rg_output_file[i]) { ++ if (sam_close(status->rg_output_file[i]) < 0 && check_close) { ++ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); ++@@ -579,16 +566,16 @@ ++ if (status->rg_output_file_name) free(status->rg_output_file_name[i]); ++ } ++ if (status->merged_input_header) ++- bam_hdr_destroy(status->merged_input_header); +++ sam_hdr_destroy(status->merged_input_header); ++ free(status->rg_output_header); ++ free(status->rg_output_file); ++ free(status->rg_output_file_name); +++ free(status->rg_index_file_name); ++ kh_destroy_c2i(status->rg_hash); ++ free(status->rg_id); ++- free(status); ++- ++ if (status->p.pool) ++ hts_tpool_destroy(status->p.pool); +++ free(status); ++ ++ return ret; ++ } ++@@ -596,10 +583,6 @@ ++ static void cleanup_opts(parsed_opts_t* opts) ++ { ++ if (!opts) return; ++- free(opts->merged_input_name); ++- free(opts->unaccounted_header_name); ++- free(opts->unaccounted_name); ++- free(opts->output_format_string); ++ sam_global_args_free(&opts->ga); ++ free(opts); ++ } ++@@ -607,9 +590,11 @@ ++ int main_split(int argc, char** argv) ++ { ++ int ret = 1; ++- char *arg_list = stringify_argv(argc+1, argv-1); +++ char *arg_list = NULL; ++ parsed_opts_t* opts = parse_args(argc, argv); ++ if (!opts) goto cleanup_opts; +++ if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) +++ goto cleanup_opts; ++ state_t* status = 
init(opts, arg_list); ++ if (!status) goto cleanup_opts; ++ ++--- python-pysam.orig/samtools/bam_stat.c +++++ python-pysam/samtools/bam_stat.c ++@@ -1,6 +1,6 @@ ++ /* bam_stat.c -- flagstat subcommand. ++ ++- Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd. +++ Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -69,7 +69,7 @@ ++ if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ ++ } while (0) ++ ++-bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) +++bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) ++ { ++ bam_flagstat_t *s; ++ bam1_t *b; ++@@ -93,19 +93,155 @@ ++ return buffer; ++ } ++ +++static const char *percent_json(char *buffer, long long n, long long total) +++{ +++ if (total != 0) sprintf(buffer, "%.2f", (float)n / total * 100.0); +++ else strcpy(buffer, "null"); +++ return buffer; +++} +++ ++ static void usage_exit(FILE *fp, int exit_status) ++ { ++ fprintf(fp, "Usage: samtools flagstat [options] \n"); ++- sam_global_opt_help(fp, "-.---@"); +++ sam_global_opt_help(fp, "-.---@-."); +++ fprintf(fp, " -O, --"); +++ fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n" +++ " Specify output format (json, tsv)\n"); ++ exit(exit_status); ++ } ++ +++static void out_fmt_default(bam_flagstat_t *s) +++{ +++ char b0[16], b1[16]; +++ printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); +++ printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); +++ printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); +++ printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); +++ printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); +++ printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); +++ printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); +++ printf("%lld + %lld read2\n", s->n_read2[0], 
s->n_read2[1]); +++ printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); +++ printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); +++ printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); +++ printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); +++ printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++} +++ +++static void out_fmt_json(bam_flagstat_t *s) { +++ char b0[16], b1[16]; +++ printf("{\n \"QC-passed reads\": { \n" +++ " \"total\": %lld, \n" +++ " \"secondary\": %lld, \n" +++ " \"supplementary\": %lld, \n" +++ " \"duplicates\": %lld, \n" +++ " \"mapped\": %lld, \n" +++ " \"mapped %%\": %s, \n" +++ " \"paired in sequencing\": %lld, \n" +++ " \"read1\": %lld, \n" +++ " \"read2\": %lld, \n" +++ " \"properly paired\": %lld, \n" +++ " \"properly paired %%\": %s, \n" +++ " \"with itself and mate mapped\": %lld, \n" +++ " \"singletons\": %lld, \n" +++ " \"singletons %%\": %s, \n" +++ " \"with mate mapped to a different chr\": %lld, \n" +++ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" +++ " }," +++ "\n \"QC-failed reads\": { \n" +++ " \"total\": %lld, \n" +++ " \"secondary\": %lld, \n" +++ " \"supplementary\": %lld, \n" +++ " \"duplicates\": %lld, \n" +++ " \"mapped\": %lld, \n" +++ " \"mapped %%\": %s, \n" +++ " \"paired in sequencing\": %lld, \n" +++ " \"read1\": %lld, \n" +++ " \"read2\": %lld, \n" +++ " \"properly paired\": %lld, \n" +++ " \"properly paired %%\": %s, \n" +++ " \"with itself and mate mapped\": %lld, \n" +++ " \"singletons\": %lld, \n" +++ " \"singletons %%\": %s, \n" +++ " \"with mate mapped to a different chr\": %lld, \n" +++ " \"with mate mapped to 
a different chr (mapQ >= 5)\": %lld \n" +++ " }\n" +++ "}\n", +++ s->n_reads[0], +++ s->n_secondary[0], +++ s->n_supp[0], +++ s->n_dup[0], +++ s->n_mapped[0], +++ percent_json(b0, s->n_mapped[0], s->n_reads[0]), +++ s->n_pair_all[0], +++ s->n_read1[0], +++ s->n_read2[0], +++ s->n_pair_good[0], +++ percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), +++ s->n_pair_map[0], +++ s->n_sgltn[0], +++ percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), +++ s->n_diffchr[0], +++ s->n_diffhigh[0], +++ s->n_reads[1], +++ s->n_secondary[1], +++ s->n_supp[1], +++ s->n_dup[1], +++ s->n_mapped[1], +++ percent_json(b1, s->n_mapped[1], s->n_reads[1]), +++ s->n_pair_all[1], +++ s->n_read1[1], +++ s->n_read2[1], +++ s->n_pair_good[1], +++ percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), +++ s->n_pair_map[1], +++ s->n_sgltn[1], +++ percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), +++ s->n_diffchr[1], +++ s->n_diffhigh[1] +++ ); +++} +++ +++static void out_fmt_tsv(bam_flagstat_t *s) { +++ char b0[16], b1[16]; +++ printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); +++ printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); +++ printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); +++ printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); +++ printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); +++ printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); +++ printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); +++ printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); +++ printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); +++ printf("%lld\t%lld\tproperly paired\n", s->n_pair_good[0], s->n_pair_good[1]); +++ printf("%s\t%s\tproperly paired %%\n", percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); +++ printf("%lld\t%lld\twith itself 
and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); +++ printf("%lld\t%lld\tsingletons\n", s->n_sgltn[0], s->n_sgltn[1]); +++ printf("%s\t%s\tsingletons %%\n", percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); +++ printf("%lld\t%lld\twith mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); +++ printf("%lld\t%lld\twith mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++} +++ +++/* +++ * Select flagstats output format to print. +++ */ +++static void output_fmt(bam_flagstat_t *s, const char *out_fmt) +++{ +++ if (strcmp(out_fmt, "json") == 0 || strcmp(out_fmt, "JSON") == 0) { +++ out_fmt_json(s); +++ } else if (strcmp(out_fmt, "tsv") == 0 || strcmp(out_fmt, "TSV") == 0) { +++ out_fmt_tsv(s); +++ } else { +++ out_fmt_default(s); +++ } +++} +++ ++ int bam_flagstat(int argc, char *argv[]) ++ { ++ samFile *fp; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ bam_flagstat_t *s; ++- char b0[16], b1[16]; +++ const char *out_fmt = "default"; ++ int c; ++ ++ enum { ++@@ -114,12 +250,15 @@ ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "@:O:", lopts, NULL)) >= 0) { ++ switch (c) { +++ case 'O': +++ out_fmt = optarg; +++ break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': ++@@ -155,22 +294,11 @@ ++ fprintf(stderr, "Failed to read header for \"%s\"\n", argv[optind]); ++ return 1; ++ } +++ ++ s = bam_flagstat_core(fp, header); ++- printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); ++- printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); ++- printf("%lld + 
%lld supplementary\n", s->n_supp[0], s->n_supp[1]); ++- printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); ++- printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); ++- printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); ++- printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); ++- printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); ++- printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); ++- printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); ++- printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); ++- printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); ++- printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++ output_fmt(s, out_fmt); ++ free(s); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(fp); ++ sam_global_args_free(&ga); ++ return 0; ++--- python-pysam.orig/samtools/bam_stat.c.pysam.c +++++ python-pysam/samtools/bam_stat.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bam_stat.c -- flagstat subcommand. ++ ++- Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd. +++ Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -71,7 +71,7 @@ ++ if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ ++ } while (0) ++ ++-bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) +++bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) ++ { ++ bam_flagstat_t *s; ++ bam1_t *b; ++@@ -95,19 +95,155 @@ ++ return buffer; ++ } ++ +++static const char *percent_json(char *buffer, long long n, long long total) +++{ +++ if (total != 0) sprintf(buffer, "%.2f", (float)n / total * 100.0); +++ else strcpy(buffer, "null"); +++ return buffer; +++} +++ ++ static void usage_exit(FILE *fp, int exit_status) ++ { ++ fprintf(fp, "Usage: samtools flagstat [options] \n"); ++- sam_global_opt_help(fp, "-.---@"); +++ sam_global_opt_help(fp, "-.---@-."); +++ fprintf(fp, " -O, --"); +++ fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n" +++ " Specify output format (json, tsv)\n"); ++ exit(exit_status); ++ } ++ +++static void out_fmt_default(bam_flagstat_t *s) +++{ +++ char b0[16], b1[16]; +++ fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); +++ fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); +++ fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); +++ fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); +++ fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); +++ fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); +++ fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); +++ fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); +++ fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, 
s->n_pair_good[1], s->n_pair_all[1])); +++ fprintf(samtools_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); +++ fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); +++ fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); +++ fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++} +++ +++static void out_fmt_json(bam_flagstat_t *s) { +++ char b0[16], b1[16]; +++ fprintf(samtools_stdout, "{\n \"QC-passed reads\": { \n" +++ " \"total\": %lld, \n" +++ " \"secondary\": %lld, \n" +++ " \"supplementary\": %lld, \n" +++ " \"duplicates\": %lld, \n" +++ " \"mapped\": %lld, \n" +++ " \"mapped %%\": %s, \n" +++ " \"paired in sequencing\": %lld, \n" +++ " \"read1\": %lld, \n" +++ " \"read2\": %lld, \n" +++ " \"properly paired\": %lld, \n" +++ " \"properly paired %%\": %s, \n" +++ " \"with itself and mate mapped\": %lld, \n" +++ " \"singletons\": %lld, \n" +++ " \"singletons %%\": %s, \n" +++ " \"with mate mapped to a different chr\": %lld, \n" +++ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" +++ " }," +++ "\n \"QC-failed reads\": { \n" +++ " \"total\": %lld, \n" +++ " \"secondary\": %lld, \n" +++ " \"supplementary\": %lld, \n" +++ " \"duplicates\": %lld, \n" +++ " \"mapped\": %lld, \n" +++ " \"mapped %%\": %s, \n" +++ " \"paired in sequencing\": %lld, \n" +++ " \"read1\": %lld, \n" +++ " \"read2\": %lld, \n" +++ " \"properly paired\": %lld, \n" +++ " \"properly paired %%\": %s, \n" +++ " \"with itself and mate mapped\": %lld, \n" +++ " \"singletons\": %lld, \n" +++ " \"singletons %%\": %s, \n" +++ " \"with mate mapped to a different chr\": %lld, \n" +++ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" +++ " }\n" +++ "}\n", +++ s->n_reads[0], 
+++ s->n_secondary[0], +++ s->n_supp[0], +++ s->n_dup[0], +++ s->n_mapped[0], +++ percent_json(b0, s->n_mapped[0], s->n_reads[0]), +++ s->n_pair_all[0], +++ s->n_read1[0], +++ s->n_read2[0], +++ s->n_pair_good[0], +++ percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), +++ s->n_pair_map[0], +++ s->n_sgltn[0], +++ percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), +++ s->n_diffchr[0], +++ s->n_diffhigh[0], +++ s->n_reads[1], +++ s->n_secondary[1], +++ s->n_supp[1], +++ s->n_dup[1], +++ s->n_mapped[1], +++ percent_json(b1, s->n_mapped[1], s->n_reads[1]), +++ s->n_pair_all[1], +++ s->n_read1[1], +++ s->n_read2[1], +++ s->n_pair_good[1], +++ percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), +++ s->n_pair_map[1], +++ s->n_sgltn[1], +++ percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), +++ s->n_diffchr[1], +++ s->n_diffhigh[1] +++ ); +++} +++ +++static void out_fmt_tsv(bam_flagstat_t *s) { +++ char b0[16], b1[16]; +++ fprintf(samtools_stdout, "%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); +++ fprintf(samtools_stdout, "%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); +++ fprintf(samtools_stdout, "%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tproperly paired\n", s->n_pair_good[0], s->n_pair_good[1]); +++ fprintf(samtools_stdout, "%s\t%s\tproperly paired %%\n", percent(b0, 
s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); +++ fprintf(samtools_stdout, "%lld\t%lld\twith itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\tsingletons\n", s->n_sgltn[0], s->n_sgltn[1]); +++ fprintf(samtools_stdout, "%s\t%s\tsingletons %%\n", percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); +++ fprintf(samtools_stdout, "%lld\t%lld\twith mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); +++ fprintf(samtools_stdout, "%lld\t%lld\twith mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++} +++ +++/* +++ * Select flagstats output format to print. +++ */ +++static void output_fmt(bam_flagstat_t *s, const char *out_fmt) +++{ +++ if (strcmp(out_fmt, "json") == 0 || strcmp(out_fmt, "JSON") == 0) { +++ out_fmt_json(s); +++ } else if (strcmp(out_fmt, "tsv") == 0 || strcmp(out_fmt, "TSV") == 0) { +++ out_fmt_tsv(s); +++ } else { +++ out_fmt_default(s); +++ } +++} +++ ++ int bam_flagstat(int argc, char *argv[]) ++ { ++ samFile *fp; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ bam_flagstat_t *s; ++- char b0[16], b1[16]; +++ const char *out_fmt = "default"; ++ int c; ++ ++ enum { ++@@ -116,12 +252,15 @@ ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), ++ {NULL, 0, NULL, 0} ++ }; ++ ++- while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "@:O:", lopts, NULL)) >= 0) { ++ switch (c) { +++ case 'O': +++ out_fmt = optarg; +++ break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': ++@@ -157,22 +296,11 @@ ++ fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); ++ return 1; ++ } +++ ++ s = 
bam_flagstat_core(fp, header); ++- fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); ++- fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); ++- fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); ++- fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); ++- fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); ++- fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); ++- fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); ++- fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); ++- fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); ++- fprintf(samtools_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); ++- fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); ++- fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); ++- fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); +++ output_fmt(s, out_fmt); ++ free(s); ++- bam_hdr_destroy(header); +++ sam_hdr_destroy(header); ++ sam_close(fp); ++ sam_global_args_free(&ga); ++ return 0; ++--- python-pysam.orig/samtools/bamshuf.c +++++ python-pysam/samtools/bamshuf.c ++@@ -1,7 +1,7 @@ ++ /* bamshuf.c -- collate subcommand. ++ ++ Copyright (C) 2012 Broad Institute. 
++- Copyright (C) 2013, 2015, 2018 Genome Research Ltd. +++ Copyright (C) 2013, 2015-2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -164,7 +164,7 @@ ++ } ++ ++ ++-static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, bam_hdr_t *header, int files) { +++static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, sam_hdr_t *header, int files) { ++ uint32_t x; ++ ++ x = hash_X31_Wang(bam_get_qname(bam)) % files; ++@@ -181,13 +181,13 @@ ++ ++ ++ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, ++- int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga) +++ int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) ++ { ++ samFile *fp, *fpw = NULL, **fpt = NULL; ++ char **fnt = NULL, modew[8]; ++ bam1_t *b = NULL; ++ int i, counter, l, r; ++- bam_hdr_t *h = NULL; +++ sam_hdr_t *h = NULL; ++ int64_t j, max_cnt = 0, *cnt = NULL; ++ elem_t *a = NULL; ++ htsThreadPool p = {NULL, 0}; ++@@ -214,14 +214,10 @@ ++ goto fail; ++ } ++ ++- if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { ++- print_error("collate", ++- "failed to change sort order header to 'unsorted'\n"); ++- goto fail; ++- } ++- if (sam_hdr_change_HD(h, "GO", "query") != 0) { ++- print_error("collate", ++- "failed to change group order header to 'query'\n"); +++ if ((-1 == sam_hdr_update_hd(h, "SO", "unsorted", "GO", "query")) +++ && (-1 == sam_hdr_add_line(h, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL)) +++ ) { +++ print_error("collate", "failed to update HD line\n"); ++ goto fail; ++ } ++ ++@@ -254,6 +250,15 @@ ++ } ++ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); ++ +++ if (!no_pg && sam_hdr_add_pg(h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error("collate", "failed to add PG line to header of \"%s\"", output_file); +++ goto fail; +++ } +++ ++ if (sam_hdr_write(fpw, h) < 0) { ++ print_error_errno("collate", "Couldn't write header"); ++ goto fail; ++@@ -459,7 +464,7 @@ ++ goto fail; ++ } ++ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); ++- bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header +++ sam_hdr_destroy(sam_hdr_read(fp)); // Skip over header ++ ++ // Slurp in one of the split files ++ for (j = 0; j < c; ++j) { ++@@ -485,7 +490,7 @@ ++ } ++ } ++ ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); ++ free(a); free(fnt); free(cnt); ++ sam_global_args_free(ga); ++@@ -503,7 +508,7 @@ ++ fail: ++ if (fp) sam_close(fp); ++ if (fpw) sam_close(fpw); ++- if (h) bam_hdr_destroy(h); +++ if (h) sam_hdr_destroy(h); ++ for (i = 0; i < n_files; ++i) { ++ if (fnt) free(fnt[i]); ++ if (fpt && fpt[i]) sam_close(fpt[i]); ++@@ -530,10 +535,11 @@ ++ " -f fast (only primary alignments)\n" ++ " -r working reads stored (with -f) [%d]\n" // reads_store ++ " -l INT compression level [%d]\n" // DEF_CLEVEL ++- " -n INT number of temporary files [%d]\n", // n_files +++ " -n INT number of temporary files [%d]\n" // n_files +++ " --no-PG do not add a PG line\n", ++ reads_store, DEF_CLEVEL, n_files); ++ ++- sam_global_opt_help(fp, "-....@"); +++ sam_global_opt_help(fp, "-....@-."); ++ fprintf(fp, ++ " is required unless the -o or -O options are used.\n"); ++ ++@@ -574,12 +580,13 @@ ++ ++ int main_bamshuf(int argc, char *argv[]) ++ { ++- int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0; +++ int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; ++ const char *output_file = NULL; ++- char *prefix = NULL; +++ char *prefix = NULL, *arg_list = NULL; ++ sam_global_args ga = 
SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -592,6 +599,7 @@ ++ case 'o': output_file = optarg; break; ++ case 'f': fast_coll = 1; break; ++ case 'r': reads_store = atoi(optarg); break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return usage(stderr, n_files, reads_store); ++@@ -612,10 +620,16 @@ ++ ++ if (!prefix) return EXIT_FAILURE; ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("collate", "failed to create arg_list"); +++ return 1; +++ } +++ ++ ret = bamshuf(argv[optind], n_files, prefix, clevel, is_stdout, ++- output_file, fast_coll, reads_store, &ga); +++ output_file, fast_coll, reads_store, &ga, arg_list, no_pg); ++ ++ if (pre_mem) free(prefix); +++ free(arg_list); ++ ++ return ret; ++ } ++--- python-pysam.orig/samtools/bamshuf.c.pysam.c +++++ python-pysam/samtools/bamshuf.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bamshuf.c -- collate subcommand. ++ ++ Copyright (C) 2012 Broad Institute. ++- Copyright (C) 2013, 2015, 2018 Genome Research Ltd. +++ Copyright (C) 2013, 2015-2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -166,7 +166,7 @@ ++ } ++ ++ ++-static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, bam_hdr_t *header, int files) { +++static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, sam_hdr_t *header, int files) { ++ uint32_t x; ++ ++ x = hash_X31_Wang(bam_get_qname(bam)) % files; ++@@ -183,13 +183,13 @@ ++ ++ ++ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, ++- int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga) +++ int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) ++ { ++ samFile *fp, *fpw = NULL, **fpt = NULL; ++ char **fnt = NULL, modew[8]; ++ bam1_t *b = NULL; ++ int i, counter, l, r; ++- bam_hdr_t *h = NULL; +++ sam_hdr_t *h = NULL; ++ int64_t j, max_cnt = 0, *cnt = NULL; ++ elem_t *a = NULL; ++ htsThreadPool p = {NULL, 0}; ++@@ -216,14 +216,10 @@ ++ goto fail; ++ } ++ ++- if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { ++- print_error("collate", ++- "failed to change sort order header to 'unsorted'\n"); ++- goto fail; ++- } ++- if (sam_hdr_change_HD(h, "GO", "query") != 0) { ++- print_error("collate", ++- "failed to change group order header to 'query'\n"); +++ if ((-1 == sam_hdr_update_hd(h, "SO", "unsorted", "GO", "query")) +++ && (-1 == sam_hdr_add_line(h, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL)) +++ ) { +++ print_error("collate", "failed to update HD line\n"); ++ goto fail; ++ } ++ ++@@ -256,6 +252,15 @@ ++ } ++ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); ++ +++ if (!no_pg && sam_hdr_add_pg(h, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error("collate", "failed to add PG line to header of \"%s\"", output_file); +++ goto fail; +++ } +++ ++ if (sam_hdr_write(fpw, h) < 0) { ++ print_error_errno("collate", "Couldn't write header"); ++ goto fail; ++@@ -461,7 +466,7 @@ ++ goto fail; ++ } ++ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); ++- bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header +++ sam_hdr_destroy(sam_hdr_read(fp)); // Skip over header ++ ++ // Slurp in one of the split files ++ for (j = 0; j < c; ++j) { ++@@ -487,7 +492,7 @@ ++ } ++ } ++ ++- bam_hdr_destroy(h); +++ sam_hdr_destroy(h); ++ for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); ++ free(a); free(fnt); free(cnt); ++ sam_global_args_free(ga); ++@@ -505,7 +510,7 @@ ++ fail: ++ if (fp) sam_close(fp); ++ if (fpw) sam_close(fpw); ++- if (h) bam_hdr_destroy(h); +++ if (h) sam_hdr_destroy(h); ++ for (i = 0; i < n_files; ++i) { ++ if (fnt) free(fnt[i]); ++ if (fpt && fpt[i]) sam_close(fpt[i]); ++@@ -532,10 +537,11 @@ ++ " -f fast (only primary alignments)\n" ++ " -r working reads stored (with -f) [%d]\n" // reads_store ++ " -l INT compression level [%d]\n" // DEF_CLEVEL ++- " -n INT number of temporary files [%d]\n", // n_files +++ " -n INT number of temporary files [%d]\n" // n_files +++ " --no-PG do not add a PG line\n", ++ reads_store, DEF_CLEVEL, n_files); ++ ++- sam_global_opt_help(fp, "-....@"); +++ sam_global_opt_help(fp, "-....@-."); ++ fprintf(fp, ++ " is required unless the -o or -O options are used.\n"); ++ ++@@ -576,12 +582,13 @@ ++ ++ int main_bamshuf(int argc, char *argv[]) ++ { ++- int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0; +++ int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; ++ const char *output_file = NULL; ++- char *prefix = NULL; +++ char *prefix = NULL, *arg_list = NULL; ++ sam_global_args 
ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -594,6 +601,7 @@ ++ case 'o': output_file = optarg; break; ++ case 'f': fast_coll = 1; break; ++ case 'r': reads_store = atoi(optarg); break; +++ case 1: no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': return usage(samtools_stderr, n_files, reads_store); ++@@ -614,10 +622,16 @@ ++ ++ if (!prefix) return EXIT_FAILURE; ++ +++ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("collate", "failed to create arg_list"); +++ return 1; +++ } +++ ++ ret = bamshuf(argv[optind], n_files, prefix, clevel, is_samtools_stdout, ++- output_file, fast_coll, reads_store, &ga); +++ output_file, fast_coll, reads_store, &ga, arg_list, no_pg); ++ ++ if (pre_mem) free(prefix); +++ free(arg_list); ++ ++ return ret; ++ } ++--- python-pysam.orig/samtools/bamtk.c +++++ python-pysam/samtools/bamtk.c ++@@ -1,6 +1,6 @@ ++ /* bamtk.c -- main samtools command front-end. ++ ++- Copyright (C) 2008-2018 Genome Research Ltd. +++ Copyright (C) 2008-2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -38,7 +38,7 @@ ++ int bam_merge(int argc, char *argv[]); ++ int bam_index(int argc, char *argv[]); ++ int bam_sort(int argc, char *argv[]); ++-int bam_tview_main(int argc, char *argv[]); +++//int bam_tview_main(int argc, char *argv[]); ++ int bam_mating(int argc, char *argv[]); ++ int bam_rmdup(int argc, char *argv[]); ++ int bam_flagstat(int argc, char *argv[]); ++@@ -52,6 +52,7 @@ ++ int main_phase(int argc, char *argv[]); ++ int main_cat(int argc, char *argv[]); ++ int main_depth(int argc, char *argv[]); +++int main_coverage(int argc, char *argv[]); ++ int main_bam2fq(int argc, char *argv[]); ++ int main_pad2unpad(int argc, char *argv[]); ++ int main_bedcov(int argc, char *argv[]); ++@@ -109,6 +110,7 @@ ++ "\n" ++ " -- Statistics\n" ++ " bedcov read depth per BED region\n" +++" coverage alignment depth and percent coverage\n" ++ " depth compute the depth\n" ++ " flagstat simple stats\n" ++ " idxstats BAM index stats\n" ++@@ -166,14 +168,16 @@ ++ else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1); ++ else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1); ++ else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1); ++- else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); +++ else if (strcmp(argv[1], "idxstat") == 0 || +++ strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); ++ else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); ++ else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); ++ else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); ++- else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); +++ else if (strcmp(argv[1], 
"flagstat") == 0 || +++ strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); ++ else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); ++ else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); ++ else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); ++@@ -181,6 +185,7 @@ ++ else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); ++ else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); ++ else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1); +++ else if (strcmp(argv[1], "coverage") == 0) ret = main_coverage(argc-1, argv+1); ++ else if (strcmp(argv[1], "bam2fq") == 0 || ++ strcmp(argv[1], "fastq") == 0 || ++ strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1); ++@@ -189,8 +194,10 @@ ++ else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1); ++ else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1); ++ else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1); ++- else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); ++- else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); +++ else if (strcmp(argv[1], "stat") == 0 || +++ strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); +++ else if (strcmp(argv[1], "flag") == 0 || +++ strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); ++ else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1); ++ else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); ++ else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); ++@@ -198,12 +205,12 @@ ++ fprintf(stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); ++ return 1; ++ } ++- else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); +++ //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "--version") == 0) { ++ printf( ++ "samtools %s\n" ++ "Using htslib %s\n" ++-"Copyright (C) 2018 Genome Research Ltd.\n", +++"Copyright (C) 2019 Genome Research Ltd.\n", ++ samtools_version(), hts_version()); ++ } ++ else if (strcmp(argv[1], "--version-only") == 0) { ++--- python-pysam.orig/samtools/bamtk.c.pysam.c +++++ python-pysam/samtools/bamtk.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* bamtk.c -- main samtools command front-end. ++ ++- Copyright (C) 2008-2018 Genome Research Ltd. +++ Copyright (C) 2008-2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -54,6 +54,7 @@ ++ int main_phase(int argc, char *argv[]); ++ int main_cat(int argc, char *argv[]); ++ int main_depth(int argc, char *argv[]); +++int main_coverage(int argc, char *argv[]); ++ int main_bam2fq(int argc, char *argv[]); ++ int main_pad2unpad(int argc, char *argv[]); ++ int main_bedcov(int argc, char *argv[]); ++@@ -111,6 +112,7 @@ ++ "\n" ++ " -- Statistics\n" ++ " bedcov read depth per BED region\n" +++" coverage alignment depth and percent coverage\n" ++ " depth compute the depth\n" ++ " flagstat simple stats\n" ++ " idxstats BAM index stats\n" ++@@ -168,14 +170,16 @@ ++ else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1); ++ else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1); ++ else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1); ++- else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); +++ else if (strcmp(argv[1], "idxstat") == 0 || +++ strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); ++ else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); ++ else if 
(strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); ++ else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); ++ else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); ++- else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); +++ else if (strcmp(argv[1], "flagstat") == 0 || +++ strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); ++ else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); ++ else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); ++ else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); ++@@ -183,6 +187,7 @@ ++ else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); ++ else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); ++ else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1); +++ else if (strcmp(argv[1], "coverage") == 0) ret = main_coverage(argc-1, argv+1); ++ else if (strcmp(argv[1], "bam2fq") == 0 || ++ strcmp(argv[1], "fastq") == 0 || ++ strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1); ++@@ -191,8 +196,10 @@ ++ else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1); ++ else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1); ++ else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1); ++- else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); ++- else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); +++ else if (strcmp(argv[1], "stat") == 0 || +++ strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); +++ else if (strcmp(argv[1], "flag") == 0 || +++ strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); ++ else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1); ++ else if (strcmp(argv[1], 
"quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); ++ else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); ++@@ -202,10 +209,10 @@ ++ } ++ //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); ++ else if (strcmp(argv[1], "--version") == 0) { ++- fprintf(samtools_stdout, +++ fprintf(samtools_stdout, ++ "samtools %s\n" ++ "Using htslib %s\n" ++-"Copyright (C) 2018 Genome Research Ltd.\n", +++"Copyright (C) 2019 Genome Research Ltd.\n", ++ samtools_version(), hts_version()); ++ } ++ else if (strcmp(argv[1], "--version-only") == 0) { ++--- python-pysam.orig/samtools/bedcov.c +++++ python-pysam/samtools/bedcov.c ++@@ -1,7 +1,7 @@ ++ /* bedcov.c -- bedcov subcommand. ++ ++ Copyright (C) 2012 Broad Institute. ++- Copyright (C) 2013-2014 Genome Research Ltd. +++ Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -34,6 +34,7 @@ ++ #include "htslib/kstring.h" ++ #include "htslib/sam.h" ++ #include "htslib/thread_pool.h" +++#include "samtools.h" ++ #include "sam_opts.h" ++ ++ #include "htslib/kseq.h" ++@@ -41,7 +42,7 @@ ++ ++ typedef struct { ++ htsFile *fp; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ hts_itr_t *iter; ++ int min_mapQ; ++ } aux_t; ++@@ -71,7 +72,7 @@ ++ int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; ++ int64_t *cnt; ++ const bam_pileup1_t **plp; ++- int usage = 0; +++ int usage = 0, has_index_file = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++@@ -79,9 +80,10 @@ ++ { NULL, 0, NULL, 0 } ++ }; ++ ++- while ((c = getopt_long(argc, argv, "Q:j", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'Q': min_mapQ = atoi(optarg); break; +++ case 'X': has_index_file = 1; break; ++ case 'j': skip_DN = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++@@ -93,20 +95,36 @@ ++ 
fprintf(stderr, "Usage: samtools bedcov [options] [...]\n\n"); ++ fprintf(stderr, "Options:\n"); ++ fprintf(stderr, " -Q mapping quality threshold [0]\n"); +++ fprintf(stderr, " -X use customized index files\n"); ++ fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); ++- sam_global_opt_help(stderr, "-.--.-"); +++ sam_global_opt_help(stderr, "-.--.--."); ++ return 1; ++ } +++ if (has_index_file) { +++ if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files +++ fprintf(stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n"); +++ return 1; +++ } +++ n = (argc - optind - 1) / 2; +++ } else { +++ n = argc - optind - 1; +++ } +++ ++ memset(&str, 0, sizeof(kstring_t)); ++- n = argc - optind - 1; ++ aux = calloc(n, sizeof(aux_t*)); ++ idx = calloc(n, sizeof(hts_idx_t*)); ++ for (i = 0; i < n; ++i) { ++ aux[i] = calloc(1, sizeof(aux_t)); ++ aux[i]->min_mapQ = min_mapQ; ++ aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); ++- if (aux[i]->fp) ++- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); +++ if (aux[i]->fp) { +++ // If index filename has not been specfied, look in BAM folder +++ if (has_index_file) { +++ idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]); +++ } else { +++ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); +++ } +++ } ++ if (aux[i]->fp == 0 || idx[i] == 0) { ++ fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); ++ return 2; ++@@ -122,6 +140,10 @@ ++ cnt = calloc(n, 8); ++ ++ fp = gzopen(argv[optind], "rb"); +++ if (fp == NULL) { +++ print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]); +++ return 2; +++ } ++ ks = ks_init(fp); ++ n_plp = calloc(n, sizeof(int)); ++ plp = calloc(n, sizeof(bam_pileup1_t*)); ++@@ -186,7 +208,7 @@ ++ for (i = 0; i < n; ++i) { ++ if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); ++ hts_idx_destroy(idx[i]); ++- bam_hdr_destroy(aux[i]->header); 
+++ sam_hdr_destroy(aux[i]->header); ++ sam_close(aux[i]->fp); ++ free(aux[i]); ++ } ++--- python-pysam.orig/samtools/bedcov.c.pysam.c +++++ python-pysam/samtools/bedcov.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bedcov.c -- bedcov subcommand. ++ ++ Copyright (C) 2012 Broad Institute. ++- Copyright (C) 2013-2014 Genome Research Ltd. +++ Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -36,6 +36,7 @@ ++ #include "htslib/kstring.h" ++ #include "htslib/sam.h" ++ #include "htslib/thread_pool.h" +++#include "samtools.h" ++ #include "sam_opts.h" ++ ++ #include "htslib/kseq.h" ++@@ -43,7 +44,7 @@ ++ ++ typedef struct { ++ htsFile *fp; ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ hts_itr_t *iter; ++ int min_mapQ; ++ } aux_t; ++@@ -73,7 +74,7 @@ ++ int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; ++ int64_t *cnt; ++ const bam_pileup1_t **plp; ++- int usage = 0; +++ int usage = 0, has_index_file = 0; ++ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++@@ -81,9 +82,10 @@ ++ { NULL, 0, NULL, 0 } ++ }; ++ ++- while ((c = getopt_long(argc, argv, "Q:j", lopts, NULL)) >= 0) { +++ while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { ++ switch (c) { ++ case 'Q': min_mapQ = atoi(optarg); break; +++ case 'X': has_index_file = 1; break; ++ case 'j': skip_DN = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++@@ -95,20 +97,36 @@ ++ fprintf(samtools_stderr, "Usage: samtools bedcov [options] [...]\n\n"); ++ fprintf(samtools_stderr, "Options:\n"); ++ fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); +++ fprintf(samtools_stderr, " -X use customized index files\n"); ++ fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); ++- sam_global_opt_help(samtools_stderr, "-.--.-"); +++ sam_global_opt_help(samtools_stderr, "-.--.--."); ++ return 1; ++ } +++ if (has_index_file) { 
+++ if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files +++ fprintf(samtools_stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n"); +++ return 1; +++ } +++ n = (argc - optind - 1) / 2; +++ } else { +++ n = argc - optind - 1; +++ } +++ ++ memset(&str, 0, sizeof(kstring_t)); ++- n = argc - optind - 1; ++ aux = calloc(n, sizeof(aux_t*)); ++ idx = calloc(n, sizeof(hts_idx_t*)); ++ for (i = 0; i < n; ++i) { ++ aux[i] = calloc(1, sizeof(aux_t)); ++ aux[i]->min_mapQ = min_mapQ; ++ aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); ++- if (aux[i]->fp) ++- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); +++ if (aux[i]->fp) { +++ // If index filename has not been specfied, look in BAM folder +++ if (has_index_file) { +++ idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]); +++ } else { +++ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); +++ } +++ } ++ if (aux[i]->fp == 0 || idx[i] == 0) { ++ fprintf(samtools_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); ++ return 2; ++@@ -124,6 +142,10 @@ ++ cnt = calloc(n, 8); ++ ++ fp = gzopen(argv[optind], "rb"); +++ if (fp == NULL) { +++ print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]); +++ return 2; +++ } ++ ks = ks_init(fp); ++ n_plp = calloc(n, sizeof(int)); ++ plp = calloc(n, sizeof(bam_pileup1_t*)); ++@@ -188,7 +210,7 @@ ++ for (i = 0; i < n; ++i) { ++ if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); ++ hts_idx_destroy(idx[i]); ++- bam_hdr_destroy(aux[i]->header); +++ sam_hdr_destroy(aux[i]->header); ++ sam_close(aux[i]->fp); ++ free(aux[i]); ++ } ++--- python-pysam.orig/samtools/bedidx.c +++++ python-pysam/samtools/bedidx.c ++@@ -1,7 +1,7 @@ ++ /* bedidx.c -- BED file indexing. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2014,2017 Genome Research Ltd. +++ Copyright (C) 2014, 2017-2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -34,26 +34,28 @@ ++ #include "bedidx.h" ++ ++ #include "htslib/ksort.h" ++-KSORT_INIT_GENERIC(uint64_t) ++ ++ #include "htslib/kseq.h" ++ KSTREAM_INIT(gzFile, gzread, 8192) ++ +++static inline int lt_pair_pos(hts_pair_pos_t a, hts_pair_pos_t b) { +++ if (a.beg == b.beg) return a.end < b.end; +++ return a.beg < b.beg; +++} +++KSORT_INIT_STATIC(hts_pair_pos_t, hts_pair_pos_t, lt_pair_pos) +++ ++ /*! @typedef ++ * @abstract bed_reglist_t - value type of the BED hash table ++ * This structure encodes the list of intervals (ranges) for the regions provided via BED file or ++ * command line arguments. ++- * @field *a pointer to the array of intervals (kept as 64 bit integers). The upper 32 bits ++- * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. ++- * |-- 32 bits --|-- 32 bits --| ++- * |---- beg ----|---- end ----| +++ * @field *a pointer to the array of intervals. ++ * @field n actual number of elements contained by a ++ * @field m number of allocated elements to a (n <= m) ++ * @field *idx index array for computing the minimum offset ++ */ ++ typedef struct { ++ int n, m; ++- uint64_t *a; +++ hts_pair_pos_t *a; ++ int *idx; ++ int filter; ++ } bed_reglist_t; ++@@ -71,7 +73,6 @@ ++ khint_t k; ++ int i; ++ const char *reg; ++- uint32_t beg, end; ++ ++ if (!h) { ++ printf("Hash table is empty!\n"); ++@@ -84,10 +85,8 @@ ++ if ((p = &kh_val(h,k)) != NULL && p->n > 0) { ++ printf("Filter: %d\n", p->filter); ++ for (i=0; in; i++) { ++- beg = (uint32_t)(p->a[i]>>32); ++- end = (uint32_t)(p->a[i]); ++- ++- printf("\tinterval[%d]: %d-%d\n",i,beg,end); +++ printf("\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", +++ i,p->a[i].beg,p->a[i].end); ++ } ++ } else { ++ printf("Region '%s' has no intervals!\n", reg); ++@@ -97,20 +96,23 @@ ++ } ++ #endif ++ ++-static int *bed_index_core(int n, uint64_t *a) +++static int *bed_index_core(int n, hts_pair_pos_t *a) ++ { ++- int i, j, l, *idx; +++ int i, j, l, 
*idx, *new_idx; ++ l = 0; idx = 0; ++ for (i = 0; i < n; ++i) { ++- int beg, end; ++- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; +++ hts_pos_t beg, end; +++ beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT; ++ if (l < end + 1) { ++ int old_l = l; ++ l = end + 1; ++ kroundup32(l); ++- idx = realloc(idx, l * sizeof(int)); ++- if (!idx) +++ new_idx = realloc(idx, l * sizeof(*idx)); +++ if (!new_idx) { +++ free(idx); ++ return NULL; +++ } +++ idx = new_idx; ++ ++ for (j = old_l; j < l; ++j) ++ idx[j] = -1; ++@@ -131,19 +133,19 @@ ++ if (kh_exist(h, k)) { ++ bed_reglist_t *p = &kh_val(h, k); ++ if (p->idx) free(p->idx); ++- ks_introsort(uint64_t, p->n, p->a); +++ ks_introsort(hts_pair_pos_t, p->n, p->a); ++ p->idx = bed_index_core(p->n, p->a); ++ } ++ } ++ } ++ ++-static int bed_minoff(const bed_reglist_t *p, unsigned int beg, unsigned int end) { +++static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) { ++ int i, min_off=0; ++ ++ if (p && p->idx) { ++ min_off = (beg>>LIDX_SHIFT >= p->n)? 
p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; ++ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here ++- int n = beg>>LIDX_SHIFT; +++ hts_pos_t n = beg>>LIDX_SHIFT; ++ if (n > p->n) ++ n = p->n; ++ for (i = n - 1; i >= 0; --i) ++@@ -156,21 +158,21 @@ ++ return min_off; ++ } ++ ++-static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +++static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) ++ { ++ int i, min_off; ++ if (p->n == 0) return 0; ++ min_off = bed_minoff(p, beg, end); ++ ++ for (i = min_off; i < p->n; ++i) { ++- if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed ++- if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) +++ if (p->a[i].beg >= end) break; // out of range; no need to proceed +++ if (p->a[i].end > beg && p->a[i].beg < end) ++ return 1; // find the overlap; return ++ } ++ return 0; ++ } ++ ++-int bed_overlap(const void *_h, const char *chr, int beg, int end) +++int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, hts_pos_t end) ++ { ++ const reghash_t *h = (const reghash_t*)_h; ++ khint_t k; ++@@ -202,11 +204,11 @@ ++ continue; ++ ++ for (new_n = 0, j = 1; j < p->n; j++) { ++- if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { +++ if (p->a[new_n].end < p->a[j].beg) { ++ p->a[++new_n] = p->a[j]; ++ } else { ++- if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) ++- p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); +++ if (p->a[new_n].end < p->a[j].end) +++ p->a[new_n].end = p->a[j].end; ++ } ++ } ++ ++@@ -260,13 +262,17 @@ ++ if (fp == 0) return 0; ++ ks = ks_init(fp); ++ if (NULL == ks) goto fail; // In case ks_init ever gets error checking... 
++- while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) > 0) { // read a line +++ int ks_len; +++ while ((ks_len = ks_getuntil(ks, KS_SEP_LINE, &str, &dret)) >= 0) { // read a line ++ char *ref = str.s, *ref_end; ++- unsigned int beg = 0, end = 0; +++ uint64_t beg = 0, end = 0; ++ int num = 0; ++ khint_t k; ++ bed_reglist_t *p; ++ +++ if (ks_len == 0) +++ continue; // skip blank lines +++ ++ line++; ++ while (*ref && isspace(*ref)) ref++; ++ if ('\0' == *ref) continue; // Skip blank lines ++@@ -275,7 +281,7 @@ ++ while (*ref_end && !isspace(*ref_end)) ref_end++; ++ if ('\0' != *ref_end) { ++ *ref_end = '\0'; // terminate ref and look for start, end ++- num = sscanf(ref_end + 1, "%u %u", &beg, &end); +++ num = sscanf(ref_end + 1, "%"SCNu64" %"SCNu64, &beg, &end); ++ } ++ if (1 == num) { // VCF-style format ++ end = beg--; // Counts from 1 instead of 0 for BED files ++@@ -293,7 +299,8 @@ ++ } else { ++ fprintf(stderr, ++ "[bed_read] Parse error reading \"%s\" at line %u : " ++- "end (%u) must not be less than start (%u)\n", +++ "end (%"PRIu64") must not be less " +++ "than start (%"PRIu64")\n", ++ fn, line, end, beg); ++ } ++ errno = 0; // Prevent caller from printing misleading error messages ++@@ -318,16 +325,21 @@ ++ // Add begin,end to the list ++ if (p->n == p->m) { ++ p->m = p->m ? p->m<<1 : 4; ++- p->a = realloc(p->a, p->m * sizeof(uint64_t)); ++- if (NULL == p->a) goto fail; +++ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); +++ if (NULL == new_a) goto fail; +++ p->a = new_a; ++ } ++- p->a[p->n++] = (uint64_t)beg<<32 | end; +++ p->a[p->n].beg = beg; +++ p->a[p->n++].end = end; ++ } ++ // FIXME: Need to check for errors in ks_getuntil. At the moment it ++ // doesn't look like it can return one. Possibly use gzgets instead? 
++ +++ if (gzclose(fp) != Z_OK) { +++ fp = NULL; +++ goto fail; +++ } ++ ks_destroy(ks); ++- gzclose(fp); ++ free(str.s); ++ bed_index(h); ++ //bed_unify(h); ++@@ -361,7 +373,7 @@ ++ kh_destroy(reg, h); ++ } ++ ++-static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { +++static void *bed_insert(void *reg_hash, char *reg, hts_pos_t beg, hts_pos_t end) { ++ ++ reghash_t *h; ++ khint_t k; ++@@ -390,10 +402,12 @@ ++ // Add beg and end to the list ++ if (p->n == p->m) { ++ p->m = p->m ? p->m<<1 : 4; ++- p->a = realloc(p->a, p->m * sizeof(uint64_t)); ++- if (NULL == p->a) goto fail; +++ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); +++ if (NULL == new_a) goto fail; +++ p->a = new_a; ++ } ++- p->a[p->n++] = (uint64_t)beg<<32 | end; +++ p->a[p->n].beg = beg; +++ p->a[p->n++].end = end; ++ ++ fail: ++ return h; ++@@ -413,10 +427,10 @@ ++ reghash_t *t; ++ bed_reglist_t *p, *q; ++ khint_t l, k; ++- uint64_t *new_a; +++ hts_pair_pos_t *new_a; ++ int i, j, new_n, min_off; ++ const char *reg; ++- uint32_t beg, end; +++ hts_pos_t beg, end; ++ ++ h = (reghash_t *)reg_hash; ++ t = (reghash_t *)tmp_hash; ++@@ -434,20 +448,21 @@ ++ if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) ++ continue; ++ ++- new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); +++ new_a = calloc(q->n + p->n, sizeof(new_a[0])); ++ if (!new_a) ++ return NULL; ++ new_n = 0; ++ ++ for (i = 0; i < q->n; i++) { ++- beg = (uint32_t)(q->a[i]>>32); ++- end = (uint32_t)(q->a[i]); +++ beg = q->a[i].beg; +++ end = q->a[i].end; ++ ++ min_off = bed_minoff(p, beg, end); ++ for (j = min_off; j < p->n; ++j) { ++- if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed ++- if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { ++- new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); +++ if (p->a[j].beg >= end) break; // out of range; no need to proceed +++ if (p->a[j].end > beg && 
p->a[j].beg < end) { +++ new_a[new_n].beg = MAX(p->a[j].beg, beg); +++ new_a[new_n++].end = MIN(p->a[j].end, end); ++ } ++ } ++ } ++@@ -494,6 +509,11 @@ ++ ++ for (i=first; i 1024) { ++@@ -596,8 +616,8 @@ ++ reglist[count].max_end = 0; ++ ++ for (j = 0; j < p->n; j++) { ++- reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); ++- reglist[count].intervals[j].end = (uint32_t)(p->a[j]); +++ reglist[count].intervals[j].beg = p->a[j].beg; +++ reglist[count].intervals[j].end = p->a[j].end; ++ ++ if (reglist[count].intervals[j].end > reglist[count].max_end) ++ reglist[count].max_end = reglist[count].intervals[j].end; ++--- python-pysam.orig/samtools/bedidx.c.pysam.c +++++ python-pysam/samtools/bedidx.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* bedidx.c -- BED file indexing. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2014,2017 Genome Research Ltd. +++ Copyright (C) 2014, 2017-2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -36,26 +36,28 @@ ++ #include "bedidx.h" ++ ++ #include "htslib/ksort.h" ++-KSORT_INIT_GENERIC(uint64_t) ++ ++ #include "htslib/kseq.h" ++ KSTREAM_INIT(gzFile, gzread, 8192) ++ +++static inline int lt_pair_pos(hts_pair_pos_t a, hts_pair_pos_t b) { +++ if (a.beg == b.beg) return a.end < b.end; +++ return a.beg < b.beg; +++} +++KSORT_INIT_STATIC(hts_pair_pos_t, hts_pair_pos_t, lt_pair_pos) +++ ++ /*! @typedef ++ * @abstract bed_reglist_t - value type of the BED hash table ++ * This structure encodes the list of intervals (ranges) for the regions provided via BED file or ++ * command line arguments. ++- * @field *a pointer to the array of intervals (kept as 64 bit integers). The upper 32 bits ++- * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. ++- * |-- 32 bits --|-- 32 bits --| ++- * |---- beg ----|---- end ----| +++ * @field *a pointer to the array of intervals. 
++ * @field n actual number of elements contained by a ++ * @field m number of allocated elements to a (n <= m) ++ * @field *idx index array for computing the minimum offset ++ */ ++ typedef struct { ++ int n, m; ++- uint64_t *a; +++ hts_pair_pos_t *a; ++ int *idx; ++ int filter; ++ } bed_reglist_t; ++@@ -73,7 +75,6 @@ ++ khint_t k; ++ int i; ++ const char *reg; ++- uint32_t beg, end; ++ ++ if (!h) { ++ fprintf(samtools_stdout, "Hash table is empty!\n"); ++@@ -86,10 +87,8 @@ ++ if ((p = &kh_val(h,k)) != NULL && p->n > 0) { ++ fprintf(samtools_stdout, "Filter: %d\n", p->filter); ++ for (i=0; in; i++) { ++- beg = (uint32_t)(p->a[i]>>32); ++- end = (uint32_t)(p->a[i]); ++- ++- fprintf(samtools_stdout, "\tinterval[%d]: %d-%d\n",i,beg,end); +++ fprintf(samtools_stdout, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", +++ i,p->a[i].beg,p->a[i].end); ++ } ++ } else { ++ fprintf(samtools_stdout, "Region '%s' has no intervals!\n", reg); ++@@ -99,20 +98,23 @@ ++ } ++ #endif ++ ++-static int *bed_index_core(int n, uint64_t *a) +++static int *bed_index_core(int n, hts_pair_pos_t *a) ++ { ++- int i, j, l, *idx; +++ int i, j, l, *idx, *new_idx; ++ l = 0; idx = 0; ++ for (i = 0; i < n; ++i) { ++- int beg, end; ++- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; +++ hts_pos_t beg, end; +++ beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT; ++ if (l < end + 1) { ++ int old_l = l; ++ l = end + 1; ++ kroundup32(l); ++- idx = realloc(idx, l * sizeof(int)); ++- if (!idx) +++ new_idx = realloc(idx, l * sizeof(*idx)); +++ if (!new_idx) { +++ free(idx); ++ return NULL; +++ } +++ idx = new_idx; ++ ++ for (j = old_l; j < l; ++j) ++ idx[j] = -1; ++@@ -133,19 +135,19 @@ ++ if (kh_exist(h, k)) { ++ bed_reglist_t *p = &kh_val(h, k); ++ if (p->idx) free(p->idx); ++- ks_introsort(uint64_t, p->n, p->a); +++ ks_introsort(hts_pair_pos_t, p->n, p->a); ++ p->idx = bed_index_core(p->n, p->a); ++ } ++ } ++ } ++ ++-static int bed_minoff(const bed_reglist_t *p, unsigned int 
beg, unsigned int end) { +++static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) { ++ int i, min_off=0; ++ ++ if (p && p->idx) { ++ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; ++ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here ++- int n = beg>>LIDX_SHIFT; +++ hts_pos_t n = beg>>LIDX_SHIFT; ++ if (n > p->n) ++ n = p->n; ++ for (i = n - 1; i >= 0; --i) ++@@ -158,21 +160,21 @@ ++ return min_off; ++ } ++ ++-static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +++static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) ++ { ++ int i, min_off; ++ if (p->n == 0) return 0; ++ min_off = bed_minoff(p, beg, end); ++ ++ for (i = min_off; i < p->n; ++i) { ++- if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed ++- if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) +++ if (p->a[i].beg >= end) break; // out of range; no need to proceed +++ if (p->a[i].end > beg && p->a[i].beg < end) ++ return 1; // find the overlap; return ++ } ++ return 0; ++ } ++ ++-int bed_overlap(const void *_h, const char *chr, int beg, int end) +++int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, hts_pos_t end) ++ { ++ const reghash_t *h = (const reghash_t*)_h; ++ khint_t k; ++@@ -204,11 +206,11 @@ ++ continue; ++ ++ for (new_n = 0, j = 1; j < p->n; j++) { ++- if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { +++ if (p->a[new_n].end < p->a[j].beg) { ++ p->a[++new_n] = p->a[j]; ++ } else { ++- if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) ++- p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); +++ if (p->a[new_n].end < p->a[j].end) +++ p->a[new_n].end = p->a[j].end; ++ } ++ } ++ ++@@ -262,13 +264,17 @@ ++ if (fp == 0) return 0; ++ ks = ks_init(fp); ++ if (NULL == ks) goto fail; // In case ks_init ever gets error checking... 
++- while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) > 0) { // read a line +++ int ks_len; +++ while ((ks_len = ks_getuntil(ks, KS_SEP_LINE, &str, &dret)) >= 0) { // read a line ++ char *ref = str.s, *ref_end; ++- unsigned int beg = 0, end = 0; +++ uint64_t beg = 0, end = 0; ++ int num = 0; ++ khint_t k; ++ bed_reglist_t *p; ++ +++ if (ks_len == 0) +++ continue; // skip blank lines +++ ++ line++; ++ while (*ref && isspace(*ref)) ref++; ++ if ('\0' == *ref) continue; // Skip blank lines ++@@ -277,7 +283,7 @@ ++ while (*ref_end && !isspace(*ref_end)) ref_end++; ++ if ('\0' != *ref_end) { ++ *ref_end = '\0'; // terminate ref and look for start, end ++- num = sscanf(ref_end + 1, "%u %u", &beg, &end); +++ num = sscanf(ref_end + 1, "%"SCNu64" %"SCNu64, &beg, &end); ++ } ++ if (1 == num) { // VCF-style format ++ end = beg--; // Counts from 1 instead of 0 for BED files ++@@ -295,7 +301,8 @@ ++ } else { ++ fprintf(samtools_stderr, ++ "[bed_read] Parse error reading \"%s\" at line %u : " ++- "end (%u) must not be less than start (%u)\n", +++ "end (%"PRIu64") must not be less " +++ "than start (%"PRIu64")\n", ++ fn, line, end, beg); ++ } ++ errno = 0; // Prevent caller from printing misleading error messages ++@@ -320,16 +327,21 @@ ++ // Add begin,end to the list ++ if (p->n == p->m) { ++ p->m = p->m ? p->m<<1 : 4; ++- p->a = realloc(p->a, p->m * sizeof(uint64_t)); ++- if (NULL == p->a) goto fail; +++ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); +++ if (NULL == new_a) goto fail; +++ p->a = new_a; ++ } ++- p->a[p->n++] = (uint64_t)beg<<32 | end; +++ p->a[p->n].beg = beg; +++ p->a[p->n++].end = end; ++ } ++ // FIXME: Need to check for errors in ks_getuntil. At the moment it ++ // doesn't look like it can return one. Possibly use gzgets instead? 
++ +++ if (gzclose(fp) != Z_OK) { +++ fp = NULL; +++ goto fail; +++ } ++ ks_destroy(ks); ++- gzclose(fp); ++ free(str.s); ++ bed_index(h); ++ //bed_unify(h); ++@@ -363,7 +375,7 @@ ++ kh_destroy(reg, h); ++ } ++ ++-static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { +++static void *bed_insert(void *reg_hash, char *reg, hts_pos_t beg, hts_pos_t end) { ++ ++ reghash_t *h; ++ khint_t k; ++@@ -392,10 +404,12 @@ ++ // Add beg and end to the list ++ if (p->n == p->m) { ++ p->m = p->m ? p->m<<1 : 4; ++- p->a = realloc(p->a, p->m * sizeof(uint64_t)); ++- if (NULL == p->a) goto fail; +++ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); +++ if (NULL == new_a) goto fail; +++ p->a = new_a; ++ } ++- p->a[p->n++] = (uint64_t)beg<<32 | end; +++ p->a[p->n].beg = beg; +++ p->a[p->n++].end = end; ++ ++ fail: ++ return h; ++@@ -415,10 +429,10 @@ ++ reghash_t *t; ++ bed_reglist_t *p, *q; ++ khint_t l, k; ++- uint64_t *new_a; +++ hts_pair_pos_t *new_a; ++ int i, j, new_n, min_off; ++ const char *reg; ++- uint32_t beg, end; +++ hts_pos_t beg, end; ++ ++ h = (reghash_t *)reg_hash; ++ t = (reghash_t *)tmp_hash; ++@@ -436,20 +450,21 @@ ++ if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) ++ continue; ++ ++- new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); +++ new_a = calloc(q->n + p->n, sizeof(new_a[0])); ++ if (!new_a) ++ return NULL; ++ new_n = 0; ++ ++ for (i = 0; i < q->n; i++) { ++- beg = (uint32_t)(q->a[i]>>32); ++- end = (uint32_t)(q->a[i]); +++ beg = q->a[i].beg; +++ end = q->a[i].end; ++ ++ min_off = bed_minoff(p, beg, end); ++ for (j = min_off; j < p->n; ++j) { ++- if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed ++- if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { ++- new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); +++ if (p->a[j].beg >= end) break; // out of range; no need to proceed +++ if (p->a[j].end > beg && 
p->a[j].beg < end) { +++ new_a[new_n].beg = MAX(p->a[j].beg, beg); +++ new_a[new_n++].end = MIN(p->a[j].end, end); ++ } ++ } ++ } ++@@ -496,6 +511,11 @@ ++ ++ for (i=first; i 1024) { ++@@ -598,8 +618,8 @@ ++ reglist[count].max_end = 0; ++ ++ for (j = 0; j < p->n; j++) { ++- reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); ++- reglist[count].intervals[j].end = (uint32_t)(p->a[j]); +++ reglist[count].intervals[j].beg = p->a[j].beg; +++ reglist[count].intervals[j].end = p->a[j].end; ++ ++ if (reglist[count].intervals[j].end > reglist[count].max_end) ++ reglist[count].max_end = reglist[count].intervals[j].end; ++--- python-pysam.orig/samtools/bedidx.h +++++ python-pysam/samtools/bedidx.h ++@@ -36,7 +36,7 @@ ++ ++ void *bed_read(const char *fn); ++ void bed_destroy(void *_h); ++-int bed_overlap(const void *_h, const char *chr, int beg, int end); +++int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, hts_pos_t end); ++ void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op); ++ const char* bed_get(void *reg_hash, int index, int filter); ++ hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *count_regs); ++--- /dev/null +++++ python-pysam/samtools/coverage.c ++@@ -0,0 +1,702 @@ +++/* coverage.c -- samtools coverage subcommand +++ +++ Copyright (C) 2018,2019 Florian Breitwieser +++ Portions copyright (C) 2019 Genome Research Ltd. 
+++ +++ Author: Florian P Breitwieser +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++/* This program calculates coverage from multiple BAMs +++ * simultaneously, to achieve random access and to use the BED interface.
+++ * To compile this program separately, you may: +++ * +++ * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz +++ */ +++ +++// C headers +++#include +++ +++#include +++#include +++#include +++#include // variadic functions +++#include // INT_MAX +++#include // round +++#include +++#include +++#include +++#include +++ +++#ifdef _WIN32 +++#include +++#else +++#include +++#endif +++ +++#include "htslib/sam.h" +++#include "htslib/hts.h" +++#include "samtools.h" +++#include "sam_opts.h" +++ +++const char *VERSION = "0.1"; +++ +++typedef struct { // auxiliary data structure to hold a BAM file +++ samFile *fp; // file handle +++ sam_hdr_t *hdr; // file header +++ hts_itr_t *iter; // iterator to a region - NULL for us by default +++ int min_mapQ; // mapQ filter +++ int min_len; // length filter +++ unsigned int n_reads; // records the number of reads seen in file +++ unsigned int n_selected_reads; // records the number of reads passing filter +++ unsigned long summed_mapQ; // summed mapQ of all reads passing filter +++ int fail_flags; +++ int required_flags; +++} bam_aux_t; +++ +++typedef struct { // auxiliary data structure to hold stats on coverage +++ unsigned long long n_covered_bases; +++ unsigned long long summed_coverage; +++ unsigned long long summed_baseQ; +++ unsigned long long summed_mapQ; +++ unsigned int n_reads; +++ unsigned int n_selected_reads; +++ int32_t tid; // chromosome ID, defined by header +++ hts_pos_t beg; +++ hts_pos_t end; +++ int64_t bin_width; +++} stats_aux_t; +++ +++#if __STDC_VERSION__ >= 199901L +++#define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL +++ +++// UTF8 specifies block characters in eights going from \u2581 (lower one eight block) to \u2588 (full block) +++// https://en.wikipedia.org/wiki/Block_Elements +++// LOWER ONE EIGHTH BLOCK … FULL BLOCK +++static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; +++// In some terminals / with 
some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those +++static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; +++ +++#else +++ +++// Fall back to explicit UTF-8 encodings of the same characters +++#define VERTICAL_LINE "\xE2\x94\x82" +++ +++static const char *const BLOCK_CHARS8[8] = { +++ "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", +++ "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; +++ +++static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; +++ +++#endif +++ +++// in bam_plcmd.c +++int read_file_list(const char *file_list, int *n, char **argv[]); +++ +++static int usage() { +++ fprintf(stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n" +++ "Input options:\n" +++ " -b, --bam-list FILE list of input BAM filenames, one per line\n" +++ " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" +++ " -q, --min-MQ INT base quality threshold [0]\n" +++ " -Q, --min-BQ INT mapping quality threshold [0]\n" +++ " --rf required flags: skip reads with mask bits unset []\n" +++ " --ff filter flags: skip reads with mask bits set \n" +++ " [UNMAP,SECONDARY,QCFAIL,DUP]\n" +++ "Output options:\n" +++ " -m, --histogram show histogram instead of tabular output\n" +++ " -A, --ascii show only ASCII characters in histogram\n" +++ " -o, --output FILE write output to FILE [stdout]\n" +++ " -H, --no-header don't print a header in tabular mode\n" +++ " -w, --n-bins INT number of bins in histogram [terminal width - 40]\n" +++ " -r, --region REG show specified region. Format: chr:start-end. 
\n" +++ " -h, --help help (this page)\n"); +++ +++ fprintf(stdout, "\nGeneric options:\n"); +++ sam_global_opt_help(stdout, "-.--.--."); +++ +++ fprintf(stdout, +++ "\nSee manpage for additional details.\n" +++ " rname Reference name / chromosome\n" +++ " startpos Start position\n" +++ " endpos End position (or sequence length)\n" +++ " numreads Number reads aligned to the region (after filtering)\n" +++ " covbases Number of covered bases with depth >= 1\n" +++ " coverage Proportion of covered bases [0..1]\n" +++ " meandepth Mean depth of coverage\n" +++ " meanbaseq Mean baseQ in covered region\n" +++ " meanmapq Mean mapQ of selected reads\n" +++ ); +++ +++ return EXIT_SUCCESS; +++} +++ +++static char* center_text(char *text, char *buf, int width) { +++ int len = strlen(text); +++ assert(len <= width); +++ int padding = (width - len) / 2; +++ int padding_ex = (width - len) % 2; +++ if (padding >= 1) +++ sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " "); +++ else +++ sprintf(buf, "%s", text); +++ +++ return buf; +++} +++ +++static char* readable_bps(double base_pairs, char *buf) { +++ const char* units[] = {"", "K", "M", "G", "T"}; +++ int i = 0; +++ while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) { +++ base_pairs /= 1000; +++ i++; +++ } +++ sprintf(buf, "%.*f%s", i, base_pairs, units[i]); +++ return buf; +++} +++ +++static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { +++ int i; +++ stats->n_reads = 0; +++ stats->n_selected_reads = 0; +++ stats->summed_mapQ = 0; +++ for (i = 0; i < n_bam_files && data[i]; ++i) { +++ stats->n_reads += data[i]->n_reads; +++ stats->n_selected_reads += data[i]->n_selected_reads; +++ stats->summed_mapQ += data[i]->summed_mapQ; +++ data[i]->n_reads = 0; +++ data[i]->n_selected_reads = 0; +++ data[i]->summed_mapQ = 0; +++ } +++} +++ +++// read one alignment from one BAM file +++static int read_bam(void *data, bam1_t *b) { +++ bam_aux_t *aux = (bam_aux_t*)data; // 
data in fact is a pointer to an auxiliary structure +++ int ret; +++ while (1) { +++ if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; +++ ++aux->n_reads; +++ +++ if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; +++ if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; +++ if ( b->core.qual < aux->min_mapQ ) continue; +++ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; +++ ++aux->n_selected_reads; +++ aux->summed_mapQ += b->core.qual; +++ break; +++ } +++ return ret; +++} +++ +++void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { +++ fputs(sam_hdr_tid2name(h, stats->tid), file_out); +++ double region_len = (double) stats->end - stats->beg; +++ fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", +++ stats->beg+1, +++ stats->end, +++ stats->n_selected_reads, +++ stats->n_covered_bases, +++ 100.0 * stats->n_covered_bases / region_len, +++ stats->summed_coverage / region_len, +++ stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0, +++ stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 +++ ); +++} +++ +++void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, +++ const int hist_size, const bool full_utf) { +++ int i, col; +++ bool show_percentiles = false; +++ const int n_rows = 10; +++ const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; +++ const int blockchar_len = full_utf? 
8 : 2; +++ /* +++ if (stats->beg == 0) { +++ stats->end = h->target_len[stats->tid]; +++ } +++ */ +++ double region_len = stats->end - stats->beg; +++ +++ // Calculate histogram that contains percent covered +++ double hist_data[hist_size]; +++ double max_val = 0.0; +++ for (i = 0; i < hist_size; ++i) { +++ hist_data[i] = 100 * hist[i] / (double) stats->bin_width; +++ if (hist_data[i] > max_val) max_val = hist_data[i]; +++ } +++ +++ char buf[30]; +++ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); +++ +++ double row_bin_size = max_val / (double) n_rows; +++ for (i = n_rows-1; i >= 0; --i) { +++ double current_bin = row_bin_size * i; +++ if (show_percentiles) { +++ fprintf(file_out, ">%3i%% ", i*10); +++ } else { +++ fprintf(file_out, ">%7.2f%% ", current_bin); +++ } +++ fprintf(file_out, VERTICAL_LINE); +++ for (col = 0; col < hist_size; ++col) { +++ // get the difference in eights, or halfs when full UTF8 is not supported +++ int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; +++ if (cur_val_diff < 0) { +++ fputc(' ', file_out); +++ } else { +++ if (cur_val_diff >= blockchar_len) +++ cur_val_diff = blockchar_len - 1; +++ +++ fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); +++ } +++ } +++ fprintf(file_out, VERTICAL_LINE); +++ fputc(' ', file_out); +++ switch (i) { +++ case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break; +++ case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break; +++ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; +++ case 6: fprintf(file_out, "Percent covered: %.4g%%", +++ 100.0 * stats->n_covered_bases / region_len); break; +++ case 5: fprintf(file_out, "Mean coverage: %.3gx", +++ stats->summed_coverage / region_len); break; +++ case 4: fprintf(file_out, "Mean baseQ: 
%.3g", +++ stats->summed_baseQ/(double) stats->summed_coverage); break; +++ case 3: fprintf(file_out, "Mean mapQ: %.3g", +++ stats->summed_mapQ/(double) stats->n_selected_reads); break; +++ case 1: fprintf(file_out, "Histo bin width: %sbp", +++ readable_bps(stats->bin_width, buf)); break; +++ case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; +++ }; +++ fputc('\n', file_out); +++ } +++ +++ // print x axis. Could be made pretty for widths that are not divisible +++ // by 10 by variable spacing of the labels, instead of placing a label every 10 characters +++ char buf2[50]; +++ fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); +++ int rest; +++ for (rest = 10; rest < 10*(hist_size/10); rest += 10) { +++ fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); +++ } +++ int last_padding = hist_size%10; +++ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); +++ fprintf(file_out, "\n"); +++} +++ +++int main_coverage(int argc, char *argv[]) { +++ int status = EXIT_SUCCESS; +++ +++ int ret, tid, pos, i, j; +++ +++ int max_depth = 0; +++ int opt_min_baseQ = 0; +++ int opt_min_mapQ = 0; +++ int opt_min_len = 0; +++ int opt_n_bins = 50; +++ bool opt_full_width = true; +++ char *opt_output_file = NULL; +++ bam_aux_t **data = NULL; +++ bam_mplp_t mplp = NULL; +++ const bam_pileup1_t **plp = NULL; +++ uint32_t *hist = NULL; +++ stats_aux_t *stats = NULL; +++ char *opt_reg = 0; // specified region +++ char *opt_file_list = NULL; +++ int n_bam_files = 0; +++ char **fn = NULL; +++ int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags +++ int required_flags = 0; +++ +++ int *n_plp = NULL; +++ sam_hdr_t *h = NULL; // BAM header of the 1st input +++ +++ bool opt_print_header = true; +++ bool opt_print_tabular = true; +++ bool opt_print_histogram = false; +++ bool *covered_tids = NULL; +++ bool 
opt_full_utf = true; +++ +++ FILE *file_out = stdout; +++ +++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), +++ {"rf", required_argument, NULL, 1}, // require flag +++ {"ff", required_argument, NULL, 2}, // filter flag +++ {"incl-flags", required_argument, NULL, 1}, // require flag +++ {"excl-flags", required_argument, NULL, 2}, // filter flag +++ {"bam-list", required_argument, NULL, 'b'}, +++ {"min-read-len", required_argument, NULL, 'L'}, +++ {"min-MQ", required_argument, NULL, 'q'}, +++ {"min-mq", required_argument, NULL, 'q'}, +++ {"min-BQ", required_argument, NULL, 'Q'}, +++ {"min-bq", required_argument, NULL, 'Q'}, +++ {"histogram", no_argument, NULL, 'm'}, +++ {"ascii", no_argument, NULL, 'A'}, +++ {"output", required_argument, NULL, 'o'}, +++ {"no-header", no_argument, NULL, 'H'}, +++ {"n-bins", required_argument, NULL, 'w'}, +++ {"region", required_argument, NULL, 'r'}, +++ {"help", no_argument, NULL, 'h'}, +++ { NULL, 0, NULL, 0 } +++ }; +++ +++ // parse the command line +++ int c; +++ opterr = 0; +++ while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { +++ switch (c) { +++ case 1: +++ if ((required_flags = bam_str2flag(optarg)) < 0) { +++ fprintf(stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE; +++ }; break; +++ case 2: +++ if ((fail_flags = bam_str2flag(optarg)) < 0) { +++ fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; +++ }; break; +++ case 'o': opt_output_file = optarg; opt_full_width = false; break; +++ case 'L': opt_min_len = atoi(optarg); break; +++ case 'q': opt_min_baseQ = atoi(optarg); break; +++ case 'Q': opt_min_mapQ = atoi(optarg); break; +++ case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; +++ opt_print_histogram = true; opt_print_tabular = false; +++ break; +++ case 'r': opt_reg = optarg; break; // parsing a region requires a BAM header (strdup unnecessary) +++ 
case 'b': opt_file_list = optarg; break; +++ case 'm': opt_print_histogram = true; opt_print_tabular = false; break; +++ case 'A': opt_full_utf = false; +++ opt_print_histogram = true; opt_print_tabular = false; +++ break; +++ case 'H': opt_print_header = false; break; +++ case 'h': return usage(); +++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; +++ /* else fall-through */ +++ case '?': +++ if (optopt != '?') { // '-?' appeared on command line +++ if (optopt) { // Bad short option +++ print_error("coverage", "invalid option -- '%c'", optopt); +++ } else { // Bad long option +++ // Do our best. There is no good solution to finding +++ // out what the bad option was. +++ // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option +++ if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) { +++ print_error("coverage", "unrecognised option '%s'", +++ argv[optind - 1]); +++ } +++ } +++ } +++ return usage(); +++ } +++ } +++ if (optind == argc && !opt_file_list) +++ return usage(); +++ +++ // output file provided by user +++ if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) { +++ file_out = fopen( opt_output_file, "w" ); +++ if (file_out == NULL) { +++ print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file); +++ return EXIT_FAILURE; +++ } +++ } +++ +++ if (opt_n_bins <= 0 || opt_full_width) { +++ // get number of columns of terminal +++ const char* env_columns = getenv("COLUMNS"); +++ int columns = 0; +++ if (env_columns == NULL) { +++#ifdef _WIN32 +++ CONSOLE_SCREEN_BUFFER_INFO csbi; +++ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { +++ columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; +++ } +++#else +++ struct winsize w; +++ if (ioctl(2, TIOCGWINSZ, &w) == 0) +++ columns = w.ws_col; +++#endif +++ } else { +++ columns = atoi(env_columns); // atoi(NULL) returns 0 +++ } +++ +++ if (columns > 60) { +++ opt_n_bins = columns - 
40; +++ } else { +++ opt_n_bins = 40; +++ } +++ } +++ +++ // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering +++ +++ // Open all BAM files +++ if (opt_file_list) { +++ // Read file names from opt_file_list into argv, and record the number of files in n_bam_files +++ if (read_file_list(opt_file_list, &n_bam_files, &fn)) { +++ print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list); +++ return EXIT_FAILURE; +++ } +++ argv = fn; +++ optind = 0; +++ } else { +++ n_bam_files = argc - optind; // the number of BAMs on the command line +++ } +++ +++ data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file +++ if (!data) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ for (i = 0; i < n_bam_files; ++i) { +++ int rf; +++ data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); +++ if (!data[i]) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM +++ +++ if (data[i]->fp == NULL) { +++ print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; +++ if (opt_min_baseQ) rf |= SAM_QUAL; +++ +++ // Set CRAM options on file handle - returns 0 on success +++ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { +++ print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { +++ print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->min_mapQ = opt_min_mapQ; // set the mapQ filter +++ data[i]->min_len = opt_min_len; // set the qlen filter +++ 
data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header +++ data[i]->fail_flags = fail_flags; +++ data[i]->required_flags = required_flags; +++ if (data[i]->hdr == NULL) { +++ print_error_errno("coverage", "Could not read header for \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ // Lookup region if specified +++ if (opt_reg) { // if a region is specified +++ hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index +++ if (idx == NULL) { +++ print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator +++ hts_idx_destroy(idx); // the index is not needed any more; free the memory +++ if (data[i]->iter == NULL) { +++ print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ } +++ } +++ +++ if (opt_print_tabular && opt_print_header) +++ fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out); +++ +++ h = data[0]->hdr; // easy access to the header of the 1st BAM +++ int n_targets = sam_hdr_nref(h); +++ covered_tids = calloc(n_targets, sizeof(bool)); +++ stats = calloc(1, sizeof(stats_aux_t)); +++ if (!covered_tids || !stats) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ int64_t n_bins = opt_n_bins; +++ if (opt_reg) { +++ stats->tid = data[0]->iter->tid; +++ stats->beg = data[0]->iter->beg; // and to the parsed region coordinates +++ stats->end = data[0]->iter->end; +++ if (stats->end == HTS_POS_MAX) { +++ stats->end = sam_hdr_tid2len(h, stats->tid); +++ } +++ if (opt_n_bins > stats->end - stats->beg) { +++ n_bins = stats->end - stats->beg; +++ } +++ stats->bin_width = (stats->end-stats->beg) / n_bins; +++ } else { +++ stats->tid = -1; 
+++ } +++ +++ int64_t current_bin = 0; +++ +++ // the core multi-pileup loop +++ mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization +++ if (max_depth > 0) +++ bam_mplp_set_maxcnt(mplp, max_depth); // set maximum coverage depth +++ else if (!max_depth) +++ bam_mplp_set_maxcnt(mplp, INT_MAX); +++ +++ +++ // Extra info for histogram and coverage counting +++ hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t)); +++ n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM +++ plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) +++ if (!hist || !n_plp || !plp) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position +++ +++ if (tid != stats->tid) { // Next target sequence +++ if (stats->tid >= 0) { // It's not the first sequence, print results +++ set_read_counts(data, stats, n_bam_files); +++ if (opt_print_histogram) { +++ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); +++ fputc('\n', file_out); +++ } else if (opt_print_tabular) { +++ print_tabular_line(file_out, h, stats); +++ } +++ +++ // reset data +++ memset(stats, 0, sizeof(stats_aux_t)); +++ if (opt_print_histogram) +++ memset(hist, 0, n_bins*sizeof(uint32_t)); +++ } +++ +++ stats->tid = tid; +++ covered_tids[tid] = true; +++ if (!opt_reg) +++ stats->end = sam_hdr_tid2len(h, tid); +++ +++ if (opt_print_histogram) { +++ n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; +++ stats->bin_width = (stats->end-stats->beg) / n_bins; +++ } +++ } +++ if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip +++ if (tid >= n_targets) continue; // diff number of @SQ lines per file? 
+++ +++ if (opt_print_histogram) { +++ current_bin = (pos - stats->beg) / stats->bin_width; +++ } +++ +++ bool count_base = false; +++ for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here +++ int depth_at_pos = n_plp[i]; +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know +++ +++ if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos +++ else if (p->qpos < p->b->core.l_qseq && +++ bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality +++ else +++ stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; +++ } +++ if (depth_at_pos > 0) { +++ count_base = true; +++ stats->summed_coverage += depth_at_pos; +++ } +++ // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage +++ //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output +++ } +++ if (count_base) { +++ ++(stats->n_covered_bases); +++ if (opt_print_histogram && current_bin < n_bins) +++ ++(hist[current_bin]); // Histogram based on breadth of coverage +++ } +++ } +++ +++ if (stats->tid != -1) { +++ set_read_counts(data, stats, n_bam_files); +++ if (opt_print_histogram) { +++ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); +++ } else if (opt_print_tabular) { +++ print_tabular_line(file_out, h, stats); +++ } +++ } +++ +++ +++ if (!opt_reg && opt_print_tabular) { +++ memset(stats, 0, sizeof(stats_aux_t)); +++ for (i = 0; i < n_targets; ++i) { +++ if (!covered_tids[i]) { +++ stats->tid = i; +++ stats->end = sam_hdr_tid2len(h, i); +++ print_tabular_line(file_out, h, stats); +++ } +++ } +++ } +++ +++ if (ret < 0) status = EXIT_FAILURE; +++ +++coverage_end: +++ if (n_plp) free(n_plp); +++ if (plp) free(plp); +++ bam_mplp_destroy(mplp); +++ +++ if (covered_tids) free(covered_tids); +++ if (hist) free(hist); +++ if (stats) free(stats); +++ +++ +++ // Close files and free data structures +++ if 
(!(file_out == stdout || fclose(file_out) == 0)) { +++ if (status == EXIT_SUCCESS) { +++ print_error_errno("coverage", "error on closing \"%s\"", +++ (opt_output_file && strcmp(opt_output_file, "-") != 0? +++ opt_output_file : "stdout")); +++ status = EXIT_FAILURE; +++ } +++ } +++ +++ if (data) { +++ for (i = 0; i < n_bam_files && data[i]; ++i) { +++ sam_hdr_destroy(data[i]->hdr); +++ if (data[i]->fp) sam_close(data[i]->fp); +++ hts_itr_destroy(data[i]->iter); +++ free(data[i]); +++ } +++ free(data); +++ } +++ +++ if (opt_file_list && fn) { +++ for (i = 0; i < n_bam_files; ++i) +++ free(fn[i]); +++ free(fn); +++ } +++ sam_global_args_free(&ga); +++ +++ return status; +++} +++ +++#ifdef _MAIN_BAMCOV +++int main(int argc, char *argv[]) { +++ return main_coverage(argc, argv); +++} +++#endif ++--- /dev/null +++++ python-pysam/samtools/coverage.c.pysam.c ++@@ -0,0 +1,704 @@ +++#include "samtools.pysam.h" +++ +++/* coverage.c -- samtools coverage subcommand +++ +++ Copyright (C) 2018,2019 Florian Breitwieser +++ Portions copyright (C) 2019 Genome Research Ltd. +++ +++ Author: Florian P Breitwieser +++ +++Permission is hereby granted, free of charge, to any person obtaining a copy +++of this software and associated documentation files (the "Software"), to deal +++in the Software without restriction, including without limitation the rights +++to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +++copies of the Software, and to permit persons to whom the Software is +++furnished to do so, subject to the following conditions: +++ +++The above copyright notice and this permission notice shall be included in +++all copies or substantial portions of the Software. +++ +++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +++THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +++DEALINGS IN THE SOFTWARE. */ +++ +++/* This program calculates coverage from multiple BAMs +++ * simultaneously, to achieve random access and to use the BED interface. +++ * To compile this program separately, you may: +++ * +++ * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz +++ */ +++ +++// C headers +++#include +++ +++#include +++#include +++#include +++#include // variadic functions +++#include // INT_MAX +++#include // round +++#include +++#include +++#include +++#include +++ +++#ifdef _WIN32 +++#include +++#else +++#include +++#endif +++ +++#include "htslib/sam.h" +++#include "htslib/hts.h" +++#include "samtools.h" +++#include "sam_opts.h" +++ +++const char *VERSION = "0.1"; +++ +++typedef struct { // auxiliary data structure to hold a BAM file +++ samFile *fp; // file handle +++ sam_hdr_t *hdr; // file header +++ hts_itr_t *iter; // iterator to a region - NULL for us by default +++ int min_mapQ; // mapQ filter +++ int min_len; // length filter +++ unsigned int n_reads; // records the number of reads seen in file +++ unsigned int n_selected_reads; // records the number of reads passing filter +++ unsigned long summed_mapQ; // summed mapQ of all reads passing filter +++ int fail_flags; +++ int required_flags; +++} bam_aux_t; +++ +++typedef struct { // auxiliary data structure to hold stats on coverage +++ unsigned long long n_covered_bases; +++ unsigned long long summed_coverage; +++ unsigned long long summed_baseQ; +++ unsigned long long summed_mapQ; +++ unsigned int n_reads; +++ unsigned int n_selected_reads; +++ int32_t tid; // chromosome ID, defined by header +++ hts_pos_t beg; +++ hts_pos_t end; +++ int64_t bin_width; +++} stats_aux_t; +++ +++#if __STDC_VERSION__ >= 199901L +++#define VERTICAL_LINE
"\u2502" // BOX DRAWINGS LIGHT VERTICAL +++ +++// UTF8 specifies block characters in eighths going from \u2581 (lower one eighth block) to \u2588 (full block) +++// https://en.wikipedia.org/wiki/Block_Elements +++// LOWER ONE EIGHTH BLOCK … FULL BLOCK +++static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; +++// In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those +++static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; +++ +++#else +++ +++// Fall back to explicit UTF-8 encodings of the same characters +++#define VERTICAL_LINE "\xE2\x94\x82" +++ +++static const char *const BLOCK_CHARS8[8] = { +++ "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", +++ "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; +++ +++static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; +++ +++#endif +++ +++// in bam_plcmd.c +++int read_file_list(const char *file_list, int *n, char **argv[]); +++ +++// Print the command-line help to samtools_stdout; always returns EXIT_SUCCESS. +++// The option letters and descriptions here must stay in step with the getopt_long switch in main_coverage(). +++static int usage() { +++ fprintf(samtools_stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n" +++ "Input options:\n" +++ " -b, --bam-list FILE list of input BAM filenames, one per line\n" +++ " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" +++ " -q, --min-MQ INT mapping quality threshold [0]\n" +++ " -Q, --min-BQ INT base quality threshold [0]\n" +++ " --rf required flags: skip reads with mask bits unset []\n" +++ " --ff filter flags: skip reads with mask bits set \n" +++ " [UNMAP,SECONDARY,QCFAIL,DUP]\n" +++ "Output options:\n" +++ " -m, --histogram show histogram instead of tabular output\n" +++ " -A, --ascii show only ASCII characters in histogram\n" +++ " -o, --output FILE write output to FILE [samtools_stdout]\n" +++ " -H, --no-header don't print a header in tabular mode\n" +++ " -w, --n-bins INT number of bins in histogram [terminal width -
40]\n" +++ " -r, --region REG show specified region. Format: chr:start-end. \n" +++ " -h, --help help (this page)\n"); +++ +++ fprintf(samtools_stdout, "\nGeneric options:\n"); +++ sam_global_opt_help(samtools_stdout, "-.--.--."); +++ +++ fprintf(samtools_stdout, +++ "\nSee manpage for additional details.\n" +++ " rname Reference name / chromosome\n" +++ " startpos Start position\n" +++ " endpos End position (or sequence length)\n" +++ " numreads Number reads aligned to the region (after filtering)\n" +++ " covbases Number of covered bases with depth >= 1\n" +++ " coverage Proportion of covered bases [0..1]\n" +++ " meandepth Mean depth of coverage\n" +++ " meanbaseq Mean baseQ in covered region\n" +++ " meanmapq Mean mapQ of selected reads\n" +++ ); +++ +++ return EXIT_SUCCESS; +++} +++ +++static char* center_text(char *text, char *buf, int width) { +++ int len = strlen(text); +++ assert(len <= width); +++ int padding = (width - len) / 2; +++ int padding_ex = (width - len) % 2; +++ if (padding >= 1) +++ sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " "); +++ else +++ sprintf(buf, "%s", text); +++ +++ return buf; +++} +++ +++static char* readable_bps(double base_pairs, char *buf) { +++ const char* units[] = {"", "K", "M", "G", "T"}; +++ int i = 0; +++ while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) { +++ base_pairs /= 1000; +++ i++; +++ } +++ sprintf(buf, "%.*f%s", i, base_pairs, units[i]); +++ return buf; +++} +++ +++static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { +++ int i; +++ stats->n_reads = 0; +++ stats->n_selected_reads = 0; +++ stats->summed_mapQ = 0; +++ for (i = 0; i < n_bam_files && data[i]; ++i) { +++ stats->n_reads += data[i]->n_reads; +++ stats->n_selected_reads += data[i]->n_selected_reads; +++ stats->summed_mapQ += data[i]->summed_mapQ; +++ data[i]->n_reads = 0; +++ data[i]->n_selected_reads = 0; +++ data[i]->summed_mapQ = 0; +++ } +++} +++ +++// read one alignment from
one BAM file +++static int read_bam(void *data, bam1_t *b) { +++ bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure +++ int ret; +++ while (1) { +++ if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; +++ ++aux->n_reads; +++ +++ if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; +++ if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; +++ if ( b->core.qual < aux->min_mapQ ) continue; +++ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; +++ ++aux->n_selected_reads; +++ aux->summed_mapQ += b->core.qual; +++ break; +++ } +++ return ret; +++} +++ +++void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { +++ fputs(sam_hdr_tid2name(h, stats->tid), file_out); +++ double region_len = (double) stats->end - stats->beg; +++ fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", +++ stats->beg+1, +++ stats->end, +++ stats->n_selected_reads, +++ stats->n_covered_bases, +++ 100.0 * stats->n_covered_bases / region_len, +++ stats->summed_coverage / region_len, +++ stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0, +++ stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 +++ ); +++} +++ +++// Draw the per-bin breadth-of-coverage histogram for one target, with summary statistics printed to the right of the plot. +++void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, +++ const int hist_size, const bool full_utf) { +++ int i, col; +++ bool show_percentiles = false; +++ const int n_rows = 10; +++ const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; +++ const int blockchar_len = full_utf?
8 : 2; +++ /* +++ if (stats->beg == 0) { +++ stats->end = h->target_len[stats->tid]; +++ } +++ */ +++ double region_len = stats->end - stats->beg; +++ +++ // Calculate histogram that contains percent covered +++ double hist_data[hist_size]; +++ double max_val = 0.0; +++ for (i = 0; i < hist_size; ++i) { +++ hist_data[i] = 100 * hist[i] / (double) stats->bin_width; +++ if (hist_data[i] > max_val) max_val = hist_data[i]; +++ } +++ +++ char buf[30]; +++ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); +++ +++ double row_bin_size = max_val / (double) n_rows; +++ for (i = n_rows-1; i >= 0; --i) { +++ double current_bin = row_bin_size * i; +++ if (show_percentiles) { +++ fprintf(file_out, ">%3i%% ", i*10); +++ } else { +++ fprintf(file_out, ">%7.2f%% ", current_bin); +++ } +++ fprintf(file_out, VERTICAL_LINE); +++ for (col = 0; col < hist_size; ++col) { +++ // get the difference in eighths, or halves when full UTF8 is not supported +++ int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; +++ if (cur_val_diff < 0) { +++ fputc(' ', file_out); +++ } else { +++ if (cur_val_diff >= blockchar_len) +++ cur_val_diff = blockchar_len - 1; +++ +++ fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); +++ } +++ } +++ fprintf(file_out, VERTICAL_LINE); +++ fputc(' ', file_out); +++ switch (i) { +++ case 9: fprintf(file_out, "Number of reads: %u", stats->n_selected_reads); break; // counters are unsigned int +++ case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%u filtered)", stats->n_reads - stats->n_selected_reads); break; +++ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; +++ case 6: fprintf(file_out, "Percent covered: %.4g%%", +++ 100.0 * stats->n_covered_bases / region_len); break; +++ case 5: fprintf(file_out, "Mean coverage: %.3gx", +++ stats->summed_coverage / region_len); break; +++ case 4: fprintf(file_out, "Mean baseQ:
%.3g", +++ stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0.0); break; // same zero guard as print_tabular_line +++ case 3: fprintf(file_out, "Mean mapQ: %.3g", +++ stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0.0); break; // same zero guard as print_tabular_line +++ case 1: fprintf(file_out, "Histo bin width: %sbp", +++ readable_bps(stats->bin_width, buf)); break; +++ case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; +++ }; +++ fputc('\n', file_out); +++ } +++ +++ // print x axis. Could be made pretty for widths that are not divisible +++ // by 10 by variable spacing of the labels, instead of placing a label every 10 characters +++ char buf2[50]; +++ fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); +++ int rest; +++ for (rest = 10; rest < 10*(hist_size/10); rest += 10) { +++ fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); +++ } +++ int last_padding = hist_size%10; +++ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); +++ fprintf(file_out, "\n"); +++} +++ +++int main_coverage(int argc, char *argv[]) { +++ int status = EXIT_SUCCESS; +++ +++ int ret, tid, pos, i, j; +++ +++ int max_depth = 0; +++ int opt_min_baseQ = 0; +++ int opt_min_mapQ = 0; +++ int opt_min_len = 0; +++ int opt_n_bins = 50; +++ bool opt_full_width = true; +++ char *opt_output_file = NULL; +++ bam_aux_t **data = NULL; +++ bam_mplp_t mplp = NULL; +++ const bam_pileup1_t **plp = NULL; +++ uint32_t *hist = NULL; +++ stats_aux_t *stats = NULL; +++ char *opt_reg = 0; // specified region +++ char *opt_file_list = NULL; +++ int n_bam_files = 0; +++ char **fn = NULL; +++ int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags +++ int required_flags = 0; +++ +++ int *n_plp = NULL; +++ sam_hdr_t *h = NULL; // BAM header of the 1st input +++ +++ bool opt_print_header = true; +++ bool opt_print_tabular = true; +++ bool opt_print_histogram = false; +++ bool *covered_tids = NULL; +++ bool
opt_full_utf = true; +++ +++ FILE *file_out = samtools_stdout; +++ +++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; +++ static const struct option lopts[] = { +++ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), +++ {"rf", required_argument, NULL, 1}, // require flag +++ {"ff", required_argument, NULL, 2}, // filter flag +++ {"incl-flags", required_argument, NULL, 1}, // require flag +++ {"excl-flags", required_argument, NULL, 2}, // filter flag +++ {"bam-list", required_argument, NULL, 'b'}, +++ {"min-read-len", required_argument, NULL, 'L'}, +++ {"min-MQ", required_argument, NULL, 'q'}, +++ {"min-mq", required_argument, NULL, 'q'}, +++ {"min-BQ", required_argument, NULL, 'Q'}, +++ {"min-bq", required_argument, NULL, 'Q'}, +++ {"histogram", no_argument, NULL, 'm'}, +++ {"ascii", no_argument, NULL, 'A'}, +++ {"output", required_argument, NULL, 'o'}, +++ {"no-header", no_argument, NULL, 'H'}, +++ {"n-bins", required_argument, NULL, 'w'}, +++ {"region", required_argument, NULL, 'r'}, +++ {"help", no_argument, NULL, 'h'}, +++ { NULL, 0, NULL, 0 } +++ }; +++ +++ // parse the command line +++ int c; +++ opterr = 0; +++ while ((c = getopt_long(argc, argv, "Ao:l:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { +++ switch (c) { +++ case 1: +++ if ((required_flags = bam_str2flag(optarg)) < 0) { +++ fprintf(samtools_stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE; +++ }; break; +++ case 2: +++ if ((fail_flags = bam_str2flag(optarg)) < 0) { +++ fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; +++ }; break; +++ case 'o': opt_output_file = optarg; opt_full_width = false; break; +++ case 'l': case 'L': opt_min_len = atoi(optarg); break; // -l is the spelling shown in usage(); -L kept as an alias +++ case 'q': opt_min_mapQ = atoi(optarg); break; // -q / --min-MQ: alignment mapping-quality filter +++ case 'Q': opt_min_baseQ = atoi(optarg); break; // -Q / --min-BQ: per-base quality filter +++ case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; +++ opt_print_histogram = true; opt_print_tabular = false; +++ break; +++ case 'r': opt_reg = optarg; break; // parsing a region requires a BAM header
(strdup unnecessary) +++ case 'b': opt_file_list = optarg; break; +++ case 'm': opt_print_histogram = true; opt_print_tabular = false; break; +++ case 'A': opt_full_utf = false; +++ opt_print_histogram = true; opt_print_tabular = false; +++ break; +++ case 'H': opt_print_header = false; break; +++ case 'h': return usage(); +++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; +++ /* else fall-through */ +++ case '?': +++ if (optopt != '?') { // '-?' appeared on command line +++ if (optopt) { // Bad short option +++ print_error("coverage", "invalid option -- '%c'", optopt); +++ } else { // Bad long option +++ // Do our best. There is no good solution to finding +++ // out what the bad option was. +++ // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option +++ if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) { +++ print_error("coverage", "unrecognised option '%s'", +++ argv[optind - 1]); +++ } +++ } +++ } +++ return usage(); +++ } +++ } +++ if (optind == argc && !opt_file_list) +++ return usage(); +++ +++ // output file provided by user +++ if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) { +++ file_out = fopen( opt_output_file, "w" ); +++ if (file_out == NULL) { +++ print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file); +++ return EXIT_FAILURE; +++ } +++ } +++ +++ if (opt_n_bins <= 0 || opt_full_width) { +++ // get number of columns of terminal +++ const char* env_columns = getenv("COLUMNS"); +++ int columns = 0; +++ if (env_columns == NULL) { +++#ifdef _WIN32 +++ CONSOLE_SCREEN_BUFFER_INFO csbi; +++ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { +++ columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; +++ } +++#else +++ struct winsize w; +++ if (ioctl(2, TIOCGWINSZ, &w) == 0) +++ columns = w.ws_col; +++#endif +++ } else { +++ columns = atoi(env_columns); // atoi(NULL) returns 0 +++ } +++ +++ if (columns > 60) { 
+++ opt_n_bins = columns - 40; +++ } else { +++ opt_n_bins = 40; +++ } +++ } +++ +++ // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering +++ +++ // Open all BAM files +++ if (opt_file_list) { +++ // Read file names from opt_file_list into argv, and record the number of files in n_bam_files +++ if (read_file_list(opt_file_list, &n_bam_files, &fn)) { +++ print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list); +++ return EXIT_FAILURE; +++ } +++ argv = fn; +++ optind = 0; +++ } else { +++ n_bam_files = argc - optind; // the number of BAMs on the command line +++ } +++ +++ data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file +++ if (!data) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ for (i = 0; i < n_bam_files; ++i) { +++ int rf; +++ data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); +++ if (!data[i]) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM +++ +++ if (data[i]->fp == NULL) { +++ print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; +++ if (opt_min_baseQ) rf |= SAM_QUAL; +++ +++ // Set CRAM options on file handle - returns 0 on success +++ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { +++ print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { +++ print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->min_mapQ = opt_min_mapQ; // set the mapQ filter +++ data[i]->min_len = opt_min_len; // 
set the qlen filter +++ data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header +++ data[i]->fail_flags = fail_flags; +++ data[i]->required_flags = required_flags; +++ if (data[i]->hdr == NULL) { +++ print_error_errno("coverage", "Could not read header for \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ // Lookup region if specified +++ if (opt_reg) { // if a region is specified +++ hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index +++ if (idx == NULL) { +++ print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator +++ hts_idx_destroy(idx); // the index is not needed any more; free the memory +++ if (data[i]->iter == NULL) { +++ print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ } +++ } +++ +++ if (opt_print_tabular && opt_print_header) +++ fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out); +++ +++ h = data[0]->hdr; // easy access to the header of the 1st BAM +++ int n_targets = sam_hdr_nref(h); +++ covered_tids = calloc(n_targets, sizeof(bool)); +++ stats = calloc(1, sizeof(stats_aux_t)); +++ if (!covered_tids || !stats) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ +++ int64_t n_bins = opt_n_bins; +++ if (opt_reg) { +++ stats->tid = data[0]->iter->tid; +++ stats->beg = data[0]->iter->beg; // and to the parsed region coordinates +++ stats->end = data[0]->iter->end; +++ if (stats->end == HTS_POS_MAX) { +++ stats->end = sam_hdr_tid2len(h, stats->tid); +++ } +++ if (opt_n_bins > stats->end - stats->beg) { +++ n_bins = stats->end - stats->beg; +++ } +++ stats->bin_width = (stats->end-stats->beg) / n_bins; +++ } else { 
+++ stats->tid = -1; +++ } +++ +++ int64_t current_bin = 0; +++ +++ // the core multi-pileup loop +++ mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization +++ if (max_depth > 0) +++ bam_mplp_set_maxcnt(mplp, max_depth); // set maximum coverage depth +++ else if (!max_depth) +++ bam_mplp_set_maxcnt(mplp, INT_MAX); +++ +++ +++ // Extra info for histogram and coverage counting +++ hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t)); +++ n_plp = (int*) calloc(n_bam_files, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM (an int, not a pointer) +++ plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) +++ if (!hist || !n_plp || !plp) { +++ print_error("coverage", "Failed to allocate memory"); +++ status = EXIT_FAILURE; +++ goto coverage_end; +++ } +++ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position +++ +++ // diff number of @SQ lines per file? Reject the tid before it is used to index covered_tids[] +++ // (sized from the first file's header) or stored in stats->tid. +++ if (tid < 0 || tid >= n_targets) continue; +++ +++ if (tid != stats->tid) { // Next target sequence +++ if (stats->tid >= 0) { // It's not the first sequence, print results +++ set_read_counts(data, stats, n_bam_files); +++ if (opt_print_histogram) { +++ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); +++ fputc('\n', file_out); +++ } else if (opt_print_tabular) { +++ print_tabular_line(file_out, h, stats); +++ } +++ +++ // reset data +++ memset(stats, 0, sizeof(stats_aux_t)); +++ if (opt_print_histogram) +++ memset(hist, 0, n_bins*sizeof(uint32_t)); +++ } +++ +++ stats->tid = tid; +++ covered_tids[tid] = true; +++ if (!opt_reg) +++ stats->end = sam_hdr_tid2len(h, tid); +++ +++ if (opt_print_histogram) { +++ n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; +++ stats->bin_width = (stats->end-stats->beg) / n_bins; +++ } +++ } +++ if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip
+++ +++ if (opt_print_histogram) { +++ current_bin = (pos - stats->beg) / stats->bin_width; +++ } +++ +++ bool count_base = false; +++ for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here +++ int depth_at_pos = n_plp[i]; +++ for (j = 0; j < n_plp[i]; ++j) { +++ const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know +++ +++ if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos +++ else if (p->qpos < p->b->core.l_qseq && +++ bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality +++ else +++ stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; +++ } +++ if (depth_at_pos > 0) { +++ count_base = true; +++ stats->summed_coverage += depth_at_pos; +++ } +++ // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage +++ //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output +++ } +++ if (count_base) { +++ ++(stats->n_covered_bases); +++ if (opt_print_histogram && current_bin < n_bins) +++ ++(hist[current_bin]); // Histogram based on breadth of coverage +++ } +++ } +++ +++ if (stats->tid != -1) { +++ set_read_counts(data, stats, n_bam_files); +++ if (opt_print_histogram) { +++ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); +++ } else if (opt_print_tabular) { +++ print_tabular_line(file_out, h, stats); +++ } +++ } +++ +++ +++ if (!opt_reg && opt_print_tabular) { +++ memset(stats, 0, sizeof(stats_aux_t)); +++ for (i = 0; i < n_targets; ++i) { +++ if (!covered_tids[i]) { +++ stats->tid = i; +++ stats->end = sam_hdr_tid2len(h, i); +++ print_tabular_line(file_out, h, stats); +++ } +++ } +++ } +++ +++ if (ret < 0) status = EXIT_FAILURE; +++ +++coverage_end: +++ if (n_plp) free(n_plp); +++ if (plp) free(plp); +++ bam_mplp_destroy(mplp); +++ +++ if (covered_tids) free(covered_tids); +++ if (hist) free(hist); +++ if (stats) free(stats); +++ +++ +++ // Close files and free data structures +++ if 
(!(file_out == samtools_stdout || fclose(file_out) == 0)) { +++ if (status == EXIT_SUCCESS) { +++ print_error_errno("coverage", "error on closing \"%s\"", +++ (opt_output_file && strcmp(opt_output_file, "-") != 0? +++ opt_output_file : "samtools_stdout")); +++ status = EXIT_FAILURE; +++ } +++ } +++ +++ if (data) { +++ for (i = 0; i < n_bam_files && data[i]; ++i) { +++ sam_hdr_destroy(data[i]->hdr); +++ if (data[i]->fp) sam_close(data[i]->fp); +++ hts_itr_destroy(data[i]->iter); +++ free(data[i]); +++ } +++ free(data); +++ } +++ +++ if (opt_file_list && fn) { +++ for (i = 0; i < n_bam_files; ++i) +++ free(fn[i]); +++ free(fn); +++ } +++ sam_global_args_free(&ga); +++ +++ return status; +++} +++ +++#ifdef _MAIN_BAMCOV +++int samtools_coverage_main(int argc, char *argv[]) { +++ return main_coverage(argc, argv); +++} +++#endif ++--- python-pysam.orig/samtools/cut_target.c +++++ python-pysam/samtools/cut_target.c ++@@ -1,7 +1,7 @@ ++ /* cut_target.c -- targetcut subcommand. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. +++ Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -49,9 +49,9 @@ ++ int min_baseQ, tid, max_bases; ++ uint16_t *bases; ++ samFile *fp; ++- bam_hdr_t *h; +++ sam_hdr_t *h; ++ char *ref; ++- int len; +++ hts_pos_t len; ++ faidx_t *fai; ++ errmod_t *em; ++ } ct_t; ++@@ -92,9 +92,10 @@ ++ return ret<<8|k; ++ } ++ ++-static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) +++static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns) ++ { ++- int i, f[2][2], *prev, *curr, *swap_tmp, s; +++ int64_t i, s; +++ int f[2][2], *prev, *curr, *swap_tmp; ++ uint8_t *b; // backtrack array ++ b = calloc(l, 1); ++ f[0][0] = f[0][1] = 0; ++@@ -123,11 +124,11 @@ ++ s = b[i]>>s&1; ++ } ++ // print ++- for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { +++ for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) { ++ if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { ++ if (s >= 0) { ++- int j; ++- printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); +++ int64_t j; +++ printf("%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s); ++ for (j = s; j < i; ++j) { ++ int c = cns[j]>>8; ++ if (c == 0) putchar('N'); ++@@ -157,7 +158,7 @@ ++ if ( g->fai && b->core.tid >= 0 ) { ++ if (b->core.tid != g->tid) { // then load the sequence ++ free(g->ref); ++- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); +++ g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len); ++ g->tid = b->core.tid; ++ } ++ sam_prob_realn(b, g->ref, g->len, 1<<1|1); ++@@ -169,7 +170,8 @@ ++ ++ int main_cut_target(int argc, char *argv[]) ++ { ++- int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0; +++ int c, tid, pos, n, lasttid = -1, usage = 0; +++ hts_pos_t l, max_l; ++ const bam_pileup1_t *p; ++ bam_plp_t plp; ++ uint16_t *cns; ++@@ -201,7 +203,7 @@ ++ } ++ if (usage || argc == optind) { ++ fprintf(stderr, "Usage: samtools targetcut [-Q minQ] 
[-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); ++- sam_global_opt_help(stderr, "-.--f-"); +++ sam_global_opt_help(stderr, "-.--f--."); ++ return 1; ++ } ++ l = max_l = 0; cns = 0; ++@@ -223,12 +225,12 @@ ++ if (tid < 0) break; ++ if (tid != lasttid) { // change of chromosome ++ if (cns) process_cns(g.h, lasttid, l, cns); ++- if (max_l < g.h->target_len[tid]) { ++- max_l = g.h->target_len[tid]; +++ if (max_l < sam_hdr_tid2len(g.h, tid)) { +++ max_l = sam_hdr_tid2len(g.h, tid); ++ kroundup32(max_l); ++ cns = realloc(cns, max_l * 2); ++ } ++- l = g.h->target_len[tid]; +++ l = sam_hdr_tid2len(g.h, tid); ++ memset(cns, 0, max_l * 2); ++ lasttid = tid; ++ } ++@@ -236,7 +238,7 @@ ++ } ++ process_cns(g.h, lasttid, l, cns); ++ free(cns); ++- bam_hdr_destroy(g.h); +++ sam_hdr_destroy(g.h); ++ bam_plp_destroy(plp); ++ sam_close(g.fp); ++ if (g.fai) { ++--- python-pysam.orig/samtools/cut_target.c.pysam.c +++++ python-pysam/samtools/cut_target.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* cut_target.c -- targetcut subcommand. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. +++ Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -51,9 +51,9 @@ ++ int min_baseQ, tid, max_bases; ++ uint16_t *bases; ++ samFile *fp; ++- bam_hdr_t *h; +++ sam_hdr_t *h; ++ char *ref; ++- int len; +++ hts_pos_t len; ++ faidx_t *fai; ++ errmod_t *em; ++ } ct_t; ++@@ -94,9 +94,10 @@ ++ return ret<<8|k; ++ } ++ ++-static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) +++static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns) ++ { ++- int i, f[2][2], *prev, *curr, *swap_tmp, s; +++ int64_t i, s; +++ int f[2][2], *prev, *curr, *swap_tmp; ++ uint8_t *b; // backtrack array ++ b = calloc(l, 1); ++ f[0][0] = f[0][1] = 0; ++@@ -125,11 +126,11 @@ ++ s = b[i]>>s&1; ++ } ++ // print ++- for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { +++ for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) { ++ if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { ++ if (s >= 0) { ++- int j; ++- fprintf(samtools_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); +++ int64_t j; +++ fprintf(samtools_stdout, "%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s); ++ for (j = s; j < i; ++j) { ++ int c = cns[j]>>8; ++ if (c == 0) fputc('N', samtools_stdout); ++@@ -159,7 +160,7 @@ ++ if ( g->fai && b->core.tid >= 0 ) { ++ if (b->core.tid != g->tid) { // then load the sequence ++ free(g->ref); ++- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); +++ g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len); ++ g->tid = b->core.tid; ++ } ++ sam_prob_realn(b, g->ref, g->len, 1<<1|1); ++@@ -171,7 +172,8 @@ ++ ++ int main_cut_target(int argc, char *argv[]) ++ { ++- int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0; +++ int c, tid, pos, n, lasttid = -1, usage = 0; +++ hts_pos_t l, max_l; ++ const bam_pileup1_t *p; ++ bam_plp_t plp; ++ uint16_t *cns; ++@@ -203,7 +205,7 @@ ++ } ++ if (usage || argc == optind) { ++ 
fprintf(samtools_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); ++- sam_global_opt_help(samtools_stderr, "-.--f-"); +++ sam_global_opt_help(samtools_stderr, "-.--f--."); ++ return 1; ++ } ++ l = max_l = 0; cns = 0; ++@@ -225,12 +227,12 @@ ++ if (tid < 0) break; ++ if (tid != lasttid) { // change of chromosome ++ if (cns) process_cns(g.h, lasttid, l, cns); ++- if (max_l < g.h->target_len[tid]) { ++- max_l = g.h->target_len[tid]; +++ if (max_l < sam_hdr_tid2len(g.h, tid)) { +++ max_l = sam_hdr_tid2len(g.h, tid); ++ kroundup32(max_l); ++ cns = realloc(cns, max_l * 2); ++ } ++- l = g.h->target_len[tid]; +++ l = sam_hdr_tid2len(g.h, tid); ++ memset(cns, 0, max_l * 2); ++ lasttid = tid; ++ } ++@@ -238,7 +240,7 @@ ++ } ++ process_cns(g.h, lasttid, l, cns); ++ free(cns); ++- bam_hdr_destroy(g.h); +++ sam_hdr_destroy(g.h); ++ bam_plp_destroy(plp); ++ sam_close(g.fp); ++ if (g.fai) { ++--- python-pysam.orig/samtools/dict.c +++++ python-pysam/samtools/dict.c ++@@ -98,6 +98,7 @@ ++ hts_md5_destroy(md5); ++ ++ if (args->output_fname) fclose(out); +++ gzclose(fp); ++ } ++ ++ static int dict_usage(void) ++--- python-pysam.orig/samtools/dict.c.pysam.c +++++ python-pysam/samtools/dict.c.pysam.c ++@@ -100,6 +100,7 @@ ++ hts_md5_destroy(md5); ++ ++ if (args->output_fname) fclose(out); +++ gzclose(fp); ++ } ++ ++ static int dict_usage(void) ++--- python-pysam.orig/samtools/faidx.c +++++ python-pysam/samtools/faidx.c ++@@ -1,6 +1,6 @@ ++ /* faidx.c -- faidx subcommand. ++ ++- Copyright (C) 2008, 2009, 2013, 2016, 2018 Genome Research Ltd. +++ Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. 
++ ++ Author: Heng Li ++@@ -67,9 +67,9 @@ ++ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ++ }; ++ ++-static void reverse_complement(char *str, int len) { +++static void reverse_complement(char *str, const hts_pos_t len) { ++ char c; ++- int i = 0, j = len - 1; +++ hts_pos_t i = 0, j = len - 1; ++ ++ while (i <= j) { ++ c = str[i]; ++@@ -80,10 +80,9 @@ ++ } ++ } ++ ++- ++-static void reverse(char *str, int len) { +++static void reverse(char *str, const hts_pos_t len) { ++ char c; ++- int i = 0, j = len - 1; +++ hts_pos_t i = 0, j = len - 1; ++ ++ while (i < j) { ++ c = str[i]; ++@@ -95,9 +94,10 @@ ++ } ++ ++ ++-static int write_line(FILE *file, const char *line, const char *name, const int ignore, ++- const int length, const int seq_len) { ++- int beg, end; +++static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, +++ const int ignore, const int length, const hts_pos_t seq_len) { +++ int id; +++ hts_pos_t beg, end; ++ ++ if (seq_len < 0) { ++ fprintf(stderr, "[faidx] Failed to fetch sequence in %s\n", name); ++@@ -109,15 +109,16 @@ ++ } ++ } else if (seq_len == 0) { ++ fprintf(stderr, "[faidx] Zero length sequence: %s\n", name); ++- } else if (hts_parse_reg(name, &beg, &end) && (end < INT_MAX) && (seq_len != end - beg)) { +++ } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) +++ && (end < INT_MAX) && (seq_len != end - beg)) { ++ fprintf(stderr, "[faidx] Truncated sequence: %s\n", name); ++ } ++ ++- size_t i, seq_sz = seq_len; +++ hts_pos_t i, seq_sz = seq_len; ++ ++ for (i = 0; i < seq_sz; i += length) ++ { ++- size_t len = i + length < seq_sz ? length : seq_sz - i; +++ hts_pos_t len = i + length < seq_sz ? 
length : seq_sz - i; ++ if (fwrite(line + i, 1, len, file) < len || ++ fputc('\n', file) == EOF) { ++ print_error_errno("faidx", "failed to write output"); ++@@ -133,8 +134,8 @@ ++ const int length, const int rev, ++ const char *pos_strand_name, const char *neg_strand_name, ++ enum fai_format_options format) { ++- int seq_len; ++- char *seq = fai_fetch(faid, name, &seq_len); +++ hts_pos_t seq_len; +++ char *seq = fai_fetch64(faid, name, &seq_len); ++ ++ if (format == FAI_FASTA) { ++ fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name); ++@@ -146,7 +147,8 @@ ++ reverse_complement(seq, seq_len); ++ } ++ ++- if (write_line(file, seq, name, ignore, length, seq_len) == EXIT_FAILURE) { +++ if (write_line(faid, file, seq, name, ignore, length, seq_len) +++ == EXIT_FAILURE) { ++ free(seq); ++ return EXIT_FAILURE; ++ } ++@@ -156,14 +158,15 @@ ++ if (format == FAI_FASTQ) { ++ fprintf(file, "+\n"); ++ ++- char *qual = fai_fetchqual(faid, name, &seq_len); +++ char *qual = fai_fetchqual64(faid, name, &seq_len); ++ ++ if (rev && seq_len > 0) { ++ reverse(qual, seq_len); ++ } ++ ++- if (write_line(file, qual, name, ignore, length, seq_len) == EXIT_FAILURE) { ++- free(seq); +++ if (write_line(faid, file, qual, name, ignore, length, seq_len) +++ == EXIT_FAILURE) { +++ free(qual); ++ return EXIT_FAILURE; ++ } ++ ++--- python-pysam.orig/samtools/faidx.c.pysam.c +++++ python-pysam/samtools/faidx.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* faidx.c -- faidx subcommand. ++ ++- Copyright (C) 2008, 2009, 2013, 2016, 2018 Genome Research Ltd. +++ Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. 
++ ++ Author: Heng Li ++@@ -69,9 +69,9 @@ ++ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ++ }; ++ ++-static void reverse_complement(char *str, int len) { +++static void reverse_complement(char *str, const hts_pos_t len) { ++ char c; ++- int i = 0, j = len - 1; +++ hts_pos_t i = 0, j = len - 1; ++ ++ while (i <= j) { ++ c = str[i]; ++@@ -82,10 +82,9 @@ ++ } ++ } ++ ++- ++-static void reverse(char *str, int len) { +++static void reverse(char *str, const hts_pos_t len) { ++ char c; ++- int i = 0, j = len - 1; +++ hts_pos_t i = 0, j = len - 1; ++ ++ while (i < j) { ++ c = str[i]; ++@@ -97,9 +96,10 @@ ++ } ++ ++ ++-static int write_line(FILE *file, const char *line, const char *name, const int ignore, ++- const int length, const int seq_len) { ++- int beg, end; +++static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, +++ const int ignore, const int length, const hts_pos_t seq_len) { +++ int id; +++ hts_pos_t beg, end; ++ ++ if (seq_len < 0) { ++ fprintf(samtools_stderr, "[faidx] Failed to fetch sequence in %s\n", name); ++@@ -111,15 +111,16 @@ ++ } ++ } else if (seq_len == 0) { ++ fprintf(samtools_stderr, "[faidx] Zero length sequence: %s\n", name); ++- } else if (hts_parse_reg(name, &beg, &end) && (end < INT_MAX) && (seq_len != end - beg)) { +++ } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) +++ && (end < INT_MAX) && (seq_len != end - beg)) { ++ fprintf(samtools_stderr, "[faidx] Truncated sequence: %s\n", name); ++ } ++ ++- size_t i, seq_sz = seq_len; +++ hts_pos_t i, seq_sz = seq_len; ++ ++ for (i = 0; i < seq_sz; i += length) ++ { ++- size_t len = i + length < seq_sz ? length : seq_sz - i; +++ hts_pos_t len = i + length < seq_sz ? 
length : seq_sz - i; ++ if (fwrite(line + i, 1, len, file) < len || ++ fputc('\n', file) == EOF) { ++ print_error_errno("faidx", "failed to write output"); ++@@ -135,8 +136,8 @@ ++ const int length, const int rev, ++ const char *pos_strand_name, const char *neg_strand_name, ++ enum fai_format_options format) { ++- int seq_len; ++- char *seq = fai_fetch(faid, name, &seq_len); +++ hts_pos_t seq_len; +++ char *seq = fai_fetch64(faid, name, &seq_len); ++ ++ if (format == FAI_FASTA) { ++ fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name); ++@@ -148,7 +149,8 @@ ++ reverse_complement(seq, seq_len); ++ } ++ ++- if (write_line(file, seq, name, ignore, length, seq_len) == EXIT_FAILURE) { +++ if (write_line(faid, file, seq, name, ignore, length, seq_len) +++ == EXIT_FAILURE) { ++ free(seq); ++ return EXIT_FAILURE; ++ } ++@@ -158,14 +160,15 @@ ++ if (format == FAI_FASTQ) { ++ fprintf(file, "+\n"); ++ ++- char *qual = fai_fetchqual(faid, name, &seq_len); +++ char *qual = fai_fetchqual64(faid, name, &seq_len); ++ ++ if (rev && seq_len > 0) { ++ reverse(qual, seq_len); ++ } ++ ++- if (write_line(file, qual, name, ignore, length, seq_len) == EXIT_FAILURE) { ++- free(seq); +++ if (write_line(faid, file, qual, name, ignore, length, seq_len) +++ == EXIT_FAILURE) { +++ free(qual); ++ return EXIT_FAILURE; ++ } ++ ++--- python-pysam.orig/samtools/htslib-1.9/LICENSE +++++ /dev/null ++@@ -1,69 +0,0 @@ ++-[Files in this distribution outwith the cram/ subdirectory are distributed ++-according to the terms of the following MIT/Expat license.] ++- ++-The MIT/Expat License ++- ++-Copyright (C) 2012-2018 Genome Research Ltd. 
++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. ++- ++- ++-[Files within the cram/ subdirectory in this distribution are distributed ++-according to the terms of the following Modified 3-Clause BSD license.] ++- ++-The Modified-BSD License ++- ++-Copyright (C) 2012-2018 Genome Research Ltd. ++- ++-Redistribution and use in source and binary forms, with or without ++-modification, are permitted provided that the following conditions are met: ++- ++-1. Redistributions of source code must retain the above copyright notice, ++- this list of conditions and the following disclaimer. ++- ++-2. Redistributions in binary form must reproduce the above copyright notice, ++- this list of conditions and the following disclaimer in the documentation ++- and/or other materials provided with the distribution. ++- ++-3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute ++- nor the names of its contributors may be used to endorse or promote products ++- derived from this software without specific prior written permission. ++- ++-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" ++-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE ++-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++- ++- ++-[The use of a range of years within a copyright notice in this distribution ++-should be interpreted as being equivalent to a list of years including the ++-first and last year specified and all consecutive years between them. ++- ++-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, ++-2011-2012" should be interpreted as being identical to a notice that reads ++-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice ++-that reads "Copyright (C) 2005-2012" should be interpreted as being identical ++-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, ++-2011, 2012".] ++--- python-pysam.orig/samtools/htslib-1.9/README +++++ /dev/null ++@@ -1,5 +0,0 @@ ++-HTSlib is an implementation of a unified C library for accessing common file ++-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing ++-data. It is the core library used by samtools and bcftools. 
++- ++-See INSTALL for building and installation instructions. ++--- python-pysam.orig/samtools/misc/ace2sam.c +++++ python-pysam/samtools/misc/ace2sam.c ++@@ -93,7 +93,8 @@ ++ s.l = s.m = 0; s.s = 0; ++ af_n = af_max = af_i = 0; af = 0; ++ for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; ++- fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); +++ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); +++ if (fp == NULL) fatal("can't open input file"); ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, &s, &dret) >= 0) { ++ if (strcmp(s.s, "CO") == 0) { // contig sequence ++--- python-pysam.orig/samtools/misc/ace2sam.c.pysam.c +++++ python-pysam/samtools/misc/ace2sam.c.pysam.c ++@@ -95,7 +95,8 @@ ++ s.l = s.m = 0; s.s = 0; ++ af_n = af_max = af_i = 0; af = 0; ++ for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; ++- fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); +++ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); +++ if (fp == NULL) fatal("can't open input file"); ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, &s, &dret) >= 0) { ++ if (strcmp(s.s, "CO") == 0) { // contig sequence ++--- python-pysam.orig/samtools/padding.c +++++ python-pysam/samtools/padding.c ++@@ -1,7 +1,7 @@ ++ /* padding.c -- depad subcommand. ++ ++ Copyright (C) 2011, 2012 Broad Institute. ++- Copyright (C) 2014-2016 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. ++ ++ Author: Heng Li ++@@ -29,10 +29,10 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++-#include "sam_header.h" ++ #include "sam_opts.h" ++ #include "samtools.h" ++ ++@@ -62,6 +62,10 @@ ++ if (_n == _m) { \ ++ _m = _m? 
_m<<1 : 4; \ ++ _c = (uint32_t*)realloc(_c, _m * 4); \ +++ if (!(_c)) { \ +++ fprintf(stderr, "[depad] ERROR: Memory allocation failure.\n"); \ +++ return -1; \ +++ } \ ++ } \ ++ _c[_n++] = (_v); \ ++ } while (0) ++@@ -107,15 +111,15 @@ ++ return length != s->l; ++ } ++ ++-int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) +++int load_unpadded_ref(faidx_t *fai, const char *ref_name, hts_pos_t ref_len, kstring_t *seq) ++ { ++ char base; ++ char *fai_ref = 0; ++- int fai_ref_len = 0, k; +++ hts_pos_t fai_ref_len = 0, k; ++ ++- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); +++ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); ++ if (fai_ref_len != ref_len) { ++- fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); +++ fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, ref_len); ++ free(fai_ref); ++ return -1; ++ } ++@@ -141,16 +145,16 @@ ++ return 0; ++ } ++ ++-int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) +++hts_pos_t get_unpadded_len(faidx_t *fai, const char *ref_name, hts_pos_t padded_len) ++ { ++ char base; ++ char *fai_ref = 0; ++- int fai_ref_len = 0, k; ++- int bases=0, gaps=0; +++ hts_pos_t fai_ref_len = 0, k; +++ hts_pos_t bases=0, gaps=0; ++ ++- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); +++ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); ++ if (fai_ref_len != padded_len) { ++- fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); +++ fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, padded_len); ++ free(fai_ref); ++ return -1; ++ } ++@@ -185,7 +189,7 @@ ++ return posmap; ++ } ++ ++-int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) +++int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) ++ { 
++ bam1_t *b = 0; ++ kstring_t r, q; ++@@ -207,21 +211,21 @@ ++ ++ uint32_t *cigar = bam_get_cigar(b); ++ n2 = 0; ++- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { +++ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid)) == 0) { ++ // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); ++ r_tid = b->core.tid; ++ if (0!=unpad_seq(b, &r)) { ++ fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); ++ return -1; ++ }; ++- if (h->target_len[r_tid] != r.l) { ++- fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); +++ if (sam_hdr_tid2len(h, r_tid) != r.l) { +++ fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %"PRId64" in BAM header, but %zu in embedded reference\n", bam_get_qname(b), (int64_t) sam_hdr_tid2len(h, r_tid), r.l); ++ return -1; ++ } ++ if (fai) { ++ // Check the embedded reference matches the FASTA file ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { ++- fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &q)) { +++ fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ assert(r.l == q.l); ++@@ -230,7 +234,7 @@ ++ if (r.s[i] != q.s[i]) { ++ // Show gaps as ASCII 45 ++ fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", ++- h->target_name[b->core.tid], i+1, +++ sam_hdr_tid2name(h, b->core.tid), i+1, ++ r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, ++ q.s[i] ? 
seq_nt16_str[(int)q.s[i]] : 45); ++ return -1; ++@@ -249,15 +253,15 @@ ++ ; // good case, reference available ++ //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); ++ } else if (fai) { ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { ++- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { +++ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++ r_tid = b->core.tid; ++ // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); ++ } else { ++- fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); +++ fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ if (0!=unpad_seq(b, &q)) { ++@@ -343,19 +347,19 @@ ++ /* Nasty case, Must load alternative posmap */ ++ // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); ++ if (!fai) { ++- fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); +++ fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.mtid)); ++ return -1; ++ } ++ /* Temporarily load the other reference sequence */ ++- if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { ++- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.mtid), sam_hdr_tid2len(h, b->core.mtid), 
&r)) { +++ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.mtid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++ b->core.mpos = posmap[b->core.mpos]; ++ /* Restore the reference and posmap*/ ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { ++- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { +++ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++@@ -374,126 +378,47 @@ ++ ret = 1; ++ } ++ free(r.s); free(q.s); free(posmap); +++ free(cigar2); ++ bam_destroy1(b); ++ return ret; ++ } ++ ++-bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) +++sam_hdr_t * fix_header(sam_hdr_t *old, faidx_t *fai) ++ { ++- int i = 0, unpadded_len = 0; ++- bam_hdr_t *header = 0 ; ++- unsigned short ln_found; ++- ++- header = bam_hdr_dup(old); ++- for (i = 0; i < old->n_targets; ++i) { ++- unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); +++ int i = 0, ret = 0; +++ hts_pos_t unpadded_len = 0; +++ sam_hdr_t *header = sam_hdr_dup(old); +++ if (!header) +++ return NULL; +++ +++ int nref = sam_hdr_nref(old); +++ char len_buf[64]; +++ +++ for (i = 0; i < nref; ++i) { +++ unpadded_len = get_unpadded_len(fai, sam_hdr_tid2name(old, i), sam_hdr_tid2len(old, i)); ++ if (unpadded_len < 0) { ++- fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); +++ fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %"PRIhts_pos"\n", sam_hdr_tid2name(old, i), (hts_pos_t) sam_hdr_tid2len(old, i)); +++ } else if (unpadded_len > sam_hdr_tid2len(old, i)) { +++ fprintf(stderr, "[depad] New unpadded 
length of '%s' is larger than the padded length (%"PRIhts_pos" > %"PRIhts_pos")\n", +++ sam_hdr_tid2name(old, i), unpadded_len, +++ (hts_pos_t) sam_hdr_tid2len(old, i)); +++ ret = 1; ++ } else { ++- header->target_len[i] = unpadded_len; +++ sprintf(len_buf, "%"PRIhts_pos"", unpadded_len); +++ if ((ret |= sam_hdr_update_line(header, "SQ", "SN", sam_hdr_tid2name(header, i), "LN", len_buf, NULL))) +++ fprintf(stderr, "[depad] Error updating length of '%s' from %"PRIhts_pos" to %"PRIhts_pos"\n", +++ sam_hdr_tid2name(header, i), +++ (hts_pos_t) sam_hdr_tid2len(header, i), +++ unpadded_len); ++ //fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); ++ } ++ } ++- /* Duplicating the header allocated new buffer for header string */ ++- /* After modifying the @SQ lines it will only get smaller, since */ ++- /* the LN entries will be the same or shorter, and we'll remove */ ++- /* any MD entries (MD5 checksums). */ ++- assert(strlen(old->text) == strlen(header->text)); ++- assert (0==strcmp(old->text, header->text)); ++- const char *text; ++- text = old->text; ++- header->text[0] = '\0'; /* Resuse the allocated buffer */ ++- char * newtext = header->text; ++- char * end=NULL; ++- while (text[0]=='@') { ++- end = strchr(text, '\n'); ++- assert(end != 0); ++- if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { ++- const char *cp = text+3; ++- char *name = strstr(text, "\tSN:"); ++- char *name_end; ++- if (!name) { ++- fprintf(stderr, "Unable to find SN: header field\n"); ++- return NULL; ++- } ++- name += 4; ++- for (name_end = name; name_end != end && *name_end != '\t'; name_end++); ++- strcat(newtext, "@SQ"); ++- ln_found = 0; ++- ++- /* Parse the @SQ lines */ ++- while (cp != end) { ++- if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { ++- // Rewrite the length ++- char len_buf[100]; ++- int tid; ++- unsigned int old_length, new_length; ++- const char *old_cp = cp; ++- ++- ln_found = 1; 
++- ++- while (cp != end && *cp++ != '\t'); ++- old_length = (int)(cp - old_cp); ++- ++- for (tid = 0; tid < header->n_targets; tid++) { ++- // may want to hash this, but new header API incoming. ++- if (strncmp(name, header->target_name[tid], name_end - name) == 0) { ++- new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); ++- if (new_length <= old_length) { ++- strcat(newtext, len_buf); ++- } ++- else { ++- fprintf(stderr, "LN value of the reference is larger than the original!\n"); ++- exit(1); ++- } ++- break; ++- } ++- } ++ ++- if (cp != end) ++- strcat(newtext, "\t"); ++- } else if (end-cp >= 2 && ++- ((ln_found && strncmp(cp, "LN", 2) == 0) || ++- strncmp(cp, "M5", 2) == 0 || ++- strncmp(cp, "UR", 2) == 0)) ++- { ++- // skip secondary LNs ++- // MD5 changed during depadding; ditch it. ++- // URLs are also invalid. ++- while (cp != end && *cp++ != '\t'); ++- } else { ++- // Otherwise copy this sub-field verbatim ++- const char *cp_start = cp; ++- while (cp != end && *cp++ != '\t'); ++- strncat(newtext, cp_start, cp-cp_start); ++- } ++- } ++- ++- // Add newline, replacing trailing '\t' if last on line was the LN: ++- char *text_end = newtext + strlen(newtext); ++- if (text_end[-1] == '\t') ++- text_end[-1] = '\n'; ++- else ++- *text_end++ = '\n', *text_end = '\0'; ++- } else { ++- /* Copy this line to the new header */ ++- strncat(newtext, text, end - text + 1); ++- } ++- text = end + 1; +++ if (ret) { +++ sam_hdr_destroy(header); +++ return NULL; ++ } ++- assert (text[0]=='\0'); ++- /* Check we didn't overflow the buffer */ ++- assert (strlen(header->text) <= strlen(old->text)); ++- if (strlen(header->text) < header->l_text) { ++- //fprintf(stderr, "[depad] Reallocating header buffer\n"); ++- assert (newtext == header->text); ++- newtext = malloc(strlen(header->text) + 1); ++- strcpy(newtext, header->text); ++- free(header->text); ++- header->text = newtext; ++- header->l_text = strlen(newtext); ++- } ++- //fprintf(stderr, "[depad] Here is the new 
header (pending @SQ lines),\n\n%s\n(end)\n", header->text); +++ ++ return header; ++ } ++ ++@@ -502,15 +427,17 @@ ++ int main_pad2unpad(int argc, char *argv[]) ++ { ++ samFile *in = 0, *out = 0; ++- bam_hdr_t *h = 0, *h_fix = 0; +++ sam_hdr_t *h = 0, *h_fix = 0; ++ faidx_t *fai = 0; ++- int c, compress_level = -1, is_long_help = 0; ++- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; +++ int c, compress_level = -1, is_long_help = 0, no_pg = 0; +++ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; ++ int ret=0; +++ char *arg_list = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -532,6 +459,7 @@ ++ if (ga.out.format == unknown_format) ++ hts_parse_format(&ga.out, "bam"); ++ break; +++ case 1: no_pg = 1; break; ++ case '?': is_long_help = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); ++@@ -569,7 +497,11 @@ ++ goto depad_end; ++ } ++ if (fai) { ++- h_fix = fix_header(h, fai); +++ if (!(h_fix = fix_header(h, fai))){ +++ fprintf(stderr, "[depad] failed to fix the header from\n"); +++ ret = 1; +++ goto depad_end; +++ } ++ } else { ++ fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); ++ h_fix = h; ++@@ -587,25 +519,61 @@ ++ if (ga.out.format == cram) ++ hts_set_opt(out, CRAM_OPT_NO_REF, 1); ++ +++ if (!no_pg) { +++ if(!(arg_list = stringify_argv(argc+1, argv-1))) { +++ fprintf(stderr, "[depad] failed to create arg_list\n"); +++ ret = 1; +++ goto depad_end; +++ } +++ +++ if (sam_hdr_add_pg(h_fix, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ fprintf(stderr, "[depad] failed to add PG line to header\n"); +++ ret = 1; +++ goto depad_end; +++ } +++ } +++ ++ if (sam_hdr_write(out, h_fix) != 0) { ++ fprintf(stderr, "[depad] failed to write header.\n"); ++ ret = 1; ++ goto depad_end; ++ } +++ if (ga.write_index) { +++ if (!(fn_out_idx = auto_index(out, fn_out, h_fix))) { +++ ret = 1; +++ goto depad_end; +++ } +++ } ++ ++ // Do the depad ++ if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; ++ +++ if (ga.write_index) { +++ if (sam_idx_save(out) < 0) { +++ print_error_errno("depad", "writing index failed"); +++ ret = 1; +++ } +++ } +++ ++ depad_end: ++ // close files, free and return +++ free(arg_list); ++ if (fai) fai_destroy(fai); ++- if (h) bam_hdr_destroy(h); +++ if (h) sam_hdr_destroy(h); +++ if (h_fix && h_fix != h) sam_hdr_destroy(h_fix); ++ if (in) sam_close(in); ++ if (out && sam_close(out) < 0) { ++ fprintf(stderr, "[depad] error on closing output file.\n"); ++ ret = 1; ++ } ++ free(fn_list); free(fn_out); +++ if (fn_out_idx) +++ free(fn_out_idx); +++ sam_global_args_free(&ga); ++ return ret; ++ } ++ ++@@ -621,8 +589,9 @@ ++ fprintf(stderr, " -T, --reference FILE\n"); ++ fprintf(stderr, " Padded reference sequence file [null]\n"); ++ fprintf(stderr, " -o FILE Output file name [stdout]\n"); +++ fprintf(stderr, " --no-PG do not add a PG line\n"); ++ fprintf(stderr, " -? Longer help\n"); ++- sam_global_opt_help(stderr, "-...--"); +++ sam_global_opt_help(stderr, "-...--.."); ++ ++ if (is_long_help) ++ fprintf(stderr, ++--- python-pysam.orig/samtools/padding.c.pysam.c +++++ python-pysam/samtools/padding.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* padding.c -- depad subcommand. ++ ++ Copyright (C) 2011, 2012 Broad Institute. ++- Copyright (C) 2014-2016 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2019 Genome Research Ltd. ++ Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. 
++ ++ Author: Heng Li ++@@ -31,10 +31,10 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++-#include "sam_header.h" ++ #include "sam_opts.h" ++ #include "samtools.h" ++ ++@@ -64,6 +64,10 @@ ++ if (_n == _m) { \ ++ _m = _m? _m<<1 : 4; \ ++ _c = (uint32_t*)realloc(_c, _m * 4); \ +++ if (!(_c)) { \ +++ fprintf(samtools_stderr, "[depad] ERROR: Memory allocation failure.\n"); \ +++ return -1; \ +++ } \ ++ } \ ++ _c[_n++] = (_v); \ ++ } while (0) ++@@ -109,15 +113,15 @@ ++ return length != s->l; ++ } ++ ++-int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) +++int load_unpadded_ref(faidx_t *fai, const char *ref_name, hts_pos_t ref_len, kstring_t *seq) ++ { ++ char base; ++ char *fai_ref = 0; ++- int fai_ref_len = 0, k; +++ hts_pos_t fai_ref_len = 0, k; ++ ++- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); +++ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); ++ if (fai_ref_len != ref_len) { ++- fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); +++ fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence %s length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, ref_len); ++ free(fai_ref); ++ return -1; ++ } ++@@ -143,16 +147,16 @@ ++ return 0; ++ } ++ ++-int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) +++hts_pos_t get_unpadded_len(faidx_t *fai, const char *ref_name, hts_pos_t padded_len) ++ { ++ char base; ++ char *fai_ref = 0; ++- int fai_ref_len = 0, k; ++- int bases=0, gaps=0; +++ hts_pos_t fai_ref_len = 0, k; +++ hts_pos_t bases=0, gaps=0; ++ ++- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); +++ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); ++ if (fai_ref_len != padded_len) { ++- fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); +++ fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence '%s' length 
%"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, padded_len); ++ free(fai_ref); ++ return -1; ++ } ++@@ -187,7 +191,7 @@ ++ return posmap; ++ } ++ ++-int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) +++int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) ++ { ++ bam1_t *b = 0; ++ kstring_t r, q; ++@@ -209,21 +213,21 @@ ++ ++ uint32_t *cigar = bam_get_cigar(b); ++ n2 = 0; ++- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { +++ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid)) == 0) { ++ // fprintf(samtools_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); ++ r_tid = b->core.tid; ++ if (0!=unpad_seq(b, &r)) { ++ fprintf(samtools_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); ++ return -1; ++ }; ++- if (h->target_len[r_tid] != r.l) { ++- fprintf(samtools_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); +++ if (sam_hdr_tid2len(h, r_tid) != r.l) { +++ fprintf(samtools_stderr, "[depad] ERROR: (Padded) length of '%s' is %"PRId64" in BAM header, but %zu in embedded reference\n", bam_get_qname(b), (int64_t) sam_hdr_tid2len(h, r_tid), r.l); ++ return -1; ++ } ++ if (fai) { ++ // Check the embedded reference matches the FASTA file ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { ++- fprintf(samtools_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &q)) { +++ fprintf(samtools_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ assert(r.l == 
q.l); ++@@ -232,7 +236,7 @@ ++ if (r.s[i] != q.s[i]) { ++ // Show gaps as ASCII 45 ++ fprintf(samtools_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", ++- h->target_name[b->core.tid], i+1, +++ sam_hdr_tid2name(h, b->core.tid), i+1, ++ r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, ++ q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45); ++ return -1; ++@@ -251,15 +255,15 @@ ++ ; // good case, reference available ++ //fprintf(samtools_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); ++ } else if (fai) { ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { ++- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { +++ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++ r_tid = b->core.tid; ++ // fprintf(samtools_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); ++ } else { ++- fprintf(samtools_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); +++ fprintf(samtools_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ if (0!=unpad_seq(b, &q)) { ++@@ -345,19 +349,19 @@ ++ /* Nasty case, Must load alternative posmap */ ++ // fprintf(samtools_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); ++ if (!fai) { ++- fprintf(samtools_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); +++ fprintf(samtools_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA 
file)\n", sam_hdr_tid2name(h, b->core.mtid)); ++ return -1; ++ } ++ /* Temporarily load the other reference sequence */ ++- if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { ++- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.mtid), sam_hdr_tid2len(h, b->core.mtid), &r)) { +++ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.mtid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++ b->core.mpos = posmap[b->core.mpos]; ++ /* Restore the reference and posmap*/ ++- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { ++- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); +++ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { +++ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); ++ return -1; ++ } ++ posmap = update_posmap(posmap, r); ++@@ -376,126 +380,47 @@ ++ ret = 1; ++ } ++ free(r.s); free(q.s); free(posmap); +++ free(cigar2); ++ bam_destroy1(b); ++ return ret; ++ } ++ ++-bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) +++sam_hdr_t * fix_header(sam_hdr_t *old, faidx_t *fai) ++ { ++- int i = 0, unpadded_len = 0; ++- bam_hdr_t *header = 0 ; ++- unsigned short ln_found; ++- ++- header = bam_hdr_dup(old); ++- for (i = 0; i < old->n_targets; ++i) { ++- unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); +++ int i = 0, ret = 0; +++ hts_pos_t unpadded_len = 0; +++ sam_hdr_t *header = sam_hdr_dup(old); +++ if (!header) +++ return NULL; +++ +++ int nref = sam_hdr_nref(old); +++ char len_buf[64]; +++ +++ for (i = 0; i < nref; ++i) { +++ unpadded_len = get_unpadded_len(fai, 
sam_hdr_tid2name(old, i), sam_hdr_tid2len(old, i)); ++ if (unpadded_len < 0) { ++- fprintf(samtools_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); +++ fprintf(samtools_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %"PRIhts_pos"\n", sam_hdr_tid2name(old, i), (hts_pos_t) sam_hdr_tid2len(old, i)); +++ } else if (unpadded_len > sam_hdr_tid2len(old, i)) { +++ fprintf(samtools_stderr, "[depad] New unpadded length of '%s' is larger than the padded length (%"PRIhts_pos" > %"PRIhts_pos")\n", +++ sam_hdr_tid2name(old, i), unpadded_len, +++ (hts_pos_t) sam_hdr_tid2len(old, i)); +++ ret = 1; ++ } else { ++- header->target_len[i] = unpadded_len; +++ sprintf(len_buf, "%"PRIhts_pos"", unpadded_len); +++ if ((ret |= sam_hdr_update_line(header, "SQ", "SN", sam_hdr_tid2name(header, i), "LN", len_buf, NULL))) +++ fprintf(samtools_stderr, "[depad] Error updating length of '%s' from %"PRIhts_pos" to %"PRIhts_pos"\n", +++ sam_hdr_tid2name(header, i), +++ (hts_pos_t) sam_hdr_tid2len(header, i), +++ unpadded_len); ++ //fprintf(samtools_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); ++ } ++ } ++- /* Duplicating the header allocated new buffer for header string */ ++- /* After modifying the @SQ lines it will only get smaller, since */ ++- /* the LN entries will be the same or shorter, and we'll remove */ ++- /* any MD entries (MD5 checksums). 
*/ ++- assert(strlen(old->text) == strlen(header->text)); ++- assert (0==strcmp(old->text, header->text)); ++- const char *text; ++- text = old->text; ++- header->text[0] = '\0'; /* Resuse the allocated buffer */ ++- char * newtext = header->text; ++- char * end=NULL; ++- while (text[0]=='@') { ++- end = strchr(text, '\n'); ++- assert(end != 0); ++- if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { ++- const char *cp = text+3; ++- char *name = strstr(text, "\tSN:"); ++- char *name_end; ++- if (!name) { ++- fprintf(samtools_stderr, "Unable to find SN: header field\n"); ++- return NULL; ++- } ++- name += 4; ++- for (name_end = name; name_end != end && *name_end != '\t'; name_end++); ++- strcat(newtext, "@SQ"); ++- ln_found = 0; ++- ++- /* Parse the @SQ lines */ ++- while (cp != end) { ++- if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { ++- // Rewrite the length ++- char len_buf[100]; ++- int tid; ++- unsigned int old_length, new_length; ++- const char *old_cp = cp; ++- ++- ln_found = 1; ++- ++- while (cp != end && *cp++ != '\t'); ++- old_length = (int)(cp - old_cp); ++- ++- for (tid = 0; tid < header->n_targets; tid++) { ++- // may want to hash this, but new header API incoming. ++- if (strncmp(name, header->target_name[tid], name_end - name) == 0) { ++- new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); ++- if (new_length <= old_length) { ++- strcat(newtext, len_buf); ++- } ++- else { ++- fprintf(samtools_stderr, "LN value of the reference is larger than the original!\n"); ++- exit(1); ++- } ++- break; ++- } ++- } ++ ++- if (cp != end) ++- strcat(newtext, "\t"); ++- } else if (end-cp >= 2 && ++- ((ln_found && strncmp(cp, "LN", 2) == 0) || ++- strncmp(cp, "M5", 2) == 0 || ++- strncmp(cp, "UR", 2) == 0)) ++- { ++- // skip secondary LNs ++- // MD5 changed during depadding; ditch it. ++- // URLs are also invalid. 
++- while (cp != end && *cp++ != '\t'); ++- } else { ++- // Otherwise copy this sub-field verbatim ++- const char *cp_start = cp; ++- while (cp != end && *cp++ != '\t'); ++- strncat(newtext, cp_start, cp-cp_start); ++- } ++- } ++- ++- // Add newline, replacing trailing '\t' if last on line was the LN: ++- char *text_end = newtext + strlen(newtext); ++- if (text_end[-1] == '\t') ++- text_end[-1] = '\n'; ++- else ++- *text_end++ = '\n', *text_end = '\0'; ++- } else { ++- /* Copy this line to the new header */ ++- strncat(newtext, text, end - text + 1); ++- } ++- text = end + 1; +++ if (ret) { +++ sam_hdr_destroy(header); +++ return NULL; ++ } ++- assert (text[0]=='\0'); ++- /* Check we didn't overflow the buffer */ ++- assert (strlen(header->text) <= strlen(old->text)); ++- if (strlen(header->text) < header->l_text) { ++- //fprintf(samtools_stderr, "[depad] Reallocating header buffer\n"); ++- assert (newtext == header->text); ++- newtext = malloc(strlen(header->text) + 1); ++- strcpy(newtext, header->text); ++- free(header->text); ++- header->text = newtext; ++- header->l_text = strlen(newtext); ++- } ++- //fprintf(samtools_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); +++ ++ return header; ++ } ++ ++@@ -504,15 +429,17 @@ ++ int main_pad2unpad(int argc, char *argv[]) ++ { ++ samFile *in = 0, *out = 0; ++- bam_hdr_t *h = 0, *h_fix = 0; +++ sam_hdr_t *h = 0, *h_fix = 0; ++ faidx_t *fai = 0; ++- int c, compress_level = -1, is_long_help = 0; ++- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; +++ int c, compress_level = -1, is_long_help = 0, no_pg = 0; +++ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; ++ int ret=0; +++ char *arg_list = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -534,6 +461,7 @@ ++ if 
(ga.out.format == unknown_format) ++ hts_parse_format(&ga.out, "bam"); ++ break; +++ case 1: no_pg = 1; break; ++ case '?': is_long_help = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); ++@@ -571,7 +499,11 @@ ++ goto depad_end; ++ } ++ if (fai) { ++- h_fix = fix_header(h, fai); +++ if (!(h_fix = fix_header(h, fai))){ +++ fprintf(samtools_stderr, "[depad] failed to fix the header from\n"); +++ ret = 1; +++ goto depad_end; +++ } ++ } else { ++ fprintf(samtools_stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); ++ h_fix = h; ++@@ -589,25 +521,61 @@ ++ if (ga.out.format == cram) ++ hts_set_opt(out, CRAM_OPT_NO_REF, 1); ++ +++ if (!no_pg) { +++ if(!(arg_list = stringify_argv(argc+1, argv-1))) { +++ fprintf(samtools_stderr, "[depad] failed to create arg_list\n"); +++ ret = 1; +++ goto depad_end; +++ } +++ +++ if (sam_hdr_add_pg(h_fix, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ fprintf(samtools_stderr, "[depad] failed to add PG line to header\n"); +++ ret = 1; +++ goto depad_end; +++ } +++ } +++ ++ if (sam_hdr_write(out, h_fix) != 0) { ++ fprintf(samtools_stderr, "[depad] failed to write header.\n"); ++ ret = 1; ++ goto depad_end; ++ } +++ if (ga.write_index) { +++ if (!(fn_out_idx = auto_index(out, fn_out, h_fix))) { +++ ret = 1; +++ goto depad_end; +++ } +++ } ++ ++ // Do the depad ++ if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; ++ +++ if (ga.write_index) { +++ if (sam_idx_save(out) < 0) { +++ print_error_errno("depad", "writing index failed"); +++ ret = 1; +++ } +++ } +++ ++ depad_end: ++ // close files, free and return +++ free(arg_list); ++ if (fai) fai_destroy(fai); ++- if (h) bam_hdr_destroy(h); +++ if (h) sam_hdr_destroy(h); +++ if (h_fix && h_fix != h) sam_hdr_destroy(h_fix); ++ if (in) sam_close(in); ++ if (out && sam_close(out) < 0) { ++ fprintf(samtools_stderr, "[depad] error on closing output file.\n"); ++ ret = 1; ++ } ++ free(fn_list); free(fn_out); +++ if (fn_out_idx) +++ free(fn_out_idx); +++ sam_global_args_free(&ga); ++ return ret; ++ } ++ ++@@ -623,8 +591,9 @@ ++ fprintf(samtools_stderr, " -T, --reference FILE\n"); ++ fprintf(samtools_stderr, " Padded reference sequence file [null]\n"); ++ fprintf(samtools_stderr, " -o FILE Output file name [samtools_stdout]\n"); +++ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); ++ fprintf(samtools_stderr, " -? Longer help\n"); ++- sam_global_opt_help(samtools_stderr, "-...--"); +++ sam_global_opt_help(samtools_stderr, "-...--.."); ++ ++ if (is_long_help) ++ fprintf(samtools_stderr, ++--- python-pysam.orig/samtools/phase.c +++++ python-pysam/samtools/phase.c ++@@ -1,7 +1,7 @@ ++ /* phase.c -- phase subcommand. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2013-2016 Genome Research Ltd. +++ Copyright (C) 2013-2016, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -52,15 +52,15 @@ ++ ++ typedef struct { ++ // configurations, initialized in the main function ++- int flag, k, min_baseQ, min_varLOD, max_depth; +++ int flag, k, min_baseQ, min_varLOD, max_depth, no_pg; ++ // other global variables ++ int vpos_shift; ++ samFile* fp; ++- bam_hdr_t* fp_hdr; ++- char *pre; +++ sam_hdr_t* fp_hdr; +++ char *pre, *arg_list; ++ char *out_name[3]; ++ samFile* out[3]; ++- bam_hdr_t* out_hdr[3]; +++ sam_hdr_t* out_hdr[3]; ++ // alignment queue ++ int n, m; ++ bam1_t **b; ++@@ -503,7 +503,7 @@ ++ return ret; ++ } ++ ++-static khash_t(set64) *loadpos(const char *fn, bam_hdr_t *h) +++static khash_t(set64) *loadpos(const char *fn, sam_hdr_t *h) ++ { ++ gzFile fp; ++ kstream_t *ks; ++@@ -511,9 +511,15 @@ ++ kstring_t *str; ++ khash_t(set64) *hash; ++ +++ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); +++ if (fp == NULL) { +++ print_error_errno("phase", "Couldn't open site file '%s'", fn); +++ return NULL; +++ } +++ ++ hash = kh_init(set64); ++ str = calloc(1, sizeof(kstring_t)); ++- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); +++ ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, str, &dret) >= 0) { ++ int tid = bam_name2id(h, str->s); ++@@ -557,7 +563,15 @@ ++ return -1; ++ } ++ ++- g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); +++ g->out_hdr[c] = sam_hdr_dup(g->fp_hdr); +++ if (!g->no_pg && sam_hdr_add_pg(g->out_hdr[c], "samtools", +++ "VN", samtools_version(), +++ g->arg_list ? "CL": NULL, +++ g->arg_list ? 
g->arg_list : NULL, +++ NULL)) { +++ print_error("phase", "failed to add PG line to header"); +++ return -1; +++ } ++ if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { ++ print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); ++ return -1; ++@@ -582,6 +596,7 @@ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -601,6 +616,7 @@ ++ case 'A': g.flag |= FLAG_DROP_AMBI; break; ++ case 'b': g.pre = strdup(optarg); break; ++ case 'l': fn_list = strdup(optarg); break; +++ case 1: g.no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': usage=1; break; ++@@ -618,10 +634,11 @@ ++ // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); ++ fprintf(stderr, " -F do not attempt to fix chimeras\n"); ++ fprintf(stderr, " -A drop reads with ambiguous phase\n"); +++ fprintf(stderr, " --no-PG do not add a PG line\n"); ++ // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); ++ fprintf(stderr, "\n"); ++ ++- sam_global_opt_help(stderr, "-....-"); +++ sam_global_opt_help(stderr, "-....--."); ++ ++ return 1; ++ } ++@@ -636,8 +653,13 @@ ++ __func__, argv[optind]); ++ return 1; ++ } +++ if (!g.no_pg && !(g.arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("phase", "failed to create arg_list"); +++ return 1; +++ } ++ if (fn_list) { // read the list of sites to phase ++ set = loadpos(fn_list, g.fp_hdr); +++ if (set == NULL) return 1; ++ free(fn_list); ++ } else g.flag &= ~FLAG_LIST_EXCL; ++ if (g.pre) { // open BAMs to write ++@@ -677,7 +699,7 @@ ++ g.vpos_shift = 0; ++ if (lasttid >= 0) { ++ seqs = shrink_hash(seqs); ++- if (phase(&g, g.fp_hdr->target_name[lasttid], +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, lasttid), ++ vpos, cns, seqs) < 0) { ++ return 1; ++ } ++@@ -749,7 +771,7 @@ ++ } ++ if 
(dophase) { ++ seqs = shrink_hash(seqs); ++- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { ++ return 1; ++ } ++ update_vpos(vpos, seqs); ++@@ -759,11 +781,11 @@ ++ ++vpos; ++ } ++ if (tid >= 0) { ++- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { ++ return 1; ++ } ++ } ++- bam_hdr_destroy(g.fp_hdr); +++ sam_hdr_destroy(g.fp_hdr); ++ bam_plp_destroy(iter); ++ sam_close(g.fp); ++ kh_destroy(64, seqs); ++@@ -779,12 +801,13 @@ ++ __func__, g.out_name[c]); ++ res = 1; ++ } ++- bam_hdr_destroy(g.out_hdr[c]); +++ sam_hdr_destroy(g.out_hdr[c]); ++ free(g.out_name[c]); ++ } ++ free(g.pre); free(g.b); ++ if (res) return 1; ++ } +++ free(g.arg_list); ++ sam_global_args_free(&ga); ++ return 0; ++ } ++--- python-pysam.orig/samtools/phase.c.pysam.c +++++ python-pysam/samtools/phase.c.pysam.c ++@@ -3,7 +3,7 @@ ++ /* phase.c -- phase subcommand. ++ ++ Copyright (C) 2011 Broad Institute. ++- Copyright (C) 2013-2016 Genome Research Ltd. +++ Copyright (C) 2013-2016, 2019 Genome Research Ltd. 
++ ++ Author: Heng Li ++ ++@@ -54,15 +54,15 @@ ++ ++ typedef struct { ++ // configurations, initialized in the main function ++- int flag, k, min_baseQ, min_varLOD, max_depth; +++ int flag, k, min_baseQ, min_varLOD, max_depth, no_pg; ++ // other global variables ++ int vpos_shift; ++ samFile* fp; ++- bam_hdr_t* fp_hdr; ++- char *pre; +++ sam_hdr_t* fp_hdr; +++ char *pre, *arg_list; ++ char *out_name[3]; ++ samFile* out[3]; ++- bam_hdr_t* out_hdr[3]; +++ sam_hdr_t* out_hdr[3]; ++ // alignment queue ++ int n, m; ++ bam1_t **b; ++@@ -505,7 +505,7 @@ ++ return ret; ++ } ++ ++-static khash_t(set64) *loadpos(const char *fn, bam_hdr_t *h) +++static khash_t(set64) *loadpos(const char *fn, sam_hdr_t *h) ++ { ++ gzFile fp; ++ kstream_t *ks; ++@@ -513,9 +513,15 @@ ++ kstring_t *str; ++ khash_t(set64) *hash; ++ +++ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); +++ if (fp == NULL) { +++ print_error_errno("phase", "Couldn't open site file '%s'", fn); +++ return NULL; +++ } +++ ++ hash = kh_init(set64); ++ str = calloc(1, sizeof(kstring_t)); ++- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); +++ ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, str, &dret) >= 0) { ++ int tid = bam_name2id(h, str->s); ++@@ -559,7 +565,15 @@ ++ return -1; ++ } ++ ++- g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); +++ g->out_hdr[c] = sam_hdr_dup(g->fp_hdr); +++ if (!g->no_pg && sam_hdr_add_pg(g->out_hdr[c], "samtools", +++ "VN", samtools_version(), +++ g->arg_list ? "CL": NULL, +++ g->arg_list ? 
g->arg_list : NULL, +++ NULL)) { +++ print_error("phase", "failed to add PG line to header"); +++ return -1; +++ } ++ if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { ++ print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); ++ return -1; ++@@ -584,6 +598,7 @@ ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -603,6 +618,7 @@ ++ case 'A': g.flag |= FLAG_DROP_AMBI; break; ++ case 'b': g.pre = strdup(optarg); break; ++ case 'l': fn_list = strdup(optarg); break; +++ case 1: g.no_pg = 1; break; ++ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; ++ /* else fall-through */ ++ case '?': usage=1; break; ++@@ -620,10 +636,11 @@ ++ // fprintf(samtools_stderr, " -l FILE list of sites to phase [null]\n"); ++ fprintf(samtools_stderr, " -F do not attempt to fix chimeras\n"); ++ fprintf(samtools_stderr, " -A drop reads with ambiguous phase\n"); +++ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); ++ // fprintf(samtools_stderr, " -e do not discover SNPs (effective with -l)\n"); ++ fprintf(samtools_stderr, "\n"); ++ ++- sam_global_opt_help(samtools_stderr, "-....-"); +++ sam_global_opt_help(samtools_stderr, "-....--."); ++ ++ return 1; ++ } ++@@ -638,8 +655,13 @@ ++ __func__, argv[optind]); ++ return 1; ++ } +++ if (!g.no_pg && !(g.arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("phase", "failed to create arg_list"); +++ return 1; +++ } ++ if (fn_list) { // read the list of sites to phase ++ set = loadpos(fn_list, g.fp_hdr); +++ if (set == NULL) return 1; ++ free(fn_list); ++ } else g.flag &= ~FLAG_LIST_EXCL; ++ if (g.pre) { // open BAMs to write ++@@ -679,7 +701,7 @@ ++ g.vpos_shift = 0; ++ if (lasttid >= 0) { ++ seqs = shrink_hash(seqs); ++- if (phase(&g, g.fp_hdr->target_name[lasttid], +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, lasttid), ++ 
vpos, cns, seqs) < 0) { ++ return 1; ++ } ++@@ -751,7 +773,7 @@ ++ } ++ if (dophase) { ++ seqs = shrink_hash(seqs); ++- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { ++ return 1; ++ } ++ update_vpos(vpos, seqs); ++@@ -761,11 +783,11 @@ ++ ++vpos; ++ } ++ if (tid >= 0) { ++- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { +++ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { ++ return 1; ++ } ++ } ++- bam_hdr_destroy(g.fp_hdr); +++ sam_hdr_destroy(g.fp_hdr); ++ bam_plp_destroy(iter); ++ sam_close(g.fp); ++ kh_destroy(64, seqs); ++@@ -781,12 +803,13 @@ ++ __func__, g.out_name[c]); ++ res = 1; ++ } ++- bam_hdr_destroy(g.out_hdr[c]); +++ sam_hdr_destroy(g.out_hdr[c]); ++ free(g.out_name[c]); ++ } ++ free(g.pre); free(g.b); ++ if (res) return 1; ++ } +++ free(g.arg_list); ++ sam_global_args_free(&ga); ++ return 0; ++ } ++--- python-pysam.orig/samtools/sam.c +++++ python-pysam/samtools/sam.c ++@@ -1,6 +1,6 @@ ++ /* sam.c -- format-neutral SAM/BAM API. ++ ++- Copyright (C) 2009, 2012-2015 Genome Research Ltd. +++ Copyright (C) 2009, 2012-2016 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. 
++ ++ Author: Heng Li ++@@ -65,12 +65,12 @@ ++ return NULL; ++ } ++ fp->is_write = 0; ++- if (fp->header->n_targets == 0 && bam_verbose >= 1) +++ if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) ++ fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); ++ } ++ else { ++ enum htsExactFormat fmt = hts_get_format(fp->file)->format; ++- fp->header = (bam_hdr_t *)aux; // For writing, we won't free it +++ fp->header = (sam_hdr_t *)aux; // For writing, we won't free it ++ fp->is_write = 1; ++ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { ++ if (sam_hdr_write(fp->file, fp->header) < 0) { ++@@ -89,7 +89,7 @@ ++ void samclose(samfile_t *fp) ++ { ++ if (fp) { ++- if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); +++ if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); ++ sam_close(fp->file); ++ free(fp); ++ } ++--- python-pysam.orig/samtools/sam.c.pysam.c +++++ python-pysam/samtools/sam.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* sam.c -- format-neutral SAM/BAM API. ++ ++- Copyright (C) 2009, 2012-2015 Genome Research Ltd. +++ Copyright (C) 2009, 2012-2016 Genome Research Ltd. ++ Portions copyright (C) 2011 Broad Institute. 
++ ++ Author: Heng Li ++@@ -67,12 +67,12 @@ ++ return NULL; ++ } ++ fp->is_write = 0; ++- if (fp->header->n_targets == 0 && bam_verbose >= 1) +++ if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) ++ fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); ++ } ++ else { ++ enum htsExactFormat fmt = hts_get_format(fp->file)->format; ++- fp->header = (bam_hdr_t *)aux; // For writing, we won't free it +++ fp->header = (sam_hdr_t *)aux; // For writing, we won't free it ++ fp->is_write = 1; ++ if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { ++ if (sam_hdr_write(fp->file, fp->header) < 0) { ++@@ -91,7 +91,7 @@ ++ void samclose(samfile_t *fp) ++ { ++ if (fp) { ++- if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); +++ if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); ++ sam_close(fp->file); ++ free(fp); ++ } ++--- python-pysam.orig/samtools/sam.h +++++ python-pysam/samtools/sam.h ++@@ -1,6 +1,6 @@ ++ /* sam.h -- format-neutral SAM/BAM API. ++ ++- Copyright (C) 2009, 2013-2015 Genome Research Ltd. +++ Copyright (C) 2009, 2013-2015, 2019 Genome Research Ltd. ++ ++ Author: Heng Li ++ ++@@ -49,7 +49,7 @@ ++ typedef struct { ++ samFile *file; ++ struct { BGZF *bam; } x; // Hack so that fp->x.bam still works ++- bam_hdr_t *header; +++ sam_hdr_t *header; ++ unsigned short is_write:1; ++ } samfile_t; ++ ++@@ -103,14 +103,20 @@ ++ static inline int samwrite(samfile_t *fp, const bam1_t *b) { return sam_write1(fp->file, fp->header, b); } ++ ++ /*! 
++- @abstract Load BAM/CRAM index for use with samfetch() +++ @abstract Load BAM/CRAM index for use with samfetch() with supporting the use of index file ++ @param fp file handler ++ @param fn name of the BAM or CRAM file (NOT the index file) +++ @param fnidx name of the index file ++ @return pointer to the index structure ++ */ ++- static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn) { return sam_index_load(fp->file, fn); } +++ static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn, const char *fnidx) { +++ if (fnidx != NULL) { +++ return sam_index_load2(fp->file, fn, fnidx); +++ } +++ return sam_index_load(fp->file, fn); +++ } ++ #undef sam_index_load ++- #define sam_index_load(fp,fn) (samtools_sam_index_load((fp), (fn))) +++ #define sam_index_load(fp,fn,fnidx) (samtools_sam_index_load((fp), (fn), (fnidx))) ++ ++ /*! ++ @abstract Retrieve the alignments overlapping the specified region. ++--- python-pysam.orig/samtools/sam_header.c +++++ /dev/null ++@@ -1,836 +0,0 @@ ++-/* sam_header.c -- basic SAM/BAM header API. ++- ++- Copyright (C) 2009-2013 Genome Research Ltd. ++- ++- Author: Petr Danecek ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. */ ++- ++-#include ++- ++-#include "sam_header.h" ++-#include ++-#include ++-#include ++-#include ++-#include ++- ++-#include "htslib/khash.h" ++-KHASH_MAP_INIT_STR(str, const char *) ++- ++-struct _HeaderList ++-{ ++- struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. ++- struct _HeaderList *next; ++- void *data; ++-}; ++-typedef struct _HeaderList list_t; ++-typedef list_t HeaderDict; ++- ++-typedef struct ++-{ ++- char key[2]; ++- char *value; ++-} ++-HeaderTag; ++- ++-typedef struct ++-{ ++- char type[2]; ++- list_t *tags; ++-} ++-HeaderLine; ++- ++-const char *o_hd_tags[] = {"SO","GO",NULL}; ++-const char *r_hd_tags[] = {"VN",NULL}; ++- ++-const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; ++-const char *r_sq_tags[] = {"SN","LN",NULL}; ++-const char *u_sq_tags[] = {"SN",NULL}; ++- ++-const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; ++-const char *r_rg_tags[] = {"ID",NULL}; ++-const char *u_rg_tags[] = {"ID",NULL}; ++- ++-const char *o_pg_tags[] = {"VN","CL",NULL}; ++-const char *r_pg_tags[] = {"ID",NULL}; ++- ++-const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; ++-const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; ++-const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; ++-const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; ++- ++- ++-static void debug(const char *format, ...) 
++-{ ++- va_list ap; ++- va_start(ap, format); ++- vfprintf(stderr, format, ap); ++- va_end(ap); ++-} ++- ++-#if 0 ++-// Replaced by list_append_to_end ++-static list_t *list_prepend(list_t *root, void *data) ++-{ ++- list_t *l = malloc(sizeof(list_t)); ++- l->next = root; ++- l->data = data; ++- return l; ++-} ++-#endif ++- ++-// Relies on the root->last being correct. Do not use with the other list_* ++-// routines unless they are fixed to modify root->last as well. ++-static list_t *list_append_to_end(list_t *root, void *data) ++-{ ++- list_t *l = malloc(sizeof(list_t)); ++- l->last = l; ++- l->next = NULL; ++- l->data = data; ++- ++- if ( !root ) ++- return l; ++- ++- root->last->next = l; ++- root->last = l; ++- return root; ++-} ++- ++-static list_t *list_append(list_t *root, void *data) ++-{ ++- list_t *l = root; ++- while (l && l->next) ++- l = l->next; ++- if ( l ) ++- { ++- l->next = malloc(sizeof(list_t)); ++- l = l->next; ++- } ++- else ++- { ++- l = malloc(sizeof(list_t)); ++- root = l; ++- } ++- l->data = data; ++- l->next = NULL; ++- return root; ++-} ++- ++-static void list_free(list_t *root) ++-{ ++- list_t *l = root; ++- while (root) ++- { ++- l = root; ++- root = root->next; ++- free(l); ++- } ++-} ++- ++- ++- ++-// Look for a tag "XY" in a predefined const char *[] array. ++-static int tag_exists(const char *tag, const char **tags) ++-{ ++- int itag=0; ++- if ( !tags ) return -1; ++- while ( tags[itag] ) ++- { ++- if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; ++- itag++; ++- } ++- return -1; ++-} ++- ++- ++- ++-// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text ++-// or NULL if everything has been read. The lineptr should be freed by the caller. The ++-// newline character is stripped. 
++-static const char *nextline(char **lineptr, size_t *n, const char *text) ++-{ ++- int len; ++- const char *to = text; ++- ++- if ( !*to ) return NULL; ++- ++- while ( *to && *to!='\n' && *to!='\r' ) to++; ++- len = to - text + 1; ++- ++- if ( *to ) ++- { ++- // Advance the pointer for the next call ++- if ( *to=='\n' ) to++; ++- else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; ++- } ++- if ( !len ) ++- return to; ++- ++- if ( !*lineptr ) ++- { ++- *lineptr = malloc(len); ++- *n = len; ++- } ++- else if ( *nkey[0] = name[0]; ++- tag->key[1] = name[1]; ++- tag->value = malloc(len+1); ++- memcpy(tag->value,value_from,len+1); ++- tag->value[len] = 0; ++- return tag; ++-} ++- ++-static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) ++-{ ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; ++- tags = tags->next; ++- } ++- return NULL; ++-} ++- ++- ++-// Return codes: ++-// 0 .. different types or unique tags differ or conflicting tags, cannot be merged ++-// 1 .. all tags identical -> no need to merge, drop one ++-// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated ++-// 3 .. 
there are some missing complementary tags and no unique conflict -> can be merged into a single line ++-static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) ++-{ ++- HeaderTag *t1, *t2; ++- ++- if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) ++- return 0; ++- ++- int itype = tag_exists(hline1->type,types); ++- if ( itype==-1 ) { ++- debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); ++- return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code ++- } ++- ++- if ( unique_tags[itype] ) ++- { ++- t1 = header_line_has_tag(hline1,unique_tags[itype][0]); ++- t2 = header_line_has_tag(hline2,unique_tags[itype][0]); ++- if ( !t1 || !t2 ) // this should never happen, the unique tags are required ++- return 2; ++- ++- if ( strcmp(t1->value,t2->value) ) ++- return 0; // the unique tags differ, cannot be merged ++- } ++- if ( !required_tags[itype] && !optional_tags[itype] ) ++- { ++- t1 = hline1->tags->data; ++- t2 = hline2->tags->data; ++- if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments ++- return 0; ++- } ++- ++- int missing=0, itag=0; ++- while ( required_tags[itype] && required_tags[itype][itag] ) ++- { ++- t1 = header_line_has_tag(hline1,required_tags[itype][itag]); ++- t2 = header_line_has_tag(hline2,required_tags[itype][itag]); ++- if ( !t1 && !t2 ) ++- return 2; // this should never happen ++- else if ( !t1 || !t2 ) ++- missing = 1; // there is some tag missing in one of the hlines ++- else if ( strcmp(t1->value,t2->value) ) ++- { ++- if ( unique_tags[itype] ) ++- return 2; // the lines have a matching unique tag but have a conflicting tag ++- ++- return 0; // the lines contain conflicting tags, cannot be merged ++- } ++- itag++; ++- } ++- itag = 0; ++- while ( optional_tags[itype] && optional_tags[itype][itag] ) ++- { ++- t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); ++- t2 = 
header_line_has_tag(hline2,optional_tags[itype][itag]); ++- if ( !t1 && !t2 ) ++- { ++- itag++; ++- continue; ++- } ++- if ( !t1 || !t2 ) ++- missing = 1; // there is some tag missing in one of the hlines ++- else if ( strcmp(t1->value,t2->value) ) ++- { ++- if ( unique_tags[itype] ) ++- return 2; // the lines have a matching unique tag but have a conflicting tag ++- ++- return 0; // the lines contain conflicting tags, cannot be merged ++- } ++- itag++; ++- } ++- if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged ++- return 1; ++-} ++- ++- ++-static HeaderLine *sam_header_line_clone(const HeaderLine *hline) ++-{ ++- list_t *tags; ++- HeaderLine *out = malloc(sizeof(HeaderLine)); ++- out->type[0] = hline->type[0]; ++- out->type[1] = hline->type[1]; ++- out->tags = NULL; ++- ++- tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *old = tags->data; ++- ++- HeaderTag *new = malloc(sizeof(HeaderTag)); ++- new->key[0] = old->key[0]; ++- new->key[1] = old->key[1]; ++- new->value = strdup(old->value); ++- out->tags = list_append(out->tags, new); ++- ++- tags = tags->next; ++- } ++- return out; ++-} ++- ++-static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) ++-{ ++- list_t *tmpl_tags; ++- ++- if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) ++- return 0; ++- ++- tmpl_tags = tmpl_hline->tags; ++- while (tmpl_tags) ++- { ++- HeaderTag *tmpl_tag = tmpl_tags->data; ++- HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); ++- if ( !out_tag ) ++- { ++- HeaderTag *tag = malloc(sizeof(HeaderTag)); ++- tag->key[0] = tmpl_tag->key[0]; ++- tag->key[1] = tmpl_tag->key[1]; ++- tag->value = strdup(tmpl_tag->value); ++- out_hline->tags = list_append(out_hline->tags,tag); ++- } ++- tmpl_tags = tmpl_tags->next; ++- } ++- return 1; ++-} ++- ++- ++-static HeaderLine *sam_header_line_parse(const char *headerLine) ++-{ ++- HeaderLine *hline; 
++- HeaderTag *tag; ++- const char *from, *to; ++- from = headerLine; ++- ++- if ( *from != '@' ) { ++- debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); ++- return 0; ++- } ++- to = ++from; ++- ++- while (*to && *to!='\t') to++; ++- if ( to-from != 2 ) { ++- debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); ++- return 0; ++- } ++- ++- hline = malloc(sizeof(HeaderLine)); ++- hline->type[0] = from[0]; ++- hline->type[1] = from[1]; ++- hline->tags = NULL; ++- ++- int itype = tag_exists(hline->type, types); ++- ++- from = to; ++- while (*to && *to=='\t') to++; ++- if ( to-from != 1 ) { ++- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); ++- free(hline); ++- return 0; ++- } ++- from = to; ++- while (*from) ++- { ++- while (*to && *to!='\t') to++; ++- ++- if ( !required_tags[itype] && !optional_tags[itype] ) ++- { ++- // CO is a special case, it can contain anything, including tabs ++- if ( *to ) { to++; continue; } ++- tag = new_tag(" ",from,to-1); ++- } ++- else ++- tag = new_tag(from,from+3,to-1); ++- ++- if ( header_line_has_tag(hline,tag->key) ) ++- debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); ++- hline->tags = list_append(hline->tags, tag); ++- ++- from = to; ++- while (*to && *to=='\t') to++; ++- if ( *to && to-from != 1 ) { ++- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); ++- return 0; ++- } ++- ++- from = to; ++- } ++- return hline; ++-} ++- ++- ++-// Must be of an existing type, all tags must be recognised and all required tags must be present ++-static int sam_header_line_validate(HeaderLine *hline) ++-{ ++- list_t *tags; ++- HeaderTag *tag; ++- int itype, itag; ++- ++- // Is the type correct? 
++- itype = tag_exists(hline->type, types); ++- if ( itype==-1 ) ++- { ++- debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); ++- return 0; ++- } ++- ++- // Has all required tags? ++- itag = 0; ++- while ( required_tags[itype] && required_tags[itype][itag] ) ++- { ++- if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) ++- { ++- debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], ++- hline->type[0],hline->type[1]); ++- return 0; ++- } ++- itag++; ++- } ++- ++- // Are all tags recognised? ++- tags = hline->tags; ++- while ( tags ) ++- { ++- tag = tags->data; ++- if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) ++- { ++- // Lower case tags are user-defined values. ++- if( !(islower(tag->key[0]) || islower(tag->key[1])) ) ++- { ++- // Neither is lower case, but tag was not recognized. ++- debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); ++- // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes ++- } ++- // else - allow user defined tag ++- } ++- tags = tags->next; ++- } ++- ++- return 1; ++-} ++- ++- ++-static void print_header_line(FILE *fp, HeaderLine *hline) ++-{ ++- list_t *tags = hline->tags; ++- HeaderTag *tag; ++- ++- fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); ++- while (tags) ++- { ++- tag = tags->data; ++- ++- fprintf(fp, "\t"); ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); ++- fprintf(fp, "%s", tag->value); ++- ++- tags = tags->next; ++- } ++- fprintf(fp,"\n"); ++-} ++- ++- ++-static void sam_header_line_free(HeaderLine *hline) ++-{ ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- free(tag->value); ++- free(tag); ++- tags = tags->next; ++- } ++- list_free(hline->tags); ++- free(hline); ++-} ++- ++-void 
sam_header_free(void *_header) ++-{ ++- HeaderDict *header = (HeaderDict*)_header; ++- list_t *hlines = header; ++- while (hlines) ++- { ++- sam_header_line_free(hlines->data); ++- hlines = hlines->next; ++- } ++- list_free(header); ++-} ++- ++-HeaderDict *sam_header_clone(const HeaderDict *dict) ++-{ ++- HeaderDict *out = NULL; ++- while (dict) ++- { ++- HeaderLine *hline = dict->data; ++- out = list_append(out, sam_header_line_clone(hline)); ++- dict = dict->next; ++- } ++- return out; ++-} ++- ++-// Returns a newly allocated string ++-char *sam_header_write(const void *_header) ++-{ ++- const HeaderDict *header = (const HeaderDict*)_header; ++- char *out = NULL; ++- int len=0, nout=0; ++- const list_t *hlines; ++- ++- // Calculate the length of the string to allocate ++- hlines = header; ++- while (hlines) ++- { ++- len += 4; // @XY and \n ++- ++- HeaderLine *hline = hlines->data; ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- len += strlen(tag->value) + 1; // \t ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- len += strlen(tag->value) + 3; // XY: ++- tags = tags->next; ++- } ++- hlines = hlines->next; ++- } ++- ++- nout = 0; ++- out = malloc(len+1); ++- hlines = header; ++- while (hlines) ++- { ++- HeaderLine *hline = hlines->data; ++- ++- nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); ++- ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- nout += sprintf(out+nout,"\t"); ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); ++- nout += sprintf(out+nout,"%s", tag->value); ++- tags = tags->next; ++- } ++- hlines = hlines->next; ++- nout += sprintf(out+nout,"\n"); ++- } ++- out[len] = 0; ++- return out; ++-} ++- ++-void *sam_header_parse2(const char *headerText) ++-{ ++- list_t *hlines = NULL; ++- HeaderLine *hline; ++- const char *text; ++- char *buf=NULL; ++- size_t nbuf = 0; ++- int tovalidate = 0; 
++- ++- if ( !headerText ) ++- return 0; ++- ++- text = headerText; ++- while ( (text=nextline(&buf, &nbuf, text)) ) ++- { ++- hline = sam_header_line_parse(buf); ++- if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) ++- // With too many (~250,000) reference sequences the header parsing was too slow with list_append. ++- hlines = list_append_to_end(hlines, hline); ++- else ++- { ++- if (hline) sam_header_line_free(hline); ++- sam_header_free(hlines); ++- if ( buf ) free(buf); ++- return NULL; ++- } ++- } ++- if ( buf ) free(buf); ++- ++- return hlines; ++-} ++- ++-void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) ++-{ ++- const HeaderDict *dict = (const HeaderDict*)_dict; ++- const list_t *l = dict; ++- khash_t(str) *tbl = kh_init(str); ++- khiter_t k; ++- int ret; ++- ++- if (_dict == 0) return tbl; // return an empty (not null) hash table ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key, *value; ++- key = header_line_has_tag(hline,key_tag); ++- value = header_line_has_tag(hline,value_tag); ++- if ( !key || !value ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- k = kh_get(str, tbl, key->value); ++- if ( k != kh_end(tbl) ) ++- debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); ++- k = kh_put(str, tbl, key->value, &ret); ++- kh_value(tbl, k) = value->value; ++- ++- l = l->next; ++- } ++- return tbl; ++-} ++- ++-char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) ++-{ ++- const HeaderDict *dict = (const HeaderDict*)_dict; ++- const list_t *l = dict; ++- int max, n; ++- char **ret; ++- ++- ret = 0; *_n = max = n = 0; ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key; ++- key = 
header_line_has_tag(hline,key_tag); ++- if ( !key ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- if (n == max) { ++- max = max? max<<1 : 4; ++- ret = realloc(ret, max * sizeof(char*)); ++- } ++- ret[n++] = key->value; ++- ++- l = l->next; ++- } ++- *_n = n; ++- return ret; ++-} ++- ++-void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) ++-{ ++- list_t *l = iter; ++- if ( !l ) return NULL; ++- ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key, *value; ++- key = header_line_has_tag(hline,key_tag); ++- value = header_line_has_tag(hline,value_tag); ++- if ( !key || !value ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- *_key = key->value; ++- *_value = value->value; ++- return l->next; ++- } ++- return l; ++-} ++- ++-const char *sam_tbl_get(void *h, const char *key) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- khint_t k; ++- k = kh_get(str, tbl, key); ++- return k == kh_end(tbl)? 0 : kh_val(tbl, k); ++-} ++- ++-int sam_tbl_size(void *h) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- return h? 
kh_size(tbl) : 0; ++-} ++- ++-void sam_tbl_destroy(void *h) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- kh_destroy(str, tbl); ++-} ++- ++-void *sam_header_merge(int n, const void **_dicts) ++-{ ++- const HeaderDict **dicts = (const HeaderDict**)_dicts; ++- HeaderDict *out_dict; ++- int idict, status; ++- ++- if ( n<2 ) return NULL; ++- ++- out_dict = sam_header_clone(dicts[0]); ++- ++- for (idict=1; idictdata, out_hlines->data); ++- if ( status==0 ) ++- { ++- out_hlines = out_hlines->next; ++- continue; ++- } ++- ++- if ( status==2 ) ++- { ++- print_header_line(stderr,tmpl_hlines->data); ++- print_header_line(stderr,out_hlines->data); ++- debug("Conflicting lines, cannot merge the headers.\n"); ++- return 0; ++- } ++- if ( status==3 ) ++- sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); ++- ++- inserted = 1; ++- break; ++- } ++- if ( !inserted ) ++- out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); ++- ++- tmpl_hlines = tmpl_hlines->next; ++- } ++- } ++- ++- return out_dict; ++-} ++- ++-char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) ++-{ ++- int nout = 0; ++- char **out = NULL; ++- ++- *n = 0; ++- list_t *l = (list_t *)dict; ++- if ( !l ) return NULL; ++- ++- int i, ntags = 0; ++- while ( tags[ntags] ) ntags++; ++- ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); ++- for (i=0; ivalue; ++- } ++- nout++; ++- l = l->next; ++- } ++- *n = nout; ++- return out; ++-} ++- ++--- python-pysam.orig/samtools/sam_header.c.pysam.c +++++ /dev/null ++@@ -1,838 +0,0 @@ ++-#include "samtools.pysam.h" ++- ++-/* sam_header.c -- basic SAM/BAM header API. ++- ++- Copyright (C) 2009-2013 Genome Research Ltd. 
++- ++- Author: Petr Danecek ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. */ ++- ++-#include ++- ++-#include "sam_header.h" ++-#include ++-#include ++-#include ++-#include ++-#include ++- ++-#include "htslib/khash.h" ++-KHASH_MAP_INIT_STR(str, const char *) ++- ++-struct _HeaderList ++-{ ++- struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. 
++- struct _HeaderList *next; ++- void *data; ++-}; ++-typedef struct _HeaderList list_t; ++-typedef list_t HeaderDict; ++- ++-typedef struct ++-{ ++- char key[2]; ++- char *value; ++-} ++-HeaderTag; ++- ++-typedef struct ++-{ ++- char type[2]; ++- list_t *tags; ++-} ++-HeaderLine; ++- ++-const char *o_hd_tags[] = {"SO","GO",NULL}; ++-const char *r_hd_tags[] = {"VN",NULL}; ++- ++-const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; ++-const char *r_sq_tags[] = {"SN","LN",NULL}; ++-const char *u_sq_tags[] = {"SN",NULL}; ++- ++-const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; ++-const char *r_rg_tags[] = {"ID",NULL}; ++-const char *u_rg_tags[] = {"ID",NULL}; ++- ++-const char *o_pg_tags[] = {"VN","CL",NULL}; ++-const char *r_pg_tags[] = {"ID",NULL}; ++- ++-const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; ++-const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; ++-const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; ++-const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; ++- ++- ++-static void debug(const char *format, ...) ++-{ ++- va_list ap; ++- va_start(ap, format); ++- vfprintf(samtools_stderr, format, ap); ++- va_end(ap); ++-} ++- ++-#if 0 ++-// Replaced by list_append_to_end ++-static list_t *list_prepend(list_t *root, void *data) ++-{ ++- list_t *l = malloc(sizeof(list_t)); ++- l->next = root; ++- l->data = data; ++- return l; ++-} ++-#endif ++- ++-// Relies on the root->last being correct. Do not use with the other list_* ++-// routines unless they are fixed to modify root->last as well. 
++-static list_t *list_append_to_end(list_t *root, void *data) ++-{ ++- list_t *l = malloc(sizeof(list_t)); ++- l->last = l; ++- l->next = NULL; ++- l->data = data; ++- ++- if ( !root ) ++- return l; ++- ++- root->last->next = l; ++- root->last = l; ++- return root; ++-} ++- ++-static list_t *list_append(list_t *root, void *data) ++-{ ++- list_t *l = root; ++- while (l && l->next) ++- l = l->next; ++- if ( l ) ++- { ++- l->next = malloc(sizeof(list_t)); ++- l = l->next; ++- } ++- else ++- { ++- l = malloc(sizeof(list_t)); ++- root = l; ++- } ++- l->data = data; ++- l->next = NULL; ++- return root; ++-} ++- ++-static void list_free(list_t *root) ++-{ ++- list_t *l = root; ++- while (root) ++- { ++- l = root; ++- root = root->next; ++- free(l); ++- } ++-} ++- ++- ++- ++-// Look for a tag "XY" in a predefined const char *[] array. ++-static int tag_exists(const char *tag, const char **tags) ++-{ ++- int itag=0; ++- if ( !tags ) return -1; ++- while ( tags[itag] ) ++- { ++- if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; ++- itag++; ++- } ++- return -1; ++-} ++- ++- ++- ++-// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text ++-// or NULL if everything has been read. The lineptr should be freed by the caller. The ++-// newline character is stripped. 
++-static const char *nextline(char **lineptr, size_t *n, const char *text) ++-{ ++- int len; ++- const char *to = text; ++- ++- if ( !*to ) return NULL; ++- ++- while ( *to && *to!='\n' && *to!='\r' ) to++; ++- len = to - text + 1; ++- ++- if ( *to ) ++- { ++- // Advance the pointer for the next call ++- if ( *to=='\n' ) to++; ++- else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; ++- } ++- if ( !len ) ++- return to; ++- ++- if ( !*lineptr ) ++- { ++- *lineptr = malloc(len); ++- *n = len; ++- } ++- else if ( *nkey[0] = name[0]; ++- tag->key[1] = name[1]; ++- tag->value = malloc(len+1); ++- memcpy(tag->value,value_from,len+1); ++- tag->value[len] = 0; ++- return tag; ++-} ++- ++-static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) ++-{ ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; ++- tags = tags->next; ++- } ++- return NULL; ++-} ++- ++- ++-// Return codes: ++-// 0 .. different types or unique tags differ or conflicting tags, cannot be merged ++-// 1 .. all tags identical -> no need to merge, drop one ++-// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated ++-// 3 .. 
there are some missing complementary tags and no unique conflict -> can be merged into a single line ++-static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) ++-{ ++- HeaderTag *t1, *t2; ++- ++- if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) ++- return 0; ++- ++- int itype = tag_exists(hline1->type,types); ++- if ( itype==-1 ) { ++- debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); ++- return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code ++- } ++- ++- if ( unique_tags[itype] ) ++- { ++- t1 = header_line_has_tag(hline1,unique_tags[itype][0]); ++- t2 = header_line_has_tag(hline2,unique_tags[itype][0]); ++- if ( !t1 || !t2 ) // this should never happen, the unique tags are required ++- return 2; ++- ++- if ( strcmp(t1->value,t2->value) ) ++- return 0; // the unique tags differ, cannot be merged ++- } ++- if ( !required_tags[itype] && !optional_tags[itype] ) ++- { ++- t1 = hline1->tags->data; ++- t2 = hline2->tags->data; ++- if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments ++- return 0; ++- } ++- ++- int missing=0, itag=0; ++- while ( required_tags[itype] && required_tags[itype][itag] ) ++- { ++- t1 = header_line_has_tag(hline1,required_tags[itype][itag]); ++- t2 = header_line_has_tag(hline2,required_tags[itype][itag]); ++- if ( !t1 && !t2 ) ++- return 2; // this should never happen ++- else if ( !t1 || !t2 ) ++- missing = 1; // there is some tag missing in one of the hlines ++- else if ( strcmp(t1->value,t2->value) ) ++- { ++- if ( unique_tags[itype] ) ++- return 2; // the lines have a matching unique tag but have a conflicting tag ++- ++- return 0; // the lines contain conflicting tags, cannot be merged ++- } ++- itag++; ++- } ++- itag = 0; ++- while ( optional_tags[itype] && optional_tags[itype][itag] ) ++- { ++- t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); ++- t2 = 
header_line_has_tag(hline2,optional_tags[itype][itag]); ++- if ( !t1 && !t2 ) ++- { ++- itag++; ++- continue; ++- } ++- if ( !t1 || !t2 ) ++- missing = 1; // there is some tag missing in one of the hlines ++- else if ( strcmp(t1->value,t2->value) ) ++- { ++- if ( unique_tags[itype] ) ++- return 2; // the lines have a matching unique tag but have a conflicting tag ++- ++- return 0; // the lines contain conflicting tags, cannot be merged ++- } ++- itag++; ++- } ++- if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged ++- return 1; ++-} ++- ++- ++-static HeaderLine *sam_header_line_clone(const HeaderLine *hline) ++-{ ++- list_t *tags; ++- HeaderLine *out = malloc(sizeof(HeaderLine)); ++- out->type[0] = hline->type[0]; ++- out->type[1] = hline->type[1]; ++- out->tags = NULL; ++- ++- tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *old = tags->data; ++- ++- HeaderTag *new = malloc(sizeof(HeaderTag)); ++- new->key[0] = old->key[0]; ++- new->key[1] = old->key[1]; ++- new->value = strdup(old->value); ++- out->tags = list_append(out->tags, new); ++- ++- tags = tags->next; ++- } ++- return out; ++-} ++- ++-static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) ++-{ ++- list_t *tmpl_tags; ++- ++- if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) ++- return 0; ++- ++- tmpl_tags = tmpl_hline->tags; ++- while (tmpl_tags) ++- { ++- HeaderTag *tmpl_tag = tmpl_tags->data; ++- HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); ++- if ( !out_tag ) ++- { ++- HeaderTag *tag = malloc(sizeof(HeaderTag)); ++- tag->key[0] = tmpl_tag->key[0]; ++- tag->key[1] = tmpl_tag->key[1]; ++- tag->value = strdup(tmpl_tag->value); ++- out_hline->tags = list_append(out_hline->tags,tag); ++- } ++- tmpl_tags = tmpl_tags->next; ++- } ++- return 1; ++-} ++- ++- ++-static HeaderLine *sam_header_line_parse(const char *headerLine) ++-{ ++- HeaderLine *hline; 
++- HeaderTag *tag; ++- const char *from, *to; ++- from = headerLine; ++- ++- if ( *from != '@' ) { ++- debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); ++- return 0; ++- } ++- to = ++from; ++- ++- while (*to && *to!='\t') to++; ++- if ( to-from != 2 ) { ++- debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); ++- return 0; ++- } ++- ++- hline = malloc(sizeof(HeaderLine)); ++- hline->type[0] = from[0]; ++- hline->type[1] = from[1]; ++- hline->tags = NULL; ++- ++- int itype = tag_exists(hline->type, types); ++- ++- from = to; ++- while (*to && *to=='\t') to++; ++- if ( to-from != 1 ) { ++- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); ++- free(hline); ++- return 0; ++- } ++- from = to; ++- while (*from) ++- { ++- while (*to && *to!='\t') to++; ++- ++- if ( !required_tags[itype] && !optional_tags[itype] ) ++- { ++- // CO is a special case, it can contain anything, including tabs ++- if ( *to ) { to++; continue; } ++- tag = new_tag(" ",from,to-1); ++- } ++- else ++- tag = new_tag(from,from+3,to-1); ++- ++- if ( header_line_has_tag(hline,tag->key) ) ++- debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); ++- hline->tags = list_append(hline->tags, tag); ++- ++- from = to; ++- while (*to && *to=='\t') to++; ++- if ( *to && to-from != 1 ) { ++- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); ++- return 0; ++- } ++- ++- from = to; ++- } ++- return hline; ++-} ++- ++- ++-// Must be of an existing type, all tags must be recognised and all required tags must be present ++-static int sam_header_line_validate(HeaderLine *hline) ++-{ ++- list_t *tags; ++- HeaderTag *tag; ++- int itype, itag; ++- ++- // Is the type correct? 
++- itype = tag_exists(hline->type, types); ++- if ( itype==-1 ) ++- { ++- debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); ++- return 0; ++- } ++- ++- // Has all required tags? ++- itag = 0; ++- while ( required_tags[itype] && required_tags[itype][itag] ) ++- { ++- if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) ++- { ++- debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], ++- hline->type[0],hline->type[1]); ++- return 0; ++- } ++- itag++; ++- } ++- ++- // Are all tags recognised? ++- tags = hline->tags; ++- while ( tags ) ++- { ++- tag = tags->data; ++- if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) ++- { ++- // Lower case tags are user-defined values. ++- if( !(islower(tag->key[0]) || islower(tag->key[1])) ) ++- { ++- // Neither is lower case, but tag was not recognized. ++- debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); ++- // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes ++- } ++- // else - allow user defined tag ++- } ++- tags = tags->next; ++- } ++- ++- return 1; ++-} ++- ++- ++-static void print_header_line(FILE *fp, HeaderLine *hline) ++-{ ++- list_t *tags = hline->tags; ++- HeaderTag *tag; ++- ++- fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); ++- while (tags) ++- { ++- tag = tags->data; ++- ++- fprintf(fp, "\t"); ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); ++- fprintf(fp, "%s", tag->value); ++- ++- tags = tags->next; ++- } ++- fprintf(fp,"\n"); ++-} ++- ++- ++-static void sam_header_line_free(HeaderLine *hline) ++-{ ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- free(tag->value); ++- free(tag); ++- tags = tags->next; ++- } ++- list_free(hline->tags); ++- free(hline); ++-} ++- ++-void 
sam_header_free(void *_header) ++-{ ++- HeaderDict *header = (HeaderDict*)_header; ++- list_t *hlines = header; ++- while (hlines) ++- { ++- sam_header_line_free(hlines->data); ++- hlines = hlines->next; ++- } ++- list_free(header); ++-} ++- ++-HeaderDict *sam_header_clone(const HeaderDict *dict) ++-{ ++- HeaderDict *out = NULL; ++- while (dict) ++- { ++- HeaderLine *hline = dict->data; ++- out = list_append(out, sam_header_line_clone(hline)); ++- dict = dict->next; ++- } ++- return out; ++-} ++- ++-// Returns a newly allocated string ++-char *sam_header_write(const void *_header) ++-{ ++- const HeaderDict *header = (const HeaderDict*)_header; ++- char *out = NULL; ++- int len=0, nout=0; ++- const list_t *hlines; ++- ++- // Calculate the length of the string to allocate ++- hlines = header; ++- while (hlines) ++- { ++- len += 4; // @XY and \n ++- ++- HeaderLine *hline = hlines->data; ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- len += strlen(tag->value) + 1; // \t ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- len += strlen(tag->value) + 3; // XY: ++- tags = tags->next; ++- } ++- hlines = hlines->next; ++- } ++- ++- nout = 0; ++- out = malloc(len+1); ++- hlines = header; ++- while (hlines) ++- { ++- HeaderLine *hline = hlines->data; ++- ++- nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); ++- ++- list_t *tags = hline->tags; ++- while (tags) ++- { ++- HeaderTag *tag = tags->data; ++- nout += sprintf(out+nout,"\t"); ++- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) ++- nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); ++- nout += sprintf(out+nout,"%s", tag->value); ++- tags = tags->next; ++- } ++- hlines = hlines->next; ++- nout += sprintf(out+nout,"\n"); ++- } ++- out[len] = 0; ++- return out; ++-} ++- ++-void *sam_header_parse2(const char *headerText) ++-{ ++- list_t *hlines = NULL; ++- HeaderLine *hline; ++- const char *text; ++- char *buf=NULL; ++- size_t nbuf = 0; ++- int tovalidate = 0; 
++- ++- if ( !headerText ) ++- return 0; ++- ++- text = headerText; ++- while ( (text=nextline(&buf, &nbuf, text)) ) ++- { ++- hline = sam_header_line_parse(buf); ++- if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) ++- // With too many (~250,000) reference sequences the header parsing was too slow with list_append. ++- hlines = list_append_to_end(hlines, hline); ++- else ++- { ++- if (hline) sam_header_line_free(hline); ++- sam_header_free(hlines); ++- if ( buf ) free(buf); ++- return NULL; ++- } ++- } ++- if ( buf ) free(buf); ++- ++- return hlines; ++-} ++- ++-void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) ++-{ ++- const HeaderDict *dict = (const HeaderDict*)_dict; ++- const list_t *l = dict; ++- khash_t(str) *tbl = kh_init(str); ++- khiter_t k; ++- int ret; ++- ++- if (_dict == 0) return tbl; // return an empty (not null) hash table ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key, *value; ++- key = header_line_has_tag(hline,key_tag); ++- value = header_line_has_tag(hline,value_tag); ++- if ( !key || !value ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- k = kh_get(str, tbl, key->value); ++- if ( k != kh_end(tbl) ) ++- debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); ++- k = kh_put(str, tbl, key->value, &ret); ++- kh_value(tbl, k) = value->value; ++- ++- l = l->next; ++- } ++- return tbl; ++-} ++- ++-char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) ++-{ ++- const HeaderDict *dict = (const HeaderDict*)_dict; ++- const list_t *l = dict; ++- int max, n; ++- char **ret; ++- ++- ret = 0; *_n = max = n = 0; ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key; ++- key = 
header_line_has_tag(hline,key_tag); ++- if ( !key ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- if (n == max) { ++- max = max? max<<1 : 4; ++- ret = realloc(ret, max * sizeof(char*)); ++- } ++- ret[n++] = key->value; ++- ++- l = l->next; ++- } ++- *_n = n; ++- return ret; ++-} ++- ++-void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) ++-{ ++- list_t *l = iter; ++- if ( !l ) return NULL; ++- ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- HeaderTag *key, *value; ++- key = header_line_has_tag(hline,key_tag); ++- value = header_line_has_tag(hline,value_tag); ++- if ( !key || !value ) ++- { ++- l = l->next; ++- continue; ++- } ++- ++- *_key = key->value; ++- *_value = value->value; ++- return l->next; ++- } ++- return l; ++-} ++- ++-const char *sam_tbl_get(void *h, const char *key) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- khint_t k; ++- k = kh_get(str, tbl, key); ++- return k == kh_end(tbl)? 0 : kh_val(tbl, k); ++-} ++- ++-int sam_tbl_size(void *h) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- return h? 
kh_size(tbl) : 0; ++-} ++- ++-void sam_tbl_destroy(void *h) ++-{ ++- khash_t(str) *tbl = (khash_t(str)*)h; ++- kh_destroy(str, tbl); ++-} ++- ++-void *sam_header_merge(int n, const void **_dicts) ++-{ ++- const HeaderDict **dicts = (const HeaderDict**)_dicts; ++- HeaderDict *out_dict; ++- int idict, status; ++- ++- if ( n<2 ) return NULL; ++- ++- out_dict = sam_header_clone(dicts[0]); ++- ++- for (idict=1; idictdata, out_hlines->data); ++- if ( status==0 ) ++- { ++- out_hlines = out_hlines->next; ++- continue; ++- } ++- ++- if ( status==2 ) ++- { ++- print_header_line(samtools_stderr,tmpl_hlines->data); ++- print_header_line(samtools_stderr,out_hlines->data); ++- debug("Conflicting lines, cannot merge the headers.\n"); ++- return 0; ++- } ++- if ( status==3 ) ++- sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); ++- ++- inserted = 1; ++- break; ++- } ++- if ( !inserted ) ++- out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); ++- ++- tmpl_hlines = tmpl_hlines->next; ++- } ++- } ++- ++- return out_dict; ++-} ++- ++-char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) ++-{ ++- int nout = 0; ++- char **out = NULL; ++- ++- *n = 0; ++- list_t *l = (list_t *)dict; ++- if ( !l ) return NULL; ++- ++- int i, ntags = 0; ++- while ( tags[ntags] ) ntags++; ++- ++- while (l) ++- { ++- HeaderLine *hline = l->data; ++- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) ++- { ++- l = l->next; ++- continue; ++- } ++- out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); ++- for (i=0; ivalue; ++- } ++- nout++; ++- l = l->next; ++- } ++- *n = nout; ++- return out; ++-} ++- ++--- python-pysam.orig/samtools/sam_header.h +++++ /dev/null ++@@ -1,72 +0,0 @@ ++-/* sam_header.h -- basic SAM/BAM header API. ++- ++- Copyright (C) 2009, 2012, 2013 Genome Research Ltd. 
++- ++- Author: Petr Danecek ++- ++-Permission is hereby granted, free of charge, to any person obtaining a copy ++-of this software and associated documentation files (the "Software"), to deal ++-in the Software without restriction, including without limitation the rights ++-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++-copies of the Software, and to permit persons to whom the Software is ++-furnished to do so, subject to the following conditions: ++- ++-The above copyright notice and this permission notice shall be included in ++-all copies or substantial portions of the Software. ++- ++-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++-DEALINGS IN THE SOFTWARE. 
*/ ++- ++-#ifndef __SAM_HEADER_H__ ++-#define __SAM_HEADER_H__ ++- ++-#ifdef __cplusplus ++-extern "C" { ++-#endif ++- ++- void *sam_header_parse2(const char *headerText); ++- void *sam_header_merge(int n, const void **dicts); ++- void sam_header_free(void *header); ++- char *sam_header_write(const void *headerDict); // returns a newly allocated string ++- ++- /* ++- // Usage example ++- const char *key, *val; ++- void *iter = sam_header_parse2(bam->header->text); ++- while ( iter = sam_header_key_val(iter, "RG","ID","SM" &key,&val) ) printf("%s\t%s\n", key,val); ++- */ ++- void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); ++- char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); ++- ++- /* ++- // Usage example ++- int i, j, n; ++- const char *tags[] = {"SN","LN","UR","M5",NULL}; ++- void *dict = sam_header_parse2(bam->header->text); ++- char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n); ++- for (i=0; i ++ ++@@ -66,8 +66,23 @@ ++ break; ++ } else if (strcmp(lopt->name, "reference") == 0) { ++ char *ref = malloc(10 + strlen(optarg) + 1); +++ +++ if (!ref) { +++ fprintf(stderr, "Unable to allocate memory in " +++ "parse_sam_global_opt.\n"); +++ +++ return -1; +++ } +++ ++ sprintf(ref, "reference=%s", optarg); ++- ga->reference = strdup(optarg); +++ +++ if (!(ga->reference = strdup(optarg))) { +++ fprintf(stderr, "Unable to allocate memory in " +++ "parse_sam_global_opt.\n"); +++ +++ return -1; +++ } +++ ++ r = hts_opt_add((hts_opt **)&ga->in.specific, ref); ++ r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); ++ free(ref); ++@@ -75,17 +90,32 @@ ++ } else if (strcmp(lopt->name, "threads") == 0) { ++ ga->nthreads = atoi(optarg); ++ break; ++-// } else if (strcmp(lopt->name, "verbose") == 0) { ++-// ga->verbosity++; ++-// break; +++ } else if (strcmp(lopt->name, "write-index") == 0) { +++ ga->write_index = 1; +++ break; +++ } else if 
(strcmp(lopt->name, "verbosity") == 0) { +++ hts_verbose = atoi(optarg); +++ break; ++ } ++ } ++ ++ if (!lopt->name) { ++- fprintf(stderr, "Unexpected global option: %s\n", lopt->name); +++ fprintf(stderr, "Unexpected global option.\n"); ++ return -1; ++ } ++ +++ /* +++ * SAM format with compression enabled implies SAM.bgzf +++ */ +++ if (ga->out.format == sam) { +++ hts_opt *opts = (hts_opt *)ga->out.specific; +++ while (opts) { +++ if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) +++ ga->out.compression = bgzf; +++ opts = opts->next; +++ } +++ } +++ ++ return r; ++ } ++ ++@@ -136,9 +166,12 @@ ++ else if (strcmp(lopts[i].name, "threads") == 0) ++ fprintf(fp,"threads INT\n" ++ " Number of additional threads to use [0]\n"); ++-// else if (strcmp(lopts[i].name, "verbose") == 0) ++-// fprintf(fp,"verbose\n" ++-// " Increment level of verbosity\n"); +++ else if (strcmp(lopts[i].name, "write-index") == 0) +++ fprintf(fp,"write-index\n" +++ " Automatically index the output files [off]\n"); +++ else if (strcmp(lopts[i].name, "verbosity") == 0) +++ fprintf(fp,"verbosity INT\n" +++ " Set level of verbosity\n"); ++ } ++ } ++ ++--- python-pysam.orig/samtools/sam_opts.c.pysam.c +++++ python-pysam/samtools/sam_opts.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* sam_opts.c -- utilities to aid parsing common command line options. ++ ++- Copyright (C) 2015 Genome Research Ltd. +++ Copyright (C) 2015, 2019 Genome Research Ltd. 
++ ++ Author: James Bonfield ++ ++@@ -68,8 +68,23 @@ ++ break; ++ } else if (strcmp(lopt->name, "reference") == 0) { ++ char *ref = malloc(10 + strlen(optarg) + 1); +++ +++ if (!ref) { +++ fprintf(samtools_stderr, "Unable to allocate memory in " +++ "parse_sam_global_opt.\n"); +++ +++ return -1; +++ } +++ ++ sprintf(ref, "reference=%s", optarg); ++- ga->reference = strdup(optarg); +++ +++ if (!(ga->reference = strdup(optarg))) { +++ fprintf(samtools_stderr, "Unable to allocate memory in " +++ "parse_sam_global_opt.\n"); +++ +++ return -1; +++ } +++ ++ r = hts_opt_add((hts_opt **)&ga->in.specific, ref); ++ r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); ++ free(ref); ++@@ -77,17 +92,32 @@ ++ } else if (strcmp(lopt->name, "threads") == 0) { ++ ga->nthreads = atoi(optarg); ++ break; ++-// } else if (strcmp(lopt->name, "verbose") == 0) { ++-// ga->verbosity++; ++-// break; +++ } else if (strcmp(lopt->name, "write-index") == 0) { +++ ga->write_index = 1; +++ break; +++ } else if (strcmp(lopt->name, "verbosity") == 0) { +++ hts_verbose = atoi(optarg); +++ break; ++ } ++ } ++ ++ if (!lopt->name) { ++- fprintf(samtools_stderr, "Unexpected global option: %s\n", lopt->name); +++ fprintf(samtools_stderr, "Unexpected global option.\n"); ++ return -1; ++ } ++ +++ /* +++ * SAM format with compression enabled implies SAM.bgzf +++ */ +++ if (ga->out.format == sam) { +++ hts_opt *opts = (hts_opt *)ga->out.specific; +++ while (opts) { +++ if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) +++ ga->out.compression = bgzf; +++ opts = opts->next; +++ } +++ } +++ ++ return r; ++ } ++ ++@@ -138,9 +168,12 @@ ++ else if (strcmp(lopts[i].name, "threads") == 0) ++ fprintf(fp,"threads INT\n" ++ " Number of additional threads to use [0]\n"); ++-// else if (strcmp(lopts[i].name, "verbose") == 0) ++-// fprintf(fp,"verbose\n" ++-// " Increment level of verbosity\n"); +++ else if (strcmp(lopts[i].name, "write-index") == 0) +++ fprintf(fp,"write-index\n" +++ " Automatically index the output files 
[off]\n"); +++ else if (strcmp(lopts[i].name, "verbosity") == 0) +++ fprintf(fp,"verbosity INT\n" +++ " Set level of verbosity\n"); ++ } ++ } ++ ++--- python-pysam.orig/samtools/sam_opts.h +++++ python-pysam/samtools/sam_opts.h ++@@ -1,6 +1,6 @@ ++ /* sam_opts.h -- utilities to aid parsing common command line options. ++ ++- Copyright (C) 2015 Genome Research Ltd. +++ Copyright (C) 2015, 2019 Genome Research Ltd. ++ ++ Author: James Bonfield ++ ++@@ -35,7 +35,7 @@ ++ htsFormat out; ++ char *reference; ++ int nthreads; ++- //int verbosity; +++ int write_index; ++ } sam_global_args; ++ ++ #define SAM_GLOBAL_ARGS_INIT {{0},{0}} ++@@ -47,7 +47,8 @@ ++ SAM_OPT_OUTPUT_FMT_OPTION, ++ SAM_OPT_REFERENCE, ++ SAM_OPT_NTHREADS, ++- //SAM_OPT_VERBOSE +++ SAM_OPT_WRITE_INDEX, +++ SAM_OPT_VERBOSITY, ++ }; ++ ++ #define SAM_OPT_VAL(val, defval) ((val) == '-')? '?' : (val)? (val) : (defval) ++@@ -64,8 +65,9 @@ ++ {"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \ ++ {"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \ ++ {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}, \ ++- {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)} ++- //{"verbose", no_argument, NULL, SAM_OPT_VERBOSE} +++ {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)}, \ +++ {"write-index", no_argument, NULL, SAM_OPT_WRITE_INDEX}, \ +++ {"verbosity", required_argument, NULL, SAM_OPT_VERBOSITY} ++ ++ /* ++ * Processes a standard "global" samtools long option. ++--- python-pysam.orig/samtools/sam_utils.c +++++ python-pysam/samtools/sam_utils.c ++@@ -1,6 +1,6 @@ ++ /* sam_utils.c -- various utilities internal to samtools. ++ ++- Copyright (C) 2014-2016 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: John Marshall ++ ++@@ -23,6 +23,7 @@ ++ DEALINGS IN THE SOFTWARE. 
*/ ++ ++ #include +++#include ++ ++ #include ++ #include ++@@ -58,3 +59,80 @@ ++ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); ++ va_end(args); ++ } +++ +++void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) +++{ +++ int r = sam_close(fp); +++ if (r >= 0) return; +++ +++ // TODO Need error infrastructure so we can print a message instead of r +++ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); +++ else print_error(subcmd, "error closing %s: %d", null_fname, r); +++ +++ *retp = EXIT_FAILURE; +++} +++ +++/* Pick an index suffix based on the output file descriptor type. */ +++static char *idx_suffix(htsFile *fp) { +++ switch (fp->format.format) { +++ case sam: +++ case bam: +++ // Tough cheese if you wanted bai! +++ // New feature => mandatory new index too, for simplicity of CLI. +++ return "csi"; +++ +++ case cram: +++ return "crai"; +++ +++ default: +++ return NULL; +++ } +++} +++ +++/* +++ * Utility function to add an index to a file we've opened for write. +++ * NB: Call this after writing the header and before writing sequences. +++ * +++ * The returned index filename should be freed by the caller, but only +++ * after sam_idx_save has been called. +++ * +++ * Returns index filename on success, +++ * NULL on failure. 
+++ */ +++char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { +++ char *fn_idx; +++ int min_shift = 14; /* CSI */ +++ if (!fn || !*fn || strcmp(fn, "-") == 0) +++ return NULL; +++ +++ char *delim = strstr(fn, HTS_IDX_DELIM); +++ if (delim != NULL) { +++ delim += strlen(HTS_IDX_DELIM); +++ +++ fn_idx = strdup(delim); +++ if (!fn_idx) +++ return NULL; +++ +++ size_t l = strlen(fn_idx); +++ if (l >= 4 && strcmp(fn_idx + l - 4, ".bai") == 0) +++ min_shift = 0; +++ } else { +++ char *suffix = idx_suffix(fp); +++ if (!suffix) +++ return NULL; +++ +++ fn_idx = malloc(strlen(fn)+6); +++ if (!fn_idx) +++ return NULL; +++ +++ sprintf(fn_idx, "%s.%s", fn, suffix); +++ } +++ +++ if (sam_idx_init(fp, header, min_shift, fn_idx) < 0) { +++ print_error_errno("auto_index", "failed to open index \"%s\" for writing", fn_idx); +++ free(fn_idx); +++ return NULL; +++ } +++ +++ return fn_idx; +++} ++--- python-pysam.orig/samtools/sam_utils.c.pysam.c +++++ python-pysam/samtools/sam_utils.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* sam_utils.c -- various utilities internal to samtools. ++ ++- Copyright (C) 2014-2016 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: John Marshall ++ ++@@ -25,6 +25,7 @@ ++ DEALINGS IN THE SOFTWARE. */ ++ ++ #include +++#include ++ ++ #include ++ #include ++@@ -60,3 +61,80 @@ ++ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); ++ va_end(args); ++ } +++ +++void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) +++{ +++ int r = sam_close(fp); +++ if (r >= 0) return; +++ +++ // TODO Need error infrastructure so we can print a message instead of r +++ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); +++ else print_error(subcmd, "error closing %s: %d", null_fname, r); +++ +++ *retp = EXIT_FAILURE; +++} +++ +++/* Pick an index suffix based on the output file descriptor type. 
*/ +++static char *idx_suffix(htsFile *fp) { +++ switch (fp->format.format) { +++ case sam: +++ case bam: +++ // Tough cheese if you wanted bai! +++ // New feature => mandatory new index too, for simplicity of CLI. +++ return "csi"; +++ +++ case cram: +++ return "crai"; +++ +++ default: +++ return NULL; +++ } +++} +++ +++/* +++ * Utility function to add an index to a file we've opened for write. +++ * NB: Call this after writing the header and before writing sequences. +++ * +++ * The returned index filename should be freed by the caller, but only +++ * after sam_idx_save has been called. +++ * +++ * Returns index filename on success, +++ * NULL on failure. +++ */ +++char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { +++ char *fn_idx; +++ int min_shift = 14; /* CSI */ +++ if (!fn || !*fn || strcmp(fn, "-") == 0) +++ return NULL; +++ +++ char *delim = strstr(fn, HTS_IDX_DELIM); +++ if (delim != NULL) { +++ delim += strlen(HTS_IDX_DELIM); +++ +++ fn_idx = strdup(delim); +++ if (!fn_idx) +++ return NULL; +++ +++ size_t l = strlen(fn_idx); +++ if (l >= 4 && strcmp(fn_idx + l - 4, ".bai") == 0) +++ min_shift = 0; +++ } else { +++ char *suffix = idx_suffix(fp); +++ if (!suffix) +++ return NULL; +++ +++ fn_idx = malloc(strlen(fn)+6); +++ if (!fn_idx) +++ return NULL; +++ +++ sprintf(fn_idx, "%s.%s", fn, suffix); +++ } +++ +++ if (sam_idx_init(fp, header, min_shift, fn_idx) < 0) { +++ print_error_errno("auto_index", "failed to open index \"%s\" for writing", fn_idx); +++ free(fn_idx); +++ return NULL; +++ } +++ +++ return fn_idx; +++} ++--- python-pysam.orig/samtools/sam_view.c +++++ python-pysam/samtools/sam_view.c ++@@ -1,6 +1,6 @@ ++ /* sam_view.c -- SAM<->BAM<->CRAM conversion. ++ ++- Copyright (C) 2009-2017 Genome Research Ltd. +++ Copyright (C) 2009-2019 Genome Research Ltd. ++ Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
++ ++ Author: Heng Li ++@@ -32,33 +32,25 @@ ++ #include ++ #include ++ #include ++-#include ++-#include ++ #include ++-#include ++ #include "htslib/sam.h" ++ #include "htslib/faidx.h" ++-#include "htslib/kstring.h" ++ #include "htslib/khash.h" ++-#include "htslib/klist.h" ++ #include "htslib/thread_pool.h" ++-#include "htslib/bgzf.h" ++ #include "samtools.h" ++ #include "sam_opts.h" ++ #include "bedidx.h" ++ ++-#define DEFAULT_BARCODE_TAG "BC" ++-#define DEFAULT_QUALITY_TAG "QT" ++- ++ KHASH_SET_INIT_STR(rg) ++-#define taglist_free(p) ++-KLIST_INIT(ktaglist, char*, taglist_free) +++KHASH_SET_INIT_STR(tv) ++ ++ typedef khash_t(rg) *rghash_t; +++typedef khash_t(tv) *tvhash_t; ++ ++ // This structure contains the settings for a samview run ++ typedef struct samview_settings { ++ rghash_t rghash; +++ tvhash_t tvhash; ++ int min_mapQ; ++ int flag_on; ++ int flag_off; ++@@ -72,16 +64,17 @@ ++ size_t remove_aux_len; ++ char** remove_aux; ++ int multi_region; +++ char* tag; ++ } samview_settings_t; ++ ++ ++ // TODO Add declarations of these to a viable htslib or samtools header ++-extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); +++extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); ++ extern int bam_remove_B(bam1_t *b); ++ extern char *samfaipath(const char *fn_ref); ++ ++ // Returns 0 to indicate read should be output 1 otherwise ++-static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) +++static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) ++ { ++ if (settings->remove_B) bam_remove_B(b); ++ if (settings->min_qlen > 0) { ++@@ -96,7 +89,7 @@ ++ return 1; ++ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) ++ return 1; ++- if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) +++ if (!settings->multi_region && settings->bed 
&& (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) ++ return 1; ++ if (settings->subsam_frac > 0.) { ++ uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); ++@@ -109,8 +102,17 @@ ++ if (k == kh_end(settings->rghash)) return 1; ++ } ++ } +++ if (settings->tvhash && settings->tag) { +++ uint8_t *s = bam_aux_get(b, settings->tag); +++ if (s) { +++ khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); +++ if (k == kh_end(settings->tvhash)) return 1; +++ } else { +++ return 1; +++ } +++ } ++ if (settings->library) { ++- const char *p = bam_get_library((bam_hdr_t*)h, b); +++ const char *p = bam_get_library((sam_hdr_t*)h, b); ++ if (!p || strcmp(p, settings->library) != 0) return 1; ++ } ++ if (settings->remove_aux_len) { ++@@ -125,37 +127,6 @@ ++ return 0; ++ } ++ ++-static char *drop_rg(char *hdtxt, rghash_t h, int *len) ++-{ ++- char *p = hdtxt, *q, *r, *s; ++- kstring_t str; ++- memset(&str, 0, sizeof(kstring_t)); ++- while (1) { ++- int toprint = 0; ++- q = strchr(p, '\n'); ++- if (q == 0) q = p + strlen(p); ++- if (q - p < 3) break; // the line is too short; then stop ++- if (strncmp(p, "@RG\t", 4) == 0) { ++- int c; ++- khint_t k; ++- if ((r = strstr(p, "\tID:")) != 0) { ++- r += 4; ++- for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); ++- c = *s; *s = '\0'; ++- k = kh_get(rg, h, r); ++- *s = c; ++- if (k != kh_end(h)) toprint = 1; ++- } ++- } else toprint = 1; ++- if (toprint) { ++- kputsn(p, q - p, &str); kputc('\n', &str); ++- } ++- p = q + 1; ++- } ++- *len = str.l; ++- return str.s; ++-} ++- ++ static int usage(FILE *fp, int exit_status, int is_long_help); ++ ++ static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) ++@@ -217,39 +188,87 @@ ++ return (ret != -1) ? 
0 : -1; ++ } ++ ++-static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) +++static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) ++ { ++- int r = sam_write1(fp, h, b); ++- if (r >= 0) return r; +++ char *d = strdup(name); +++ int ret = 0; ++ ++- if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); ++- else print_error_errno("view", "writing to standard output failed"); +++ if (d == NULL) goto err; ++ ++- *retp = EXIT_FAILURE; ++- return r; +++ if (settings->tvhash == NULL) { +++ settings->tvhash = kh_init(tv); +++ if (settings->tvhash == NULL) goto err; +++ } +++ +++ kh_put(tv, settings->tvhash, d, &ret); +++ if (ret == -1) goto err; +++ if (ret == 0) free(d); /* Duplicate */ +++ return 0; +++ +++ err: +++ print_error(subcmd, "Couldn't add \"%s\" to tag values list: memory exhausted?", name); +++ free(d); +++ return -1; +++} +++ +++static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) +++{ +++ FILE *fp; +++ char buf[1024]; +++ int ret = 0; +++ if (settings->tvhash == NULL) { +++ settings->tvhash = kh_init(tv); +++ if (settings->tvhash == NULL) { +++ perror(NULL); +++ return -1; +++ } +++ } +++ +++ fp = fopen(fn, "r"); +++ if (fp == NULL) { +++ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); +++ return -1; +++ } +++ +++ while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { +++ char *d = strdup(buf); +++ if (d != NULL) { +++ kh_put(tv, settings->tvhash, d, &ret); +++ if (ret == 0) free(d); /* Duplicate */ +++ } else { +++ ret = -1; +++ } +++ } +++ if (ferror(fp)) ret = -1; +++ if (ret == -1) { +++ print_error_errno(subcmd, "failed to read \"%s\"", fn); +++ } +++ fclose(fp); +++ return (ret != -1) ? 
0 : -1; ++ } ++ ++-static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) +++static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) ++ { ++- int r = sam_close(fp); ++- if (r >= 0) return; +++ int r = sam_write1(fp, h, b); +++ if (r >= 0) return r; ++ ++- // TODO Need error infrastructure so we can print a message instead of r ++- if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); ++- else print_error(subcmd, "error closing %s: %d", null_fname, r); +++ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); +++ else print_error_errno("view", "writing to standard output failed"); ++ ++ *retp = EXIT_FAILURE; +++ return r; ++ } ++ ++ int main_samview(int argc, char *argv[]) ++ { ++- int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; +++ int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; ++ int64_t count = 0; ++ samFile *in = 0, *out = 0, *un_out=0; ++ FILE *fp_out = NULL; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ char out_mode[5], out_un_mode[5], *out_format = ""; ++- char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; +++ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; +++ char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ htsThreadPool p = {NULL, 0}; ++ int filter_state = ALL, filter_op = 0; ++@@ -257,6 +276,7 @@ ++ ++ samview_settings_t settings = { ++ .rghash = NULL, +++ .tvhash = NULL, ++ .min_mapQ = 0, ++ .flag_on = 0, ++ .flag_off = 0, ++@@ -267,11 +287,13 @@ ++ .subsam_frac = -1., ++ .library = NULL, ++ .bed = NULL, ++- .multi_region = 0 +++ .multi_region = 0, +++ .tag = NULL ++ }; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), +++ 
{"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -288,7 +310,7 @@ ++ opterr = 0; ++ ++ while ((c = getopt_long(argc, argv, ++- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:L:s:@:m:x:U:M", +++ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", ++ lopts, NULL)) >= 0) { ++ switch (c) { ++ case 's': ++@@ -298,7 +320,6 @@ ++ srand(settings.subsam_seed); ++ settings.subsam_seed = rand(); ++ } ++- ++ if (q && *q == '.') { ++ settings.subsam_frac = strtod(q, &q); ++ if (*q) ret = 1; ++@@ -321,6 +342,7 @@ ++ case 'H': is_header_only = 1; break; ++ case 'o': fn_out = strdup(optarg); break; ++ case 'U': fn_un_out = strdup(optarg); break; +++ case 'X': has_index_file = 1; break; ++ case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; ++ case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; ++ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; ++@@ -347,6 +369,63 @@ ++ goto view_end; ++ } ++ break; +++ case 'd': +++ if (strlen(optarg) < 4 || optarg[2] != ':') { +++ print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ +++ if (settings.tag) { +++ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { +++ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); +++ ret = 1; +++ goto view_end; +++ } +++ } else { +++ if (!(settings.tag = calloc(3, 1))) { +++ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ memcpy(settings.tag, optarg, 2); +++ } +++ +++ if (add_tag_value_single("view", &settings, optarg+3) != 0) { +++ ret = 1; +++ goto view_end; +++ } +++ break; +++ case 'D': +++ // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX +++ // path translation as described at: +++ // http://www.mingw.org/wiki/Posix_path_conversion +++ if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { +++ print_error_errno("view", "Invalid \"tag:file\" option: 
\"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ +++ if (settings.tag) { +++ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { +++ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); +++ ret = 1; +++ goto view_end; +++ } +++ } else { +++ if (!(settings.tag = calloc(3, 1))) { +++ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ memcpy(settings.tag, optarg, 2); +++ } +++ +++ if (add_tag_values_file("view", &settings, optarg+3) != 0) { +++ ret = 1; +++ goto view_end; +++ } +++ break; ++ /* REMOVED as htslib doesn't support this ++ //case 'x': out_format = "x"; break; ++ //case 'X': out_format = "X"; break; ++@@ -380,6 +459,7 @@ ++ } ++ break; ++ case 'M': settings.multi_region = 1; break; +++ case 1: no_pg = 1; break; ++ default: ++ if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) ++ return usage(stderr, EXIT_FAILURE, 0); ++@@ -429,13 +509,8 @@ ++ ret = 1; ++ goto view_end; ++ } ++- if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... ++- char *tmp; ++- int l; ++- tmp = drop_rg(header->text, settings.rghash, &l); ++- free(header->text); ++- header->text = tmp; ++- header->l_text = l; +++ if (settings.rghash) { +++ sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); ++ } ++ if (!is_count) { ++ if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { ++@@ -450,7 +525,25 @@ ++ goto view_end; ++ } ++ } ++- if (*out_format || is_header || +++ +++ if (!no_pg) { +++ if (!(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("view", "failed to create arg_list"); +++ ret = 1; +++ goto view_end; +++ } +++ if (sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)) { +++ print_error("view", "failed to add PG line to the header"); +++ ret = 1; +++ goto view_end; +++ } +++ } +++ +++ if (*out_format || ga.write_index || is_header || ++ out_mode[1] == 'b' || out_mode[1] == 'c' || ++ (ga.out.format != sam && ga.out.format != unknown_format)) { ++ if (sam_hdr_write(out, header) != 0) { ++@@ -459,6 +552,13 @@ ++ goto view_end; ++ } ++ } +++ if (ga.write_index) { +++ if (!(fn_out_idx = auto_index(out, fn_out, header))) { +++ ret = 1; +++ goto view_end; +++ } +++ } +++ ++ if (fn_un_out) { ++ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { ++ print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ++@@ -481,6 +581,12 @@ ++ goto view_end; ++ } ++ } +++ if (ga.write_index) { +++ if (!(fn_un_out_idx = auto_index(un_out, fn_un_out, header))) { +++ ret = 1; +++ goto view_end; +++ } +++ } ++ } ++ } ++ else { ++@@ -505,11 +611,23 @@ ++ } ++ if (is_header_only) goto view_end; // no need to print alignments ++ +++ if (has_index_file) { +++ fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; +++ if (fn_idx_in == 0) { +++ fprintf(stderr, "[main_samview] incorrect number of arguments for -X option. 
Aborting.\n"); +++ return 1; +++ } +++ } +++ ++ if (settings.multi_region) { ++- if (optind < argc - 1) { //regions have been specified in the command line +++ if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line ++ settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file ++ if (!filter_op) ++ filter_state = FILTERED; +++ } else if (has_index_file && optind < argc - 2) { +++ settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file +++ if (!filter_op) +++ filter_state = FILTERED; ++ } else { ++ bed_unify(settings.bed); ++ } ++@@ -518,7 +636,13 @@ ++ if (settings.bed == NULL) { // index is unavailable or no regions have been specified ++ fprintf(stderr, "[main_samview] no regions or BED file have been provided. 
Aborting.\n"); ++ } else { ++- hts_idx_t *idx = sam_index_load(in, fn_in); // load index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx_in != 0) { +++ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index +++ } else { +++ idx = sam_index_load(in, fn_in); +++ } ++ if (idx != NULL) { ++ ++ int regcount = 0; ++@@ -555,7 +679,7 @@ ++ } ++ bam_destroy1(b); ++ } else { ++- if (optind + 1 >= argc) { // convert/print the entire file +++ if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file ++ bam1_t *b = bam_init1(); ++ int r; ++ while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' ++@@ -574,22 +698,25 @@ ++ } else { // retrieve alignments in specified regions ++ int i; ++ bam1_t *b; ++- hts_idx_t *idx = sam_index_load(in, fn_in); // load index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx_in != NULL) { +++ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index +++ } else { +++ idx = sam_index_load(in, fn_in); +++ } ++ if (idx == 0) { // index is unavailable ++ fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ++ ret = 1; ++ goto view_end; ++ } ++ b = bam_init1(); ++- for (i = optind + 1; i < argc; ++i) { +++ +++ for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { ++ int result; ++ hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' ++ if (iter == NULL) { // region invalid or reference name not found ++- int beg, end; ++- if (hts_parse_reg(argv[i], &beg, &end)) ++- fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); ++- else ++- fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); +++ fprintf(stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. Continue anyway.\n", argv[i]); ++ continue; ++ } ++ // fetch alignments ++@@ -613,6 +740,17 @@ ++ } ++ } ++ +++ if (ga.write_index) { +++ if (sam_idx_save(out) < 0) { +++ print_error_errno("view", "writing index failed"); +++ ret = 1; +++ } +++ if (un_out && sam_idx_save(un_out) < 0) { +++ print_error_errno("view", "writing index failed"); +++ ret = 1; +++ } +++ } +++ ++ view_end: ++ if (is_count && ret == 0) { ++ if (fprintf(fn_out? fp_out : stdout, "%" PRId64 "\n", count) < 0) { ++@@ -630,7 +768,7 @@ ++ ++ free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); ++ sam_global_args_free(&ga); ++- if ( header ) bam_hdr_destroy(header); +++ if ( header ) sam_hdr_destroy(header); ++ if (settings.bed) bed_destroy(settings.bed); ++ if (settings.rghash) { ++ khint_t k; ++@@ -638,13 +776,28 @@ ++ if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); ++ kh_destroy(rg, settings.rghash); ++ } +++ if (settings.tvhash) { +++ khint_t k; +++ for (k = 0; k < kh_end(settings.tvhash); ++k) +++ if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); +++ kh_destroy(tv, settings.tvhash); +++ } ++ if (settings.remove_aux_len) { ++ free(settings.remove_aux); ++ } +++ if (settings.tag) { +++ free(settings.tag); +++ } ++ ++ if (p.pool) ++ hts_tpool_destroy(p.pool); ++ +++ if (fn_out_idx) +++ free(fn_out_idx); +++ if (fn_un_out_idx) +++ free(fn_un_out_idx); +++ free(arg_list); +++ ++ return ret; ++ } ++ ++@@ -667,10 +820,16 @@ ++ " -U FILE output reads not selected by filters to FILE [null]\n" ++ // extra input ++ " -t FILE FILE listing reference names and lengths (see long help) [null]\n" +++" -X include customized index file\n" ++ // read filters ++ " -L FILE only include reads overlapping this BED FILE [null]\n" ++ " -r STR only include reads in read group STR [null]\n" ++ " -R FILE only include reads 
with read group listed in FILE [null]\n" +++" -d STR:STR\n" +++" only include reads with tag STR and associated value STR [null]\n" +++" -D STR:FILE\n" +++" only include reads with tag STR and associated values listed in\n" +++" FILE [null]\n" ++ " -q INT only include reads with mapping quality >= INT [0]\n" ++ " -l STR only include reads in library STR [null]\n" ++ " -m INT only include reads with number of CIGAR operations consuming\n" ++@@ -687,9 +846,10 @@ ++ " -B collapse the backward CIGAR operation\n" ++ // general options ++ " -? print long help, including note about region specification\n" ++-" -S ignored (input format is auto-detected)\n"); +++" -S ignored (input format is auto-detected)\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(fp, "-.O.T@"); +++ sam_global_opt_help(fp, "-.O.T@.."); ++ fprintf(fp, "\n"); ++ ++ if (is_long_help) ++@@ -747,903 +907,3 @@ ++ free(argv2); ++ return ret; ++ } ++- ++-int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; ++-static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; ++- ++-static void bam2fq_usage(FILE *to, const char *command) ++-{ ++- int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; ++- fprintf(to, ++-"Usage: samtools %s [options...] 
\n", command); ++- fprintf(to, ++-"Options:\n" ++-" -0 FILE write reads designated READ_OTHER to FILE\n" ++-" -1 FILE write reads designated READ1 to FILE\n" ++-" -2 FILE write reads designated READ2 to FILE\n" ++-" note: if a singleton file is specified with -s, only\n" ++-" paired reads will be written to the -1 and -2 files.\n" ++-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x ++-" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 ++-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) ++-" -n don't append /1 and /2 to the read name\n" ++-" -N always append /1 and /2 to the read name\n"); ++- if (fq) fprintf(to, ++-" -O output quality in the OQ tag if present\n"); ++- fprintf(to, ++-" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" ++-" -t copy RG, BC and QT tags to the %s header line\n", ++- fq ? "FASTQ" : "FASTA"); ++- fprintf(to, ++-" -T TAGLIST copy arbitrary tags to the %s header line\n", ++- fq ? 
"FASTQ" : "FASTA"); ++- if (fq) fprintf(to, ++-" -v INT default quality score if not given in file [1]\n" ++-" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" ++-" -c compression level [0..9] to use when creating gz or bgzf fastq files\n" ++-" --i1 FILE write first index reads to FILE\n" ++-" --i2 FILE write second index reads to FILE\n" ++-" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" ++-" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" ++-" --index-format STR How to parse barcode and quality tags\n\n"); ++- sam_global_opt_help(to, "-.--.@"); ++- fprintf(to, ++-"\n" ++-"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" ++-"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" ++-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" ++-"or both unset.\n" ++-"Run 'samtools flags' for more information on flag codes and meanings.\n"); ++- fprintf(to, ++-"\n" ++-"The index-format string describes how to parse the barcode and quality tags, for example:\n" ++-" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" ++-" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" ++-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" ++-"'read until the separator or end of tag', for example:\n" ++-" n*i* ignore the left part of the tag until the separator, then use the second part\n" ++-" of the tag as index 1\n"); ++- fprintf(to, ++-"\n" ++-"Examples:\n" ++-" To get just the paired reads in separate files, use:\n" ++-" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n -F 0x900 in.bam\n" ++-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" ++-" samtools %s -F 0x900 in.bam > all_reads.%s\n", ++- command, fq ? "fq" : "fa", fq ? "fq" : "fa", ++- command, fq ? 
"fq" : "fa"); ++-} ++- ++-typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; ++-typedef enum { FASTA, FASTQ } fastfile; ++-typedef struct bam2fq_opts { ++- char *fnse; ++- char *fnr[3]; ++- char *fn_input; // pointer to input filename in argv do not free ++- bool has12, has12always, use_oq, copy_tags, illumina_tag; ++- int flag_on, flag_off, flag_alloff; ++- sam_global_args ga; ++- fastfile filetype; ++- int def_qual; ++- char *barcode_tag; ++- char *quality_tag; ++- char *index_file[2]; ++- char *index_format; ++- char *extra_tags; ++- char compression_level; ++-} bam2fq_opts_t; ++- ++-typedef struct bam2fq_state { ++- samFile *fp; ++- BGZF *fpse; ++- BGZF *fpr[3]; ++- BGZF *fpi[2]; ++- BGZF *hstdout; ++- bam_hdr_t *h; ++- bool has12, use_oq, copy_tags, illumina_tag; ++- int flag_on, flag_off, flag_alloff; ++- fastfile filetype; ++- int def_qual; ++- klist_t(ktaglist) *taglist; ++- char *index_sequence; ++- char compression_level; ++-} bam2fq_state_t; ++- ++-/* ++- * Get and decode the read from a BAM record. ++- * ++- * TODO: htslib really needs an interface for this. Consider this or perhaps ++- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str ++- * functions as string formatted equivalents to bam_get_{seq,qual}? ++- */ ++- ++-/* ++- * Reverse a string in place. ++- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
++- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik ++- */ ++-static char *reverse(char *str) ++-{ ++- int i = strlen(str)-1,j=0; ++- char ch; ++- while (i>j) { ++- ch = str[i]; ++- str[i]= str[j]; ++- str[j] = ch; ++- i--; ++- j++; ++- } ++- return str; ++-} ++- ++-/* return the read, reverse complemented if necessary */ ++-static char *get_read(const bam1_t *rec) ++-{ ++- int len = rec->core.l_qseq + 1; ++- char *read = calloc(1, len); ++- char *seq = (char *)bam_get_seq(rec); ++- int n; ++- ++- if (!read) return NULL; ++- ++- for (n=0; n < rec->core.l_qseq; n++) { ++- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; ++- else read[n] = seq_nt16_str[bam_seqi(seq,n)]; ++- } ++- if (rec->core.flag & BAM_FREVERSE) reverse(read); ++- return read; ++-} ++- ++-/* ++- * get and decode the quality from a BAM record ++- */ ++-static int get_quality(const bam1_t *rec, char **qual_out) ++-{ ++- char *quality = calloc(1, rec->core.l_qseq + 1); ++- char *q = (char *)bam_get_qual(rec); ++- int n; ++- ++- if (!quality) return -1; ++- ++- if (*q == '\xff') { ++- free(quality); ++- *qual_out = NULL; ++- return 0; ++- } ++- ++- for (n=0; n < rec->core.l_qseq; n++) { ++- quality[n] = q[n]+33; ++- } ++- if (rec->core.flag & BAM_FREVERSE) reverse(quality); ++- *qual_out = quality; ++- return 0; ++-} ++- ++-// ++-// End of htslib complaints ++-// ++- ++- ++-static readpart which_readpart(const bam1_t *b) ++-{ ++- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { ++- return READ_1; ++- } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { ++- return READ_2; ++- } else { ++- return READ_UNKNOWN; ++- } ++-} ++- ++-/* ++- * parse the length part from the index-format string ++- */ ++-static int getLength(char **s) ++-{ ++- int n = 0; ++- while (**s) { ++- if (**s == '*') { n=-1; (*s)++; break; } ++- if ( !isdigit(**s)) break; ++- n = n*10 + ((**s)-'0'); ++- (*s)++; ++- } ++- return n; 
++-} ++- ++-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) ++-{ ++- uint8_t *s = bam_aux_get(rec, tag); ++- if (s) { ++- char aux_type = *s; ++- switch (aux_type) { ++- case 'C': ++- case 'S': aux_type = 'I'; break; ++- case 'c': ++- case 's': aux_type = 'i'; break; ++- case 'd': aux_type = 'f'; break; ++- } ++- ++- // Ensure space. Need 6 chars + length of tag. Max length of ++- // i is 16, A is 21, B currently 26, Z is unknown, so ++- // have to check that one later. ++- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; ++- ++- kputc('\t', linebuf); ++- kputsn(tag, 2, linebuf); ++- kputc(':', linebuf); ++- kputc(aux_type=='I'? 'i': aux_type, linebuf); ++- kputc(':', linebuf); ++- switch (aux_type) { ++- case 'H': ++- case 'Z': ++- if (kputs(bam_aux2Z(s), linebuf) < 0) return false; ++- break; ++- case 'i': kputw(bam_aux2i(s), linebuf); break; ++- case 'I': kputuw(bam_aux2i(s), linebuf); break; ++- case 'A': kputc(bam_aux2A(s), linebuf); break; ++- case 'f': kputd(bam_aux2f(s), linebuf); break; ++- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; ++- default: kputs("*** Unknown aux type ***", linebuf); return false; ++- } ++- } ++- return true; ++-} ++- ++-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) ++-{ ++- if (!index_sequence) return 0; ++- ++- kstring_t new = {0,0,NULL}; ++- if (linebuf->s) { ++- char *s = strchr(linebuf->s, '\n'); ++- if (s) { ++- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) ++- return -1; ++- *s = 0; ++- kputs(linebuf->s, &new); ++- kputc(' ', &new); ++- readpart readpart = which_readpart(rec); ++- if (readpart == READ_1) kputc('1', &new); ++- else if (readpart == READ_2) kputc('2', &new); ++- else kputc('0', &new); ++- ++- kputc(':', &new); ++- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); ++- else kputc('N', &new); ++- ++- kputs(":0:", &new); ++- kputs(index_sequence, &new); ++- kputc('\n', 
&new); ++- kputs(s+1, &new); ++- free(ks_release(linebuf)); ++- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; ++- } ++- } ++- return 0; ++-} ++- ++-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) ++-{ ++- int i; ++- ++- linebuf->l = 0; ++- // Write read name ++- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; ++- if (kputs(bam_get_qname(rec), linebuf) < 0) return false; ++- // Add the /1 /2 if requested ++- if (state->has12) { ++- readpart readpart = which_readpart(rec); ++- if (readpart == READ_1) { ++- if (kputs("/1", linebuf) < 0) return false; ++- } else if (readpart == READ_2) { ++- if (kputs("/2", linebuf) < 0) return false; ++- } ++- } ++- if (state->copy_tags) { ++- for (i = 0; copied_tags[i]; ++i) { ++- if (!copy_tag(copied_tags[i], rec, linebuf)) { ++- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); ++- return false; ++- } ++- } ++- } ++- ++- if (state->taglist->size) { ++- kliter_t(ktaglist) *p; ++- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { ++- if (!copy_tag(kl_val(p), rec, linebuf)) { ++- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); ++- return false; ++- } ++- } ++- } ++- ++- if (kputc('\n', linebuf) < 0) return false; ++- if (kputs(seq, linebuf) < 0) return false; ++- if (kputc('\n', linebuf) < 0) return false; ++- ++- if (state->filetype == FASTQ) { ++- // Write quality ++- if (kputs("+\n", linebuf) < 0) return false; ++- if (qual && *qual) { ++- if (kputs(qual, linebuf) < 0) return false; ++- } else { ++- int len = strlen(seq); ++- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; ++- for (i = 0; i < len; ++i) { ++- kputc(33 + state->def_qual, linebuf); ++- } ++- } ++- if (kputc('\n', linebuf) < 0) return false; ++- } ++- return true; ++-} ++- ++-/* ++- * Create FASTQ lines from the barcode tag using the index-format ++- */ ++-static bool 
tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) ++-{ ++- uint8_t *p; ++- char *ifmt = opts->index_format; ++- char *tag = NULL; ++- char *qual = NULL; ++- char *sub_tag = NULL; ++- char *sub_qual = NULL; ++- size_t tag_len; ++- int file_number = 0; ++- kstring_t linebuf = { 0, 0, NULL }; // Buffer ++- ++- ++- // read barcode tag ++- p = bam_aux_get(rec,opts->barcode_tag); ++- if (p) tag = bam_aux2Z(p); ++- ++- if (!tag) return true; // there is no tag ++- ++- tag_len = strlen(tag); ++- sub_tag = calloc(1, tag_len + 1); ++- if (!sub_tag) goto fail; ++- sub_qual = calloc(1, tag_len + 1); ++- if (!sub_qual) goto fail; ++- ++- // read quality tag ++- p = bam_aux_get(rec, opts->quality_tag); ++- if (p) qual = bam_aux2Z(p); ++- ++- // Parse the index-format string ++- while (*ifmt) { ++- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly ++- char action = *ifmt; // should be 'i' or 'n' ++- ifmt++; // skip over action ++- int index_len = getLength(&ifmt); ++- int n = 0; ++- ++- if (index_len < 0) { ++- // read until separator ++- while (isalpha(*tag)) { ++- sub_tag[n] = *tag++; ++- if (qual) sub_qual[n] = *qual++; ++- n++; ++- } ++- if (*tag) { // skip separator ++- tag++; ++- if (qual) qual++; ++- } ++- } else { ++- // read index_len characters ++- while (index_len-- && *tag) { ++- sub_tag[n] = *tag++; ++- if (qual) sub_qual[n] = *qual++; ++- n++; ++- } ++- } ++- sub_tag[n] = '\0'; ++- sub_qual[n] = '\0'; ++- ++- if (action=='i' && *sub_tag && state->fpi[file_number]) { ++- //if (file_number==0) state->index_sequence = strdup(sub_tag); // we're going to need this later... ++- state->index_sequence = strdup(sub_tag); // we're going to need this later... 
++- if (!state->index_sequence) goto fail; ++- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; ++- if (state->illumina_tag) { ++- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) { ++- goto fail; ++- } ++- } ++- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) ++- goto fail; ++- } ++- ++- } ++- ++- free(sub_qual); free(sub_tag); ++- free(linebuf.s); ++- return true; ++- ++- fail: ++- perror(__func__); ++- free(sub_qual); free(sub_tag); ++- free(linebuf.s); ++- return true; ++-} ++- ++-// Transform a bam1_t record into a string with the FASTQ representation of it ++-// @returns false for error, true for success ++-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) ++-{ ++- int32_t qlen = b->core.l_qseq; ++- assert(qlen >= 0); ++- const uint8_t *oq = NULL; ++- char *qual = NULL; ++- ++- char *seq = get_read(b); ++- if (!seq) return false; ++- ++- if (state->use_oq) oq = bam_aux_get(b, "OQ"); ++- if (oq && *oq=='Z') { ++- qual = strdup(bam_aux2Z(oq)); ++- if (!qual) goto fail; ++- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented ++- reverse(qual); ++- } ++- } else { ++- if (get_quality(b, &qual) < 0) goto fail; ++- } ++- ++- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; ++- ++- free(qual); ++- free(seq); ++- return true; ++- ++- fail: ++- free(seq); ++- free(qual); ++- return false; ++-} ++- ++-static void free_opts(bam2fq_opts_t *opts) ++-{ ++- free(opts->barcode_tag); ++- free(opts->quality_tag); ++- free(opts->index_format); ++- free(opts->extra_tags); ++- free(opts); ++-} ++- ++-// return true if valid ++-static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) ++-{ ++- // Parse args ++- bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); ++- opts->has12 = true; ++- opts->has12always = false; ++- opts->filetype = FASTQ; ++- opts->def_qual = 1; ++- opts->barcode_tag = NULL; ++- opts->quality_tag = 
NULL; ++- opts->index_format = NULL; ++- opts->index_file[0] = NULL; ++- opts->index_file[1] = NULL; ++- opts->extra_tags = NULL; ++- opts->compression_level = 1; ++- ++- int c; ++- sam_global_args_init(&opts->ga); ++- static const struct option lopts[] = { ++- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), ++- {"i1", required_argument, NULL, 1}, ++- {"I1", required_argument, NULL, 1}, ++- {"i2", required_argument, NULL, 2}, ++- {"I2", required_argument, NULL, 2}, ++- {"if", required_argument, NULL, 3}, ++- {"IF", required_argument, NULL, 3}, ++- {"index-format", required_argument, NULL, 3}, ++- {"barcode-tag", required_argument, NULL, 'b'}, ++- {"quality-tag", required_argument, NULL, 'q'}, ++- { NULL, 0, NULL, 0 } ++- }; ++- while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { ++- switch (c) { ++- case 'b': opts->barcode_tag = strdup(optarg); break; ++- case 'q': opts->quality_tag = strdup(optarg); break; ++- case 1 : opts->index_file[0] = optarg; break; ++- case 2 : opts->index_file[1] = optarg; break; ++- case 3 : opts->index_format = strdup(optarg); break; ++- case '0': opts->fnr[0] = optarg; break; ++- case '1': opts->fnr[1] = optarg; break; ++- case '2': opts->fnr[2] = optarg; break; ++- case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; ++- case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; ++- case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; ++- case 'n': opts->has12 = false; break; ++- case 'N': opts->has12always = true; break; ++- case 'O': opts->use_oq = true; break; ++- case 's': opts->fnse = optarg; break; ++- case 't': opts->copy_tags = true; break; ++- case 'i': opts->illumina_tag = true; break; ++- case 'c': opts->compression_level = atoi(optarg); break; ++- case 'T': opts->extra_tags = strdup(optarg); break; ++- case 'v': opts->def_qual = atoi(optarg); break; ++- case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; ++- default: ++- if (parse_sam_global_opt(c, optarg, 
lopts, &opts->ga) != 0) { ++- bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; ++- } ++- break; ++- } ++- } ++- ++- if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; ++- if (opts->has12always) opts->has12 = true; ++- ++- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); ++- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); ++- ++- int nIndex = 0; ++- if (opts->index_format) { ++- char *s; ++- for (s = opts->index_format; *s; s++) { ++- if (*s == 'i') nIndex++; ++- } ++- } ++- if (nIndex>2) { ++- fprintf(stderr,"Invalid index format: more than 2 indexes\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (opts->index_file[1] && !opts->index_file[0]) { ++- fprintf(stderr, "Index one specified, but index two not given\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==2 && !opts->index_file[1]) { ++- fprintf(stderr, "index_format specifies two indexes, but only one index file given\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==1 && !opts->index_file[0]) { ++- fprintf(stderr, "index_format specifies an index, but no index file given\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==0 && opts->index_file[0]) { ++- fprintf(stderr, "index_format not specified, but index file given\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (opts->def_qual < 0 || 93 < opts->def_qual) { ++- fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- const char* type_str = argv[0]; ++- if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { ++- opts->filetype = FASTQ; ++- } else if (strcasecmp("fasta", type_str) == 0) { ++- 
opts->filetype = FASTA; ++- } else { ++- print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if ((argc - (optind)) == 0) { ++- fprintf(stderr, "No input file specified.\n"); ++- bam2fq_usage(stdout, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if ((argc - (optind)) != 1) { ++- fprintf(stderr, "Too many arguments.\n"); ++- bam2fq_usage(stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- opts->fn_input = argv[optind]; ++- *opts_out = opts; ++- return true; ++-} ++- ++-static BGZF *open_fqfile(char *filename, int c) ++-{ ++- char mode[4] = "w"; ++- size_t len = strlen(filename); ++- ++- mode[2] = 0; mode[3] = 0; ++- if (len > 3 && strstr(filename + (len - 3),".gz")) { ++- mode[1] = 'g'; mode[2] = c+'0'; ++- } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) ++- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { ++- mode[1] = c+'0'; ++- } else { ++- mode[1] = 'u'; ++- } ++- ++- return bgzf_open(filename,mode); ++-} ++- ++-static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) ++-{ ++- bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); ++- state->flag_on = opts->flag_on; ++- state->flag_off = opts->flag_off; ++- state->flag_alloff = opts->flag_alloff; ++- state->has12 = opts->has12; ++- state->use_oq = opts->use_oq; ++- state->illumina_tag = opts->illumina_tag; ++- state->copy_tags = opts->copy_tags; ++- state->filetype = opts->filetype; ++- state->def_qual = opts->def_qual; ++- state->index_sequence = NULL; ++- state->hstdout = NULL; ++- state->compression_level = opts->compression_level; ++- ++- state->taglist = kl_init(ktaglist); ++- if (opts->extra_tags) { ++- char *save_p; ++- char *s = strtok_r(opts->extra_tags, ",", &save_p); ++- while (s) { ++- if (strlen(s) != 2) { ++- fprintf(stderr, "Parsing extra tags - '%s' is not two 
characters\n", s); ++- free(state); ++- return false; ++- } ++- char **et = kl_pushp(ktaglist, state->taglist); ++- *et = s; ++- s = strtok_r(NULL, ",", &save_p); ++- } ++- } ++- ++- state->fp = sam_open(opts->fn_input, "r"); ++- if (state->fp == NULL) { ++- print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); ++- free(state); ++- return false; ++- } ++- if (opts->ga.nthreads > 0) ++- hts_set_threads(state->fp, opts->ga.nthreads); ++- uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; ++- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; ++- if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { ++- fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); ++- free(state); ++- return false; ++- } ++- if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { ++- fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); ++- free(state); ++- return false; ++- } ++- if (opts->fnse) { ++- state->fpse = open_fqfile(opts->fnse, state->compression_level); ++- if (state->fpse == NULL) { ++- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); ++- free(state); ++- return false; ++- } ++- } ++- ++- if (opts->ga.reference) { ++- if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { ++- print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); ++- free(state); ++- return false; ++- } ++- } ++- ++- int i; ++- for (i = 0; i < 3; ++i) { ++- if (opts->fnr[i]) { ++- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level); ++- if (state->fpr[i] == NULL) { ++- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]); ++- free(state); ++- return false; ++- } ++- } else { ++- if (!state->hstdout) { ++- state->hstdout = bgzf_dopen(fileno(stdout), "wu"); ++- if (!state->hstdout) { ++- print_error_errno("bam2fq", "Cannot open STDOUT"); ++- free(state); ++- return false; ++- } ++- } ++- state->fpr[i] = state->hstdout; ++- } ++- } ++- 
for (i = 0; i < 2; i++) { ++- state->fpi[i] = NULL; ++- if (opts->index_file[i]) { ++- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level); ++- if (state->fpi[i] == NULL) { ++- print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]); ++- free(state); ++- return false; ++- } ++- } ++- } ++- ++- state->h = sam_hdr_read(state->fp); ++- if (state->h == NULL) { ++- fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input); ++- free(state); ++- return false; ++- } ++- ++- *state_out = state; ++- return true; ++-} ++- ++-static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) ++-{ ++- bool valid = true; ++- bam_hdr_destroy(state->h); ++- check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); ++- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } ++- int i; ++- for (i = 0; i < 3; ++i) { ++- if (state->fpr[i] != state->hstdout) { ++- if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } ++- } ++- } ++- if (state->hstdout) { ++- if (bgzf_close(state->hstdout)) { ++- print_error_errno("bam2fq", "Error closing STDOUT"); ++- valid = false; ++- } ++- } ++- for (i = 0; i < 2; i++) { ++- if (state->fpi[i] && bgzf_close(state->fpi[i])) { ++- print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); ++- valid = false; ++- } ++- } ++- kl_destroy(ktaglist,state->taglist); ++- free(state->index_sequence); ++- free(state); ++- return valid; ++-} ++- ++-static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) ++-{ ++- return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments ++- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags ++- || (b->core.flag&(state->flag_off)) != 0 
++- || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); ++- ++-} ++- ++-static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) ++-{ ++- int n; ++- bam1_t *records[3]; ++- bam1_t* b = bam_init1(); ++- char *current_qname = NULL; ++- int64_t n_reads = 0, n_singletons = 0; // Statistics ++- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; ++- int score[3]; ++- int at_eof; ++- if (b == NULL ) { ++- perror("[bam2fq_mainloop] Malloc error for bam record buffer."); ++- return false; ++- } ++- ++- bool valid = true; ++- while (true) { ++- int res = sam_read1(state->fp, state->h, b); ++- if (res < -1) { ++- fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); ++- return false; ++- } ++- at_eof = res < 0; ++- ++- if (!at_eof && filter_it_out(b, state)) continue; ++- if (!at_eof) ++n_reads; ++- ++- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { ++- if (current_qname) { ++- if (state->illumina_tag) { ++- for (n=0; valid && n<3; n++) { ++- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; ++- } ++- if (!valid) break; ++- } ++- free(state->index_sequence); state->index_sequence = NULL; ++- if (score[1] > 0 && score[2] > 0) { ++- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] ++- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } ++- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } else if (score[1] > 0 || score[2] > 0) { ++- if (state->fpse) { ++- // print whichever one exists to fpse ++- if (score[1] > 0) { ++- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } ++- } else { ++- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } ++- ++n_singletons; ++- } else { ++- if (score[1] > 0) { ++- if (bgzf_write(state->fpr[1], linebuf[1].s, 
linebuf[1].l) < 0) { valid = false; break; } ++- } else { ++- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } ++- } ++- } ++- if (score[0]) { // TODO: check this ++- // print linebuf[0] to fpr[0] ++- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } ++- } ++- } ++- ++- if (at_eof) break; ++- ++- free(current_qname); ++- current_qname = strdup(bam_get_qname(b)); ++- if (!current_qname) { valid = false; break; } ++- score[0] = score[1] = score[2] = 0; ++- } ++- ++- // Prefer a copy of the read that has base qualities ++- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; ++- if (b_score > score[which_readpart(b)]) { ++- if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false; ++- records[which_readpart(b)] = b; ++- if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { ++- fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); ++- return false; ++- } ++- score[which_readpart(b)] = b_score; ++- } ++- } ++- if (!valid) ++- { ++- perror("[bam2fq_mainloop] Error writing to FASTx files."); ++- } ++- bam_destroy1(b); ++- free(current_qname); ++- free(linebuf[0].s); ++- free(linebuf[1].s); ++- free(linebuf[2].s); ++- fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); ++- fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); ++- ++- return valid; ++-} ++- ++-int main_bam2fq(int argc, char *argv[]) ++-{ ++- int status = EXIT_SUCCESS; ++- bam2fq_opts_t* opts = NULL; ++- bam2fq_state_t* state = NULL; ++- ++- bool valid = parse_opts(argc, argv, &opts); ++- if (!valid || opts == NULL) return valid ? 
EXIT_SUCCESS : EXIT_FAILURE; ++- ++- if (!init_state(opts, &state)) return EXIT_FAILURE; ++- ++- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; ++- ++- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; ++- sam_global_args_free(&opts->ga); ++- free_opts(opts); ++- ++- return status; ++-} ++--- python-pysam.orig/samtools/sam_view.c.pysam.c +++++ python-pysam/samtools/sam_view.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* sam_view.c -- SAM<->BAM<->CRAM conversion. ++ ++- Copyright (C) 2009-2017 Genome Research Ltd. +++ Copyright (C) 2009-2019 Genome Research Ltd. ++ Portions copyright (C) 2009, 2011, 2012 Broad Institute. ++ ++ Author: Heng Li ++@@ -34,33 +34,25 @@ ++ #include ++ #include ++ #include ++-#include ++-#include ++ #include ++-#include ++ #include "htslib/sam.h" ++ #include "htslib/faidx.h" ++-#include "htslib/kstring.h" ++ #include "htslib/khash.h" ++-#include "htslib/klist.h" ++ #include "htslib/thread_pool.h" ++-#include "htslib/bgzf.h" ++ #include "samtools.h" ++ #include "sam_opts.h" ++ #include "bedidx.h" ++ ++-#define DEFAULT_BARCODE_TAG "BC" ++-#define DEFAULT_QUALITY_TAG "QT" ++- ++ KHASH_SET_INIT_STR(rg) ++-#define taglist_free(p) ++-KLIST_INIT(ktaglist, char*, taglist_free) +++KHASH_SET_INIT_STR(tv) ++ ++ typedef khash_t(rg) *rghash_t; +++typedef khash_t(tv) *tvhash_t; ++ ++ // This structure contains the settings for a samview run ++ typedef struct samview_settings { ++ rghash_t rghash; +++ tvhash_t tvhash; ++ int min_mapQ; ++ int flag_on; ++ int flag_off; ++@@ -74,16 +66,17 @@ ++ size_t remove_aux_len; ++ char** remove_aux; ++ int multi_region; +++ char* tag; ++ } samview_settings_t; ++ ++ ++ // TODO Add declarations of these to a viable htslib or samtools header ++-extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); +++extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); ++ extern int bam_remove_B(bam1_t *b); ++ extern char *samfaipath(const char *fn_ref); ++ ++ // Returns 0 to indicate 
read should be output 1 otherwise ++-static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) +++static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) ++ { ++ if (settings->remove_B) bam_remove_B(b); ++ if (settings->min_qlen > 0) { ++@@ -98,7 +91,7 @@ ++ return 1; ++ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) ++ return 1; ++- if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) +++ if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) ++ return 1; ++ if (settings->subsam_frac > 0.) { ++ uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); ++@@ -111,8 +104,17 @@ ++ if (k == kh_end(settings->rghash)) return 1; ++ } ++ } +++ if (settings->tvhash && settings->tag) { +++ uint8_t *s = bam_aux_get(b, settings->tag); +++ if (s) { +++ khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); +++ if (k == kh_end(settings->tvhash)) return 1; +++ } else { +++ return 1; +++ } +++ } ++ if (settings->library) { ++- const char *p = bam_get_library((bam_hdr_t*)h, b); +++ const char *p = bam_get_library((sam_hdr_t*)h, b); ++ if (!p || strcmp(p, settings->library) != 0) return 1; ++ } ++ if (settings->remove_aux_len) { ++@@ -127,37 +129,6 @@ ++ return 0; ++ } ++ ++-static char *drop_rg(char *hdtxt, rghash_t h, int *len) ++-{ ++- char *p = hdtxt, *q, *r, *s; ++- kstring_t str; ++- memset(&str, 0, sizeof(kstring_t)); ++- while (1) { ++- int toprint = 0; ++- q = strchr(p, '\n'); ++- if (q == 0) q = p + strlen(p); ++- if (q - p < 3) break; // the line is too short; then stop ++- if (strncmp(p, "@RG\t", 4) == 0) { ++- int c; ++- khint_t k; ++- if ((r = strstr(p, "\tID:")) != 0) { ++- r += 4; ++- for (s = r; *s != '\0' && *s != '\n' && 
*s != '\t'; ++s); ++- c = *s; *s = '\0'; ++- k = kh_get(rg, h, r); ++- *s = c; ++- if (k != kh_end(h)) toprint = 1; ++- } ++- } else toprint = 1; ++- if (toprint) { ++- kputsn(p, q - p, &str); kputc('\n', &str); ++- } ++- p = q + 1; ++- } ++- *len = str.l; ++- return str.s; ++-} ++- ++ static int usage(FILE *fp, int exit_status, int is_long_help); ++ ++ static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) ++@@ -219,39 +190,87 @@ ++ return (ret != -1) ? 0 : -1; ++ } ++ ++-static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) +++static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) ++ { ++- int r = sam_write1(fp, h, b); ++- if (r >= 0) return r; +++ char *d = strdup(name); +++ int ret = 0; ++ ++- if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); ++- else print_error_errno("view", "writing to standard output failed"); +++ if (d == NULL) goto err; ++ ++- *retp = EXIT_FAILURE; ++- return r; +++ if (settings->tvhash == NULL) { +++ settings->tvhash = kh_init(tv); +++ if (settings->tvhash == NULL) goto err; +++ } +++ +++ kh_put(tv, settings->tvhash, d, &ret); +++ if (ret == -1) goto err; +++ if (ret == 0) free(d); /* Duplicate */ +++ return 0; +++ +++ err: +++ print_error(subcmd, "Couldn't add \"%s\" to tag values list: memory exhausted?", name); +++ free(d); +++ return -1; +++} +++ +++static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) +++{ +++ FILE *fp; +++ char buf[1024]; +++ int ret = 0; +++ if (settings->tvhash == NULL) { +++ settings->tvhash = kh_init(tv); +++ if (settings->tvhash == NULL) { +++ perror(NULL); +++ return -1; +++ } +++ } +++ +++ fp = fopen(fn, "r"); +++ if (fp == NULL) { +++ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); +++ return -1; +++ } +++ +++ while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { +++ char *d = 
strdup(buf); +++ if (d != NULL) { +++ kh_put(tv, settings->tvhash, d, &ret); +++ if (ret == 0) free(d); /* Duplicate */ +++ } else { +++ ret = -1; +++ } +++ } +++ if (ferror(fp)) ret = -1; +++ if (ret == -1) { +++ print_error_errno(subcmd, "failed to read \"%s\"", fn); +++ } +++ fclose(fp); +++ return (ret != -1) ? 0 : -1; ++ } ++ ++-static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) +++static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) ++ { ++- int r = sam_close(fp); ++- if (r >= 0) return; +++ int r = sam_write1(fp, h, b); +++ if (r >= 0) return r; ++ ++- // TODO Need error infrastructure so we can print a message instead of r ++- if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); ++- else print_error(subcmd, "error closing %s: %d", null_fname, r); +++ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); +++ else print_error_errno("view", "writing to standard output failed"); ++ ++ *retp = EXIT_FAILURE; +++ return r; ++ } ++ ++ int main_samview(int argc, char *argv[]) ++ { ++- int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; +++ int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; ++ int64_t count = 0; ++ samFile *in = 0, *out = 0, *un_out=0; ++ FILE *fp_out = NULL; ++- bam_hdr_t *header = NULL; +++ sam_hdr_t *header = NULL; ++ char out_mode[5], out_un_mode[5], *out_format = ""; ++- char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; +++ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; +++ char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ htsThreadPool p = {NULL, 0}; ++ int filter_state = ALL, filter_op = 0; ++@@ -259,6 +278,7 @@ ++ ++ samview_settings_t settings = { ++ .rghash = NULL, +++ 
.tvhash = NULL, ++ .min_mapQ = 0, ++ .flag_on = 0, ++ .flag_off = 0, ++@@ -269,11 +289,13 @@ ++ .subsam_frac = -1., ++ .library = NULL, ++ .bed = NULL, ++- .multi_region = 0 +++ .multi_region = 0, +++ .tag = NULL ++ }; ++ ++ static const struct option lopts[] = { ++ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), +++ {"no-PG", no_argument, NULL, 1}, ++ { NULL, 0, NULL, 0 } ++ }; ++ ++@@ -290,7 +312,7 @@ ++ opterr = 0; ++ ++ while ((c = getopt_long(argc, argv, ++- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:L:s:@:m:x:U:M", +++ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", ++ lopts, NULL)) >= 0) { ++ switch (c) { ++ case 's': ++@@ -300,7 +322,6 @@ ++ srand(settings.subsam_seed); ++ settings.subsam_seed = rand(); ++ } ++- ++ if (q && *q == '.') { ++ settings.subsam_frac = strtod(q, &q); ++ if (*q) ret = 1; ++@@ -323,6 +344,7 @@ ++ case 'H': is_header_only = 1; break; ++ case 'o': fn_out = strdup(optarg); break; ++ case 'U': fn_un_out = strdup(optarg); break; +++ case 'X': has_index_file = 1; break; ++ case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; ++ case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; ++ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; ++@@ -349,6 +371,63 @@ ++ goto view_end; ++ } ++ break; +++ case 'd': +++ if (strlen(optarg) < 4 || optarg[2] != ':') { +++ print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ +++ if (settings.tag) { +++ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { +++ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); +++ ret = 1; +++ goto view_end; +++ } +++ } else { +++ if (!(settings.tag = calloc(3, 1))) { +++ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ memcpy(settings.tag, optarg, 2); +++ } +++ +++ if (add_tag_value_single("view", &settings, optarg+3) != 0) { +++ ret = 1; +++ goto view_end; +++ } +++ 
break; +++ case 'D': +++ // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX +++ // path translation as described at: +++ // http://www.mingw.org/wiki/Posix_path_conversion +++ if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { +++ print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ +++ if (settings.tag) { +++ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { +++ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); +++ ret = 1; +++ goto view_end; +++ } +++ } else { +++ if (!(settings.tag = calloc(3, 1))) { +++ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); +++ ret = 1; +++ goto view_end; +++ } +++ memcpy(settings.tag, optarg, 2); +++ } +++ +++ if (add_tag_values_file("view", &settings, optarg+3) != 0) { +++ ret = 1; +++ goto view_end; +++ } +++ break; ++ /* REMOVED as htslib doesn't support this ++ //case 'x': out_format = "x"; break; ++ //case 'X': out_format = "X"; break; ++@@ -382,6 +461,7 @@ ++ } ++ break; ++ case 'M': settings.multi_region = 1; break; +++ case 1: no_pg = 1; break; ++ default: ++ if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) ++ return usage(samtools_stderr, EXIT_FAILURE, 0); ++@@ -431,13 +511,8 @@ ++ ret = 1; ++ goto view_end; ++ } ++- if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... ++- char *tmp; ++- int l; ++- tmp = drop_rg(header->text, settings.rghash, &l); ++- free(header->text); ++- header->text = tmp; ++- header->l_text = l; +++ if (settings.rghash) { +++ sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); ++ } ++ if (!is_count) { ++ if ((out = sam_open_format(fn_out? 
fn_out : "-", out_mode, &ga.out)) == 0) { ++@@ -452,7 +527,25 @@ ++ goto view_end; ++ } ++ } ++- if (*out_format || is_header || +++ +++ if (!no_pg) { +++ if (!(arg_list = stringify_argv(argc+1, argv-1))) { +++ print_error("view", "failed to create arg_list"); +++ ret = 1; +++ goto view_end; +++ } +++ if (sam_hdr_add_pg(header, "samtools", +++ "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)) { +++ print_error("view", "failed to add PG line to the header"); +++ ret = 1; +++ goto view_end; +++ } +++ } +++ +++ if (*out_format || ga.write_index || is_header || ++ out_mode[1] == 'b' || out_mode[1] == 'c' || ++ (ga.out.format != sam && ga.out.format != unknown_format)) { ++ if (sam_hdr_write(out, header) != 0) { ++@@ -461,6 +554,13 @@ ++ goto view_end; ++ } ++ } +++ if (ga.write_index) { +++ if (!(fn_out_idx = auto_index(out, fn_out, header))) { +++ ret = 1; +++ goto view_end; +++ } +++ } +++ ++ if (fn_un_out) { ++ if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { ++ print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ++@@ -483,6 +583,12 @@ ++ goto view_end; ++ } ++ } +++ if (ga.write_index) { +++ if (!(fn_un_out_idx = auto_index(un_out, fn_un_out, header))) { +++ ret = 1; +++ goto view_end; +++ } +++ } ++ } ++ } ++ else { ++@@ -507,11 +613,23 @@ ++ } ++ if (is_header_only) goto view_end; // no need to print alignments ++ +++ if (has_index_file) { +++ fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; +++ if (fn_idx_in == 0) { +++ fprintf(samtools_stderr, "[main_samview] incorrect number of arguments for -X option. 
Aborting.\n"); +++ return 1; +++ } +++ } +++ ++ if (settings.multi_region) { ++- if (optind < argc - 1) { //regions have been specified in the command line +++ if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line ++ settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file ++ if (!filter_op) ++ filter_state = FILTERED; +++ } else if (has_index_file && optind < argc - 2) { +++ settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file +++ if (!filter_op) +++ filter_state = FILTERED; ++ } else { ++ bed_unify(settings.bed); ++ } ++@@ -520,7 +638,13 @@ ++ if (settings.bed == NULL) { // index is unavailable or no regions have been specified ++ fprintf(samtools_stderr, "[main_samview] no regions or BED file have been provided. 
Aborting.\n"); ++ } else { ++- hts_idx_t *idx = sam_index_load(in, fn_in); // load index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx_in != 0) { +++ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index +++ } else { +++ idx = sam_index_load(in, fn_in); +++ } ++ if (idx != NULL) { ++ ++ int regcount = 0; ++@@ -557,7 +681,7 @@ ++ } ++ bam_destroy1(b); ++ } else { ++- if (optind + 1 >= argc) { // convert/print the entire file +++ if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file ++ bam1_t *b = bam_init1(); ++ int r; ++ while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' ++@@ -576,22 +700,25 @@ ++ } else { // retrieve alignments in specified regions ++ int i; ++ bam1_t *b; ++- hts_idx_t *idx = sam_index_load(in, fn_in); // load index +++ hts_idx_t *idx = NULL; +++ // If index filename has not been specfied, look in BAM folder +++ if (fn_idx_in != NULL) { +++ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index +++ } else { +++ idx = sam_index_load(in, fn_in); +++ } ++ if (idx == 0) { // index is unavailable ++ fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ++ ret = 1; ++ goto view_end; ++ } ++ b = bam_init1(); ++- for (i = optind + 1; i < argc; ++i) { +++ +++ for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { ++ int result; ++ hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' ++ if (iter == NULL) { // region invalid or reference name not found ++- int beg, end; ++- if (hts_parse_reg(argv[i], &beg, &end)) ++- fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); ++- else ++- fprintf(samtools_stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); +++ fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. Continue anyway.\n", argv[i]); ++ continue; ++ } ++ // fetch alignments ++@@ -615,6 +742,17 @@ ++ } ++ } ++ +++ if (ga.write_index) { +++ if (sam_idx_save(out) < 0) { +++ print_error_errno("view", "writing index failed"); +++ ret = 1; +++ } +++ if (un_out && sam_idx_save(un_out) < 0) { +++ print_error_errno("view", "writing index failed"); +++ ret = 1; +++ } +++ } +++ ++ view_end: ++ if (is_count && ret == 0) { ++ if (fprintf(fn_out? fp_out : samtools_stdout, "%" PRId64 "\n", count) < 0) { ++@@ -632,7 +770,7 @@ ++ ++ free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); ++ sam_global_args_free(&ga); ++- if ( header ) bam_hdr_destroy(header); +++ if ( header ) sam_hdr_destroy(header); ++ if (settings.bed) bed_destroy(settings.bed); ++ if (settings.rghash) { ++ khint_t k; ++@@ -640,13 +778,28 @@ ++ if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); ++ kh_destroy(rg, settings.rghash); ++ } +++ if (settings.tvhash) { +++ khint_t k; +++ for (k = 0; k < kh_end(settings.tvhash); ++k) +++ if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); +++ kh_destroy(tv, settings.tvhash); +++ } ++ if (settings.remove_aux_len) { ++ free(settings.remove_aux); ++ } +++ if (settings.tag) { +++ free(settings.tag); +++ } ++ ++ if (p.pool) ++ hts_tpool_destroy(p.pool); ++ +++ if (fn_out_idx) +++ free(fn_out_idx); +++ if (fn_un_out_idx) +++ free(fn_un_out_idx); +++ free(arg_list); +++ ++ return ret; ++ } ++ ++@@ -669,10 +822,16 @@ ++ " -U FILE output reads not selected by filters to FILE [null]\n" ++ // extra input ++ " -t FILE FILE listing reference names and lengths (see long help) [null]\n" +++" -X include customized index file\n" ++ // read filters ++ " -L FILE only include reads overlapping this BED FILE [null]\n" ++ " -r STR only include reads in read group STR [null]\n" ++ " -R FILE only 
include reads with read group listed in FILE [null]\n" +++" -d STR:STR\n" +++" only include reads with tag STR and associated value STR [null]\n" +++" -D STR:FILE\n" +++" only include reads with tag STR and associated values listed in\n" +++" FILE [null]\n" ++ " -q INT only include reads with mapping quality >= INT [0]\n" ++ " -l STR only include reads in library STR [null]\n" ++ " -m INT only include reads with number of CIGAR operations consuming\n" ++@@ -689,9 +848,10 @@ ++ " -B collapse the backward CIGAR operation\n" ++ // general options ++ " -? print long help, including note about region specification\n" ++-" -S ignored (input format is auto-detected)\n"); +++" -S ignored (input format is auto-detected)\n" +++" --no-PG do not add a PG line\n"); ++ ++- sam_global_opt_help(fp, "-.O.T@"); +++ sam_global_opt_help(fp, "-.O.T@.."); ++ fprintf(fp, "\n"); ++ ++ if (is_long_help) ++@@ -749,903 +909,3 @@ ++ free(argv2); ++ return ret; ++ } ++- ++-int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; ++-static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; ++- ++-static void bam2fq_usage(FILE *to, const char *command) ++-{ ++- int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; ++- fprintf(to, ++-"Usage: samtools %s [options...] 
\n", command); ++- fprintf(to, ++-"Options:\n" ++-" -0 FILE write reads designated READ_OTHER to FILE\n" ++-" -1 FILE write reads designated READ1 to FILE\n" ++-" -2 FILE write reads designated READ2 to FILE\n" ++-" note: if a singleton file is specified with -s, only\n" ++-" paired reads will be written to the -1 and -2 files.\n" ++-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x ++-" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 ++-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) ++-" -n don't append /1 and /2 to the read name\n" ++-" -N always append /1 and /2 to the read name\n"); ++- if (fq) fprintf(to, ++-" -O output quality in the OQ tag if present\n"); ++- fprintf(to, ++-" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" ++-" -t copy RG, BC and QT tags to the %s header line\n", ++- fq ? "FASTQ" : "FASTA"); ++- fprintf(to, ++-" -T TAGLIST copy arbitrary tags to the %s header line\n", ++- fq ? 
"FASTQ" : "FASTA"); ++- if (fq) fprintf(to, ++-" -v INT default quality score if not given in file [1]\n" ++-" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" ++-" -c compression level [0..9] to use when creating gz or bgzf fastq files\n" ++-" --i1 FILE write first index reads to FILE\n" ++-" --i2 FILE write second index reads to FILE\n" ++-" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" ++-" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" ++-" --index-format STR How to parse barcode and quality tags\n\n"); ++- sam_global_opt_help(to, "-.--.@"); ++- fprintf(to, ++-"\n" ++-"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" ++-"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" ++-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" ++-"or both unset.\n" ++-"Run 'samtools flags' for more information on flag codes and meanings.\n"); ++- fprintf(to, ++-"\n" ++-"The index-format string describes how to parse the barcode and quality tags, for example:\n" ++-" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" ++-" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" ++-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" ++-"'read until the separator or end of tag', for example:\n" ++-" n*i* ignore the left part of the tag until the separator, then use the second part\n" ++-" of the tag as index 1\n"); ++- fprintf(to, ++-"\n" ++-"Examples:\n" ++-" To get just the paired reads in separate files, use:\n" ++-" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n -F 0x900 in.bam\n" ++-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" ++-" samtools %s -F 0x900 in.bam > all_reads.%s\n", ++- command, fq ? "fq" : "fa", fq ? "fq" : "fa", ++- command, fq ? 
"fq" : "fa"); ++-} ++- ++-typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; ++-typedef enum { FASTA, FASTQ } fastfile; ++-typedef struct bam2fq_opts { ++- char *fnse; ++- char *fnr[3]; ++- char *fn_input; // pointer to input filename in argv do not free ++- bool has12, has12always, use_oq, copy_tags, illumina_tag; ++- int flag_on, flag_off, flag_alloff; ++- sam_global_args ga; ++- fastfile filetype; ++- int def_qual; ++- char *barcode_tag; ++- char *quality_tag; ++- char *index_file[2]; ++- char *index_format; ++- char *extra_tags; ++- char compression_level; ++-} bam2fq_opts_t; ++- ++-typedef struct bam2fq_state { ++- samFile *fp; ++- BGZF *fpse; ++- BGZF *fpr[3]; ++- BGZF *fpi[2]; ++- BGZF *hsamtools_stdout; ++- bam_hdr_t *h; ++- bool has12, use_oq, copy_tags, illumina_tag; ++- int flag_on, flag_off, flag_alloff; ++- fastfile filetype; ++- int def_qual; ++- klist_t(ktaglist) *taglist; ++- char *index_sequence; ++- char compression_level; ++-} bam2fq_state_t; ++- ++-/* ++- * Get and decode the read from a BAM record. ++- * ++- * TODO: htslib really needs an interface for this. Consider this or perhaps ++- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str ++- * functions as string formatted equivalents to bam_get_{seq,qual}? ++- */ ++- ++-/* ++- * Reverse a string in place. ++- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
++- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik ++- */ ++-static char *reverse(char *str) ++-{ ++- int i = strlen(str)-1,j=0; ++- char ch; ++- while (i>j) { ++- ch = str[i]; ++- str[i]= str[j]; ++- str[j] = ch; ++- i--; ++- j++; ++- } ++- return str; ++-} ++- ++-/* return the read, reverse complemented if necessary */ ++-static char *get_read(const bam1_t *rec) ++-{ ++- int len = rec->core.l_qseq + 1; ++- char *read = calloc(1, len); ++- char *seq = (char *)bam_get_seq(rec); ++- int n; ++- ++- if (!read) return NULL; ++- ++- for (n=0; n < rec->core.l_qseq; n++) { ++- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; ++- else read[n] = seq_nt16_str[bam_seqi(seq,n)]; ++- } ++- if (rec->core.flag & BAM_FREVERSE) reverse(read); ++- return read; ++-} ++- ++-/* ++- * get and decode the quality from a BAM record ++- */ ++-static int get_quality(const bam1_t *rec, char **qual_out) ++-{ ++- char *quality = calloc(1, rec->core.l_qseq + 1); ++- char *q = (char *)bam_get_qual(rec); ++- int n; ++- ++- if (!quality) return -1; ++- ++- if (*q == '\xff') { ++- free(quality); ++- *qual_out = NULL; ++- return 0; ++- } ++- ++- for (n=0; n < rec->core.l_qseq; n++) { ++- quality[n] = q[n]+33; ++- } ++- if (rec->core.flag & BAM_FREVERSE) reverse(quality); ++- *qual_out = quality; ++- return 0; ++-} ++- ++-// ++-// End of htslib complaints ++-// ++- ++- ++-static readpart which_readpart(const bam1_t *b) ++-{ ++- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { ++- return READ_1; ++- } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { ++- return READ_2; ++- } else { ++- return READ_UNKNOWN; ++- } ++-} ++- ++-/* ++- * parse the length part from the index-format string ++- */ ++-static int getLength(char **s) ++-{ ++- int n = 0; ++- while (**s) { ++- if (**s == '*') { n=-1; (*s)++; break; } ++- if ( !isdigit(**s)) break; ++- n = n*10 + ((**s)-'0'); ++- (*s)++; ++- } ++- return n; 
++-} ++- ++-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) ++-{ ++- uint8_t *s = bam_aux_get(rec, tag); ++- if (s) { ++- char aux_type = *s; ++- switch (aux_type) { ++- case 'C': ++- case 'S': aux_type = 'I'; break; ++- case 'c': ++- case 's': aux_type = 'i'; break; ++- case 'd': aux_type = 'f'; break; ++- } ++- ++- // Ensure space. Need 6 chars + length of tag. Max length of ++- // i is 16, A is 21, B currently 26, Z is unknown, so ++- // have to check that one later. ++- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; ++- ++- kputc('\t', linebuf); ++- kputsn(tag, 2, linebuf); ++- kputc(':', linebuf); ++- kputc(aux_type=='I'? 'i': aux_type, linebuf); ++- kputc(':', linebuf); ++- switch (aux_type) { ++- case 'H': ++- case 'Z': ++- if (kputs(bam_aux2Z(s), linebuf) < 0) return false; ++- break; ++- case 'i': kputw(bam_aux2i(s), linebuf); break; ++- case 'I': kputuw(bam_aux2i(s), linebuf); break; ++- case 'A': kputc(bam_aux2A(s), linebuf); break; ++- case 'f': kputd(bam_aux2f(s), linebuf); break; ++- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; ++- default: kputs("*** Unknown aux type ***", linebuf); return false; ++- } ++- } ++- return true; ++-} ++- ++-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) ++-{ ++- if (!index_sequence) return 0; ++- ++- kstring_t new = {0,0,NULL}; ++- if (linebuf->s) { ++- char *s = strchr(linebuf->s, '\n'); ++- if (s) { ++- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) ++- return -1; ++- *s = 0; ++- kputs(linebuf->s, &new); ++- kputc(' ', &new); ++- readpart readpart = which_readpart(rec); ++- if (readpart == READ_1) kputc('1', &new); ++- else if (readpart == READ_2) kputc('2', &new); ++- else kputc('0', &new); ++- ++- kputc(':', &new); ++- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); ++- else kputc('N', &new); ++- ++- kputs(":0:", &new); ++- kputs(index_sequence, &new); ++- kputc('\n', 
&new); ++- kputs(s+1, &new); ++- free(ks_release(linebuf)); ++- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; ++- } ++- } ++- return 0; ++-} ++- ++-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) ++-{ ++- int i; ++- ++- linebuf->l = 0; ++- // Write read name ++- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; ++- if (kputs(bam_get_qname(rec), linebuf) < 0) return false; ++- // Add the /1 /2 if requested ++- if (state->has12) { ++- readpart readpart = which_readpart(rec); ++- if (readpart == READ_1) { ++- if (kputs("/1", linebuf) < 0) return false; ++- } else if (readpart == READ_2) { ++- if (kputs("/2", linebuf) < 0) return false; ++- } ++- } ++- if (state->copy_tags) { ++- for (i = 0; copied_tags[i]; ++i) { ++- if (!copy_tag(copied_tags[i], rec, linebuf)) { ++- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); ++- return false; ++- } ++- } ++- } ++- ++- if (state->taglist->size) { ++- kliter_t(ktaglist) *p; ++- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { ++- if (!copy_tag(kl_val(p), rec, linebuf)) { ++- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); ++- return false; ++- } ++- } ++- } ++- ++- if (kputc('\n', linebuf) < 0) return false; ++- if (kputs(seq, linebuf) < 0) return false; ++- if (kputc('\n', linebuf) < 0) return false; ++- ++- if (state->filetype == FASTQ) { ++- // Write quality ++- if (kputs("+\n", linebuf) < 0) return false; ++- if (qual && *qual) { ++- if (kputs(qual, linebuf) < 0) return false; ++- } else { ++- int len = strlen(seq); ++- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; ++- for (i = 0; i < len; ++i) { ++- kputc(33 + state->def_qual, linebuf); ++- } ++- } ++- if (kputc('\n', linebuf) < 0) return false; ++- } ++- return true; ++-} ++- ++-/* ++- * Create FASTQ lines from the barcode tag using the index-format ++- */ 
++-static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) ++-{ ++- uint8_t *p; ++- char *ifmt = opts->index_format; ++- char *tag = NULL; ++- char *qual = NULL; ++- char *sub_tag = NULL; ++- char *sub_qual = NULL; ++- size_t tag_len; ++- int file_number = 0; ++- kstring_t linebuf = { 0, 0, NULL }; // Buffer ++- ++- ++- // read barcode tag ++- p = bam_aux_get(rec,opts->barcode_tag); ++- if (p) tag = bam_aux2Z(p); ++- ++- if (!tag) return true; // there is no tag ++- ++- tag_len = strlen(tag); ++- sub_tag = calloc(1, tag_len + 1); ++- if (!sub_tag) goto fail; ++- sub_qual = calloc(1, tag_len + 1); ++- if (!sub_qual) goto fail; ++- ++- // read quality tag ++- p = bam_aux_get(rec, opts->quality_tag); ++- if (p) qual = bam_aux2Z(p); ++- ++- // Parse the index-format string ++- while (*ifmt) { ++- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly ++- char action = *ifmt; // should be 'i' or 'n' ++- ifmt++; // skip over action ++- int index_len = getLength(&ifmt); ++- int n = 0; ++- ++- if (index_len < 0) { ++- // read until separator ++- while (isalpha(*tag)) { ++- sub_tag[n] = *tag++; ++- if (qual) sub_qual[n] = *qual++; ++- n++; ++- } ++- if (*tag) { // skip separator ++- tag++; ++- if (qual) qual++; ++- } ++- } else { ++- // read index_len characters ++- while (index_len-- && *tag) { ++- sub_tag[n] = *tag++; ++- if (qual) sub_qual[n] = *qual++; ++- n++; ++- } ++- } ++- sub_tag[n] = '\0'; ++- sub_qual[n] = '\0'; ++- ++- if (action=='i' && *sub_tag && state->fpi[file_number]) { ++- //if (file_number==0) state->index_sequence = strdup(sub_tag); // we're going to need this later... ++- state->index_sequence = strdup(sub_tag); // we're going to need this later... 
++- if (!state->index_sequence) goto fail; ++- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; ++- if (state->illumina_tag) { ++- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) { ++- goto fail; ++- } ++- } ++- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) ++- goto fail; ++- } ++- ++- } ++- ++- free(sub_qual); free(sub_tag); ++- free(linebuf.s); ++- return true; ++- ++- fail: ++- perror(__func__); ++- free(sub_qual); free(sub_tag); ++- free(linebuf.s); ++- return true; ++-} ++- ++-// Transform a bam1_t record into a string with the FASTQ representation of it ++-// @returns false for error, true for success ++-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) ++-{ ++- int32_t qlen = b->core.l_qseq; ++- assert(qlen >= 0); ++- const uint8_t *oq = NULL; ++- char *qual = NULL; ++- ++- char *seq = get_read(b); ++- if (!seq) return false; ++- ++- if (state->use_oq) oq = bam_aux_get(b, "OQ"); ++- if (oq && *oq=='Z') { ++- qual = strdup(bam_aux2Z(oq)); ++- if (!qual) goto fail; ++- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented ++- reverse(qual); ++- } ++- } else { ++- if (get_quality(b, &qual) < 0) goto fail; ++- } ++- ++- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; ++- ++- free(qual); ++- free(seq); ++- return true; ++- ++- fail: ++- free(seq); ++- free(qual); ++- return false; ++-} ++- ++-static void free_opts(bam2fq_opts_t *opts) ++-{ ++- free(opts->barcode_tag); ++- free(opts->quality_tag); ++- free(opts->index_format); ++- free(opts->extra_tags); ++- free(opts); ++-} ++- ++-// return true if valid ++-static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) ++-{ ++- // Parse args ++- bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); ++- opts->has12 = true; ++- opts->has12always = false; ++- opts->filetype = FASTQ; ++- opts->def_qual = 1; ++- opts->barcode_tag = NULL; ++- opts->quality_tag = 
NULL; ++- opts->index_format = NULL; ++- opts->index_file[0] = NULL; ++- opts->index_file[1] = NULL; ++- opts->extra_tags = NULL; ++- opts->compression_level = 1; ++- ++- int c; ++- sam_global_args_init(&opts->ga); ++- static const struct option lopts[] = { ++- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), ++- {"i1", required_argument, NULL, 1}, ++- {"I1", required_argument, NULL, 1}, ++- {"i2", required_argument, NULL, 2}, ++- {"I2", required_argument, NULL, 2}, ++- {"if", required_argument, NULL, 3}, ++- {"IF", required_argument, NULL, 3}, ++- {"index-format", required_argument, NULL, 3}, ++- {"barcode-tag", required_argument, NULL, 'b'}, ++- {"quality-tag", required_argument, NULL, 'q'}, ++- { NULL, 0, NULL, 0 } ++- }; ++- while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { ++- switch (c) { ++- case 'b': opts->barcode_tag = strdup(optarg); break; ++- case 'q': opts->quality_tag = strdup(optarg); break; ++- case 1 : opts->index_file[0] = optarg; break; ++- case 2 : opts->index_file[1] = optarg; break; ++- case 3 : opts->index_format = strdup(optarg); break; ++- case '0': opts->fnr[0] = optarg; break; ++- case '1': opts->fnr[1] = optarg; break; ++- case '2': opts->fnr[2] = optarg; break; ++- case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; ++- case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; ++- case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; ++- case 'n': opts->has12 = false; break; ++- case 'N': opts->has12always = true; break; ++- case 'O': opts->use_oq = true; break; ++- case 's': opts->fnse = optarg; break; ++- case 't': opts->copy_tags = true; break; ++- case 'i': opts->illumina_tag = true; break; ++- case 'c': opts->compression_level = atoi(optarg); break; ++- case 'T': opts->extra_tags = strdup(optarg); break; ++- case 'v': opts->def_qual = atoi(optarg); break; ++- case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; ++- default: ++- if (parse_sam_global_opt(c, 
optarg, lopts, &opts->ga) != 0) { ++- bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; ++- } ++- break; ++- } ++- } ++- ++- if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; ++- if (opts->has12always) opts->has12 = true; ++- ++- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); ++- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); ++- ++- int nIndex = 0; ++- if (opts->index_format) { ++- char *s; ++- for (s = opts->index_format; *s; s++) { ++- if (*s == 'i') nIndex++; ++- } ++- } ++- if (nIndex>2) { ++- fprintf(samtools_stderr,"Invalid index format: more than 2 indexes\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (opts->index_file[1] && !opts->index_file[0]) { ++- fprintf(samtools_stderr, "Index one specified, but index two not given\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==2 && !opts->index_file[1]) { ++- fprintf(samtools_stderr, "index_format specifies two indexes, but only one index file given\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==1 && !opts->index_file[0]) { ++- fprintf(samtools_stderr, "index_format specifies an index, but no index file given\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (nIndex==0 && opts->index_file[0]) { ++- fprintf(samtools_stderr, "index_format not specified, but index file given\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if (opts->def_qual < 0 || 93 < opts->def_qual) { ++- fprintf(samtools_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- const char* type_str = argv[0]; ++- if (strcasecmp("fastq", type_str) == 0 || 
strcasecmp("bam2fq", type_str) == 0) { ++- opts->filetype = FASTQ; ++- } else if (strcasecmp("fasta", type_str) == 0) { ++- opts->filetype = FASTA; ++- } else { ++- print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if ((argc - (optind)) == 0) { ++- fprintf(samtools_stderr, "No input file specified.\n"); ++- bam2fq_usage(samtools_stdout, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- ++- if ((argc - (optind)) != 1) { ++- fprintf(samtools_stderr, "Too many arguments.\n"); ++- bam2fq_usage(samtools_stderr, argv[0]); ++- free_opts(opts); ++- return false; ++- } ++- opts->fn_input = argv[optind]; ++- *opts_out = opts; ++- return true; ++-} ++- ++-static BGZF *open_fqfile(char *filename, int c) ++-{ ++- char mode[4] = "w"; ++- size_t len = strlen(filename); ++- ++- mode[2] = 0; mode[3] = 0; ++- if (len > 3 && strstr(filename + (len - 3),".gz")) { ++- mode[1] = 'g'; mode[2] = c+'0'; ++- } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) ++- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { ++- mode[1] = c+'0'; ++- } else { ++- mode[1] = 'u'; ++- } ++- ++- return bgzf_open(filename,mode); ++-} ++- ++-static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) ++-{ ++- bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); ++- state->flag_on = opts->flag_on; ++- state->flag_off = opts->flag_off; ++- state->flag_alloff = opts->flag_alloff; ++- state->has12 = opts->has12; ++- state->use_oq = opts->use_oq; ++- state->illumina_tag = opts->illumina_tag; ++- state->copy_tags = opts->copy_tags; ++- state->filetype = opts->filetype; ++- state->def_qual = opts->def_qual; ++- state->index_sequence = NULL; ++- state->hsamtools_stdout = NULL; ++- state->compression_level = opts->compression_level; ++- ++- state->taglist = kl_init(ktaglist); ++- if (opts->extra_tags) { ++- char 
*save_p; ++- char *s = strtok_r(opts->extra_tags, ",", &save_p); ++- while (s) { ++- if (strlen(s) != 2) { ++- fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); ++- free(state); ++- return false; ++- } ++- char **et = kl_pushp(ktaglist, state->taglist); ++- *et = s; ++- s = strtok_r(NULL, ",", &save_p); ++- } ++- } ++- ++- state->fp = sam_open(opts->fn_input, "r"); ++- if (state->fp == NULL) { ++- print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); ++- free(state); ++- return false; ++- } ++- if (opts->ga.nthreads > 0) ++- hts_set_threads(state->fp, opts->ga.nthreads); ++- uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; ++- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; ++- if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { ++- fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); ++- free(state); ++- return false; ++- } ++- if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { ++- fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); ++- free(state); ++- return false; ++- } ++- if (opts->fnse) { ++- state->fpse = open_fqfile(opts->fnse, state->compression_level); ++- if (state->fpse == NULL) { ++- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); ++- free(state); ++- return false; ++- } ++- } ++- ++- if (opts->ga.reference) { ++- if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { ++- print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); ++- free(state); ++- return false; ++- } ++- } ++- ++- int i; ++- for (i = 0; i < 3; ++i) { ++- if (opts->fnr[i]) { ++- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level); ++- if (state->fpr[i] == NULL) { ++- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]); ++- free(state); ++- return false; ++- } ++- } else { ++- if (!state->hsamtools_stdout) { ++- state->hsamtools_stdout = 
bgzf_dopen(fileno(samtools_stdout), "wu"); ++- if (!state->hsamtools_stdout) { ++- print_error_errno("bam2fq", "Cannot open STDOUT"); ++- free(state); ++- return false; ++- } ++- } ++- state->fpr[i] = state->hsamtools_stdout; ++- } ++- } ++- for (i = 0; i < 2; i++) { ++- state->fpi[i] = NULL; ++- if (opts->index_file[i]) { ++- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level); ++- if (state->fpi[i] == NULL) { ++- print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]); ++- free(state); ++- return false; ++- } ++- } ++- } ++- ++- state->h = sam_hdr_read(state->fp); ++- if (state->h == NULL) { ++- fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); ++- free(state); ++- return false; ++- } ++- ++- *state_out = state; ++- return true; ++-} ++- ++-static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) ++-{ ++- bool valid = true; ++- bam_hdr_destroy(state->h); ++- check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); ++- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } ++- int i; ++- for (i = 0; i < 3; ++i) { ++- if (state->fpr[i] != state->hsamtools_stdout) { ++- if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } ++- } ++- } ++- if (state->hsamtools_stdout) { ++- if (bgzf_close(state->hsamtools_stdout)) { ++- print_error_errno("bam2fq", "Error closing STDOUT"); ++- valid = false; ++- } ++- } ++- for (i = 0; i < 2; i++) { ++- if (state->fpi[i] && bgzf_close(state->fpi[i])) { ++- print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); ++- valid = false; ++- } ++- } ++- kl_destroy(ktaglist,state->taglist); ++- free(state->index_sequence); ++- free(state); ++- return valid; ++-} ++- ++-static inline bool filter_it_out(const bam1_t *b, const 
bam2fq_state_t *state) ++-{ ++- return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments ++- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags ++- || (b->core.flag&(state->flag_off)) != 0 ++- || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); ++- ++-} ++- ++-static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) ++-{ ++- int n; ++- bam1_t *records[3]; ++- bam1_t* b = bam_init1(); ++- char *current_qname = NULL; ++- int64_t n_reads = 0, n_singletons = 0; // Statistics ++- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; ++- int score[3]; ++- int at_eof; ++- if (b == NULL ) { ++- perror("[bam2fq_mainloop] Malloc error for bam record buffer."); ++- return false; ++- } ++- ++- bool valid = true; ++- while (true) { ++- int res = sam_read1(state->fp, state->h, b); ++- if (res < -1) { ++- fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); ++- return false; ++- } ++- at_eof = res < 0; ++- ++- if (!at_eof && filter_it_out(b, state)) continue; ++- if (!at_eof) ++n_reads; ++- ++- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { ++- if (current_qname) { ++- if (state->illumina_tag) { ++- for (n=0; valid && n<3; n++) { ++- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; ++- } ++- if (!valid) break; ++- } ++- free(state->index_sequence); state->index_sequence = NULL; ++- if (score[1] > 0 && score[2] > 0) { ++- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] ++- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } ++- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } else if (score[1] > 0 || score[2] > 0) { ++- if (state->fpse) { ++- // print whichever one exists to fpse ++- if (score[1] > 0) { ++- if 
(bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } ++- } else { ++- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } ++- ++n_singletons; ++- } else { ++- if (score[1] > 0) { ++- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } ++- } else { ++- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } ++- } ++- } ++- } ++- if (score[0]) { // TODO: check this ++- // print linebuf[0] to fpr[0] ++- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } ++- } ++- } ++- ++- if (at_eof) break; ++- ++- free(current_qname); ++- current_qname = strdup(bam_get_qname(b)); ++- if (!current_qname) { valid = false; break; } ++- score[0] = score[1] = score[2] = 0; ++- } ++- ++- // Prefer a copy of the read that has base qualities ++- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; ++- if (b_score > score[which_readpart(b)]) { ++- if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false; ++- records[which_readpart(b)] = b; ++- if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { ++- fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); ++- return false; ++- } ++- score[which_readpart(b)] = b_score; ++- } ++- } ++- if (!valid) ++- { ++- perror("[bam2fq_mainloop] Error writing to FASTx files."); ++- } ++- bam_destroy1(b); ++- free(current_qname); ++- free(linebuf[0].s); ++- free(linebuf[1].s); ++- free(linebuf[2].s); ++- fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); ++- fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); ++- ++- return valid; ++-} ++- ++-int main_bam2fq(int argc, char *argv[]) ++-{ ++- int status = EXIT_SUCCESS; ++- bam2fq_opts_t* opts = NULL; ++- bam2fq_state_t* state = NULL; ++- ++- bool valid = parse_opts(argc, argv, &opts); ++- if (!valid || opts == NULL) return valid 
? EXIT_SUCCESS : EXIT_FAILURE; ++- ++- if (!init_state(opts, &state)) return EXIT_FAILURE; ++- ++- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; ++- ++- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; ++- sam_global_args_free(&opts->ga); ++- free_opts(opts); ++- ++- return status; ++-} ++--- python-pysam.orig/samtools/samtools.h +++++ python-pysam/samtools/samtools.h ++@@ -1,6 +1,6 @@ ++ /* samtools.h -- utility routines. ++ ++- Copyright (C) 2013-2015 Genome Research Ltd. +++ Copyright (C) 2013-2015, 2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ ++@@ -25,15 +25,28 @@ ++ #ifndef SAMTOOLS_H ++ #define SAMTOOLS_H ++ +++#include "htslib/hts_defs.h" +++#include "htslib/sam.h" +++ ++ const char *samtools_version(void); ++ ++-#if defined __GNUC__ && __GNUC__ >= 2 ++-#define CHECK_PRINTF(fmt,args) __attribute__ ((format (printf, fmt, args))) ++-#else ++-#define CHECK_PRINTF(fmt,args) ++-#endif +++#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args)) ++ ++ void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); ++ void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); ++ +++void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp); +++ +++/* +++ * Utility function to add an index to a file we've opened for write. +++ * NB: Call this after writing the header and before writing sequences. +++ * +++ * The returned index filename should be freed by the caller, but only +++ * after sam_idx_save has been called. +++ * +++ * Returns index filename on success, +++ * NULL on failure. +++ */ +++char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header); +++ ++ #endif ++--- python-pysam.orig/samtools/stats.c +++++ python-pysam/samtools/stats.c ++@@ -1,6 +1,6 @@ ++ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. ++ ++- Copyright (C) 2012-2015 Genome Research Ltd. 
+++ Copyright (C) 2012-2019 Genome Research Ltd. ++ ++ Author: Petr Danecek ++ Author: Sam Nicholls ++@@ -46,6 +46,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -53,7 +54,7 @@ ++ #include ++ #include ++ #include ++-#include "sam_header.h" +++#include ++ #include ++ #include "samtools.h" ++ #include ++@@ -65,8 +66,10 @@ ++ #define BWA_MIN_RDLEN 35 ++ #define DEFAULT_CHUNK_NO 8 ++ #define DEFAULT_PAIR_MAX 10000 +++#define ERROR_LIMIT 200 ++ // From the spec ++ // If 0x4 is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, bits 0x2, 0x10, 0x100 and 0x800, and the bit 0x20 of the previous read in the template. +++#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) ++ #define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) ++ #define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) ++ #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) ++@@ -77,6 +80,14 @@ ++ #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) ++ #define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0) ++ +++#define READ_ORDER_NONE 0 +++#define READ_ORDER_FIRST 1 +++#define READ_ORDER_LAST 2 +++#define READ_ORDER_MIDDLE 3 +++ +++#define REG_INC 100 +++#define POS_INC 1000 +++ ++ // The GC-depth graph works as follows: split the reference sequence into ++ // segments and calculate GC content and depth in each bin. 
Then sort ++ // these segments by their GC and plot the depth distribution by means ++@@ -91,17 +102,16 @@ ++ // For coverage distribution, a simple pileup ++ typedef struct ++ { ++- int64_t pos; +++ hts_pos_t pos; ++ int size, start; ++ int *buffer; ++ } ++ round_buffer_t; ++ ++-typedef struct { uint32_t from, to; } pos_t; ++ typedef struct ++ { ++- int npos,mpos,cpos; ++- pos_t *pos; +++ int npos, mpos, cpos; +++ hts_pair_pos_t *pos; ++ } ++ regions_t; ++ ++@@ -118,6 +128,17 @@ ++ ++ typedef struct ++ { +++ char tag_name[3]; +++ char qual_name[3]; +++ uint32_t nbases; +++ int32_t tag_sep; // Index of the separator (if present) +++ int32_t max_qual; +++ uint32_t offset; // Where the tag stats info is located in the allocated memory +++} +++barcode_info_t; +++ +++typedef struct +++{ ++ // Auxiliary data ++ int flag_require, flag_filter; ++ faidx_t *fai; // Reference sequence for GC-depth graph ++@@ -129,7 +150,7 @@ ++ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part ++ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins ++ samFile* sam; ++- bam_hdr_t* sam_header; +++ sam_hdr_t* sam_header; ++ ++ // Filters ++ int filter_readlen; ++@@ -175,6 +196,7 @@ ++ uint64_t total_len_dup; ++ uint64_t nreads_1st; ++ uint64_t nreads_2nd; +++ uint64_t nreads_other; ++ uint64_t nreads_filtered; ++ uint64_t nreads_dup; ++ uint64_t nreads_unmapped; ++@@ -196,8 +218,8 @@ ++ // GC-depth related data ++ uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin ++ gc_depth_t *gcd; // The GC-depth bins holder ++- int32_t tid, gcd_pos; // Position of the current bin ++- int32_t pos; // Position of the last read +++ int32_t tid; // Position of the current bin +++ hts_pos_t gcd_pos, pos; // Position of the last read ++ ++ // Coverage distribution related data ++ int ncov; // The number of coverage bins ++@@ -207,12 +229,13 @@ ++ // Mismatches by read cycle ++ 
uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against ++ int mrseq_buf; // The size of the buffer ++- int32_t rseq_pos; // The coordinate of the first base in the buffer ++- int32_t nrseq_buf; // The used part of the buffer +++ hts_pos_t rseq_pos; // The coordinate of the first base in the buffer +++ int64_t nrseq_buf; // The used part of the buffer ++ uint64_t *mpc_buf; // Mismatches per cycle ++ ++ // Target regions ++- int nregions, reg_from, reg_to; +++ int nregions; +++ hts_pos_t reg_from, reg_to; ++ regions_t *regions; ++ ++ // Auxiliary data ++@@ -223,13 +246,20 @@ ++ char* split_name; ++ ++ stats_info_t* info; // Pointer to options and settings struct ++- pos_t *chunks; +++ hts_pair_pos_t *chunks; ++ uint32_t nchunks; ++ ++ uint32_t pair_count; // Number of active pairs in the pairing hash table ++ uint32_t target_count; // Number of bases covered by the target file ++ uint32_t last_pair_tid; ++ uint32_t last_read_flush; +++ +++ // Barcode statistics +++ acgtno_count_t *acgtno_barcode; +++ uint64_t *quals_barcode; +++ barcode_info_t *tags_barcode; +++ uint32_t ntags; +++ uint32_t error_number; ++ } ++ stats_t; ++ KHASH_MAP_INIT_STR(c2stats, stats_t*) ++@@ -237,18 +267,18 @@ ++ typedef struct { ++ uint32_t first; // 1 - first read, 2 - second read ++ uint32_t n, m; // number of chunks, allocated chunks ++- pos_t *chunks; // chunk array of size m +++ hts_pair_pos_t *chunks; // chunk array of size m ++ } pair_t; ++ KHASH_MAP_INIT_STR(qn2pair, pair_t*) ++ ++ ++-static void error(const char *format, ...); +++static void HTS_NORETURN error(const char *format, ...); ++ int is_in_regions(bam1_t *bam_line, stats_t *stats); ++ void realloc_buffers(stats_t *stats, int seq_len); ++ ++ static int regions_lt(const void *r1, const void *r2) { ++- int64_t from_diff = (int64_t)((pos_t *)r1)->from - (int64_t)((pos_t *)r2)->from; ++- int64_t to_diff = (int64_t)((pos_t *)r1)->to - (int64_t)((pos_t *)r2)->to; +++ int64_t from_diff = 
((hts_pair_pos_t *)r1)->beg - ((hts_pair_pos_t *)r2)->beg; +++ int64_t to_diff = ((hts_pair_pos_t *)r1)->end - ((hts_pair_pos_t *)r2)->end; ++ ++ return from_diff > 0 ? 1 : from_diff < 0 ? -1 : to_diff > 0 ? 1 : to_diff < 0 ? -1 : 0; ++ } ++@@ -265,19 +295,19 @@ ++ return 1 + (depth - min) / step; ++ } ++ ++-static inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) +++static inline int round_buffer_lidx2ridx(int offset, int size, hts_pos_t refpos, hts_pos_t pos) ++ { ++ return (offset + (pos-refpos) % size) % size; ++ } ++ ++-void round_buffer_flush(stats_t *stats, int64_t pos) +++void round_buffer_flush(stats_t *stats, hts_pos_t pos) ++ { ++ int ibuf,idp; ++ ++ if ( pos==stats->cov_rbuf.pos ) ++ return; ++ ++- int64_t new_pos = pos; +++ hts_pos_t new_pos = pos; ++ if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) ++ { ++ // Flush the whole buffer, but in sequential order, ++@@ -285,10 +315,10 @@ ++ } ++ ++ if ( pos < stats->cov_rbuf.pos ) ++- error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); +++ error("Expected coordinates in ascending order, got %"PRIhts_pos" after %"PRIhts_pos"\n", pos, stats->cov_rbuf.pos); ++ ++ int ifrom = stats->cov_rbuf.start; ++- int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); +++ int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos-1); ++ if ( ifrom>ito ) ++ { ++ for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) ++@@ -309,27 +339,30 @@ ++ stats->cov[idp]++; ++ stats->cov_rbuf.buffer[ibuf] = 0; ++ } ++- stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); +++ stats->cov_rbuf.start = (new_pos==-1) ? 
0 : round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos); ++ stats->cov_rbuf.pos = new_pos; ++ } ++ ++-void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) +++/** +++ * [from, to) - 0 based half-open +++ */ +++static void round_buffer_insert_read(round_buffer_t *rbuf, hts_pos_t from, hts_pos_t to) ++ { ++- if ( to-from >= rbuf->size ) ++- error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); +++ if ( to-from > rbuf->size ) +++ error("The read length too big (%"PRIhts_pos"), please increase the buffer length (currently %d)\n", to-from, rbuf->size); ++ if ( from < rbuf->pos ) ++- error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); +++ error("The reads are not sorted (%"PRIhts_pos" comes after %"PRIhts_pos").\n", from, rbuf->pos); ++ ++- int ifrom,ito,ibuf; ++- ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); ++- ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); +++ int ifrom, ito, ibuf; +++ ifrom = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, from); +++ ito = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, to); ++ if ( ifrom>ito ) ++ { ++ for (ibuf=ifrom; ibufsize; ibuf++) ++ rbuf->buffer[ibuf]++; ++ ifrom = 0; ++ } ++- for (ibuf=ifrom; ibuf<=ito; ibuf++) +++ for (ibuf=ifrom; ibufbuffer[ibuf]++; ++ } ++ ++@@ -362,7 +395,7 @@ ++ void count_indels(stats_t *stats,bam1_t *bam_line) ++ { ++ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; ++- int is_1st = IS_READ1(bam_line) ? 1 : 0; +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; ++ int icig; ++ int icycle = 0; ++ int read_len = bam_line->core.l_qseq; ++@@ -377,10 +410,10 @@ ++ int idx = is_fwd ? 
icycle : read_len-icycle-ncig; ++ if ( idx<0 ) ++ error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); ++- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); ++- if ( is_1st ) +++ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%"PRIhts_pos" %s\n", idx, stats->nbases, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); +++ if ( order == READ_ORDER_FIRST ) ++ stats->ins_cycles_1st[idx]++; ++- else +++ if ( order == READ_ORDER_LAST ) ++ stats->ins_cycles_2nd[idx]++; ++ icycle += ncig; ++ if ( ncig<=stats->nindels ) ++@@ -392,9 +425,9 @@ ++ int idx = is_fwd ? icycle-1 : read_len-icycle-1; ++ if ( idx<0 ) continue; // discard meaningless deletions ++ if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); ++- if ( is_1st ) +++ if ( order == READ_ORDER_FIRST ) ++ stats->del_cycles_1st[idx]++; ++- else +++ if ( order == READ_ORDER_LAST ) ++ stats->del_cycles_2nd[idx]++; ++ if ( ncig<=stats->nindels ) ++ stats->deletions[ncig-1]++; ++@@ -420,8 +453,8 @@ ++ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) ++ { ++ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; ++- int icig,iread=0,icycle=0; ++- int iref = bam_line->core.pos - stats->rseq_pos; +++ int icig, iread=0, icycle=0; +++ hts_pos_t iref = bam_line->core.pos - stats->rseq_pos; ++ uint8_t *read = bam_get_seq(bam_line); ++ uint8_t *quals = bam_get_qual(bam_line); ++ uint64_t *mpc_buf = stats->mpc_buf; ++@@ -454,13 +487,13 @@ ++ continue; ++ } ++ // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large ++- // chunk of refseq in memory. Not very frequent and not noticable in the stats. +++ // chunk of refseq in memory. Not very frequent and not noticeable in the stats. 
++ if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue; ++ if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs ++- error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("TODO: cigar %d, %s:%"PRIhts_pos" %s\n", cig, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ if ( ncig+iref > stats->nrseq_buf ) ++- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1); +++ error("FIXME: %d+%"PRIhts_pos" > %"PRId64", %s, %s:%"PRIhts_pos"\n", ncig, iref, stats->nrseq_buf, bam_get_qname(bam_line), sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1); ++ ++ int im; ++ for (im=0; im=stats->nquals ) ++- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ int idx = is_fwd ? 
icycle : read_len-icycle-1; ++ if ( idx>stats->max_len ) ++- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("mpc: %d>%d (%s %"PRIhts_pos" %s)\n", idx, stats->max_len, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ idx = idx*stats->nquals + qual; ++ if ( idx>=stats->nquals*stats->nbases ) ++@@ -503,11 +536,12 @@ ++ } ++ } ++ ++-void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos) +++void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) ++ { ++- int i, fai_ref_len; ++- char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len); ++- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]); +++ int i; +++ hts_pos_t fai_ref_len; +++ char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); +++ if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); ++ ++ uint8_t *ptr = stats->rseq_buf; ++ for (i=0; itid = tid; ++ } ++ ++-float fai_gc_content(stats_t *stats, int pos, int len) +++float fai_gc_content(stats_t *stats, hts_pos_t pos, int len) ++ { ++ uint32_t gc,count,c; ++- int i = pos - stats->rseq_pos, ito = i + len; +++ hts_pos_t i = pos - stats->rseq_pos, ito = i + len; ++ assert( i>=0 ); ++ ++ if ( ito > stats->nrseq_buf ) ito = stats->nrseq_buf; ++@@ -568,6 +602,9 @@ ++ if ( stats->mrseq_bufrseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); +++ if (!stats->rseq_buf) { +++ error("Could not reallocate reference sequence buffer"); +++ } ++ stats->mrseq_buf = n; ++ } ++ } ++@@ -659,6 +696,9 @@ ++ ++ // Realloc the coverage distribution buffer ++ int *rbuffer = calloc(sizeof(int),seq_len*5); +++ 
if (!rbuffer) { +++ error("Could not allocate coverage distribution buffer"); +++ } ++ n = stats->cov_rbuf.size-stats->cov_rbuf.start; ++ memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); ++ if ( stats->cov_rbuf.start>1 ) ++@@ -688,6 +728,119 @@ ++ stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); ++ } ++ +++// Collect statistics about the barcode tags specified by init_barcode_tags method +++static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) { +++ uint32_t nbases, tag, i; +++ acgtno_count_t *acgtno; +++ uint64_t *quals; +++ int32_t *separator, *maxqual; +++ +++ for (tag = 0; tag < stats->ntags; tag++) { +++ const char *barcode_tag = stats->tags_barcode[tag].tag_name, *qual_tag = stats->tags_barcode[tag].qual_name; +++ uint8_t* bc = bam_aux_get(bam_line, barcode_tag); +++ if (!bc) +++ continue; +++ +++ char* barcode = bam_aux2Z(bc); +++ if (!barcode) +++ continue; +++ +++ uint32_t barcode_len = strlen(barcode); +++ if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time +++ uint32_t offset = 0; +++ for (i = 0; i < stats->ntags; i++) +++ offset += stats->tags_barcode[i].nbases; +++ +++ stats->tags_barcode[tag].offset = offset; +++ stats->tags_barcode[tag].nbases = barcode_len; +++ stats->acgtno_barcode = realloc(stats->acgtno_barcode, (offset + barcode_len) * sizeof(acgtno_count_t)); +++ stats->quals_barcode = realloc(stats->quals_barcode, (offset + barcode_len) * stats->nquals * sizeof(uint64_t)); +++ +++ if (!stats->acgtno_barcode || !stats->quals_barcode) +++ error("Error allocating memory. 
Aborting!\n"); +++ +++ memset(stats->acgtno_barcode + offset, 0, barcode_len*sizeof(acgtno_count_t)); +++ memset(stats->quals_barcode + offset*stats->nquals, 0, barcode_len*stats->nquals*sizeof(uint64_t)); +++ } +++ +++ nbases = stats->tags_barcode[tag].nbases; +++ if (barcode_len > nbases) { +++ fprintf(stderr, "Barcodes with tag %s differ in length at sequence '%s'\n", barcode_tag, bam_get_qname(bam_line)); +++ continue; +++ } +++ +++ acgtno = stats->acgtno_barcode + stats->tags_barcode[tag].offset; +++ quals = stats->quals_barcode + stats->tags_barcode[tag].offset*stats->nquals; +++ maxqual = &stats->tags_barcode[tag].max_qual; +++ separator = &stats->tags_barcode[tag].tag_sep; +++ int error_flag = 0; +++ +++ for (i = 0; i < barcode_len; i++) { +++ switch (barcode[i]) { +++ case 'A': +++ acgtno[i].a++; +++ break; +++ case 'C': +++ acgtno[i].c++; +++ break; +++ case 'G': +++ acgtno[i].g++; +++ break; +++ case 'T': +++ acgtno[i].t++; +++ break; +++ case 'N': +++ acgtno[i].n++; +++ break; +++ default: +++ if (*separator >= 0) { +++ if (*separator != i) { +++ if (stats->error_number < ERROR_LIMIT) { +++ fprintf(stderr, "Barcode separator for tag %s is in a different position or wrong barcode content('%s') at sequence '%s'\n", barcode_tag, barcode, bam_get_qname(bam_line)); +++ stats->error_number++; +++ } +++ error_flag = 1; +++ } +++ } else { +++ *separator = i; +++ } +++ } +++ +++ /* don't process the rest of the tag bases */ +++ if (error_flag) +++ break; +++ } +++ +++ /* skip to the next tag */ +++ if (error_flag) +++ continue; +++ +++ uint8_t* qt = bam_aux_get(bam_line, qual_tag); +++ if (!qt) +++ continue; +++ +++ char* barqual = bam_aux2Z(qt); +++ if (!barqual) +++ continue; +++ +++ uint32_t barqual_len = strlen(barqual); +++ if (barqual_len == barcode_len) { +++ for (i = 0; i < barcode_len; i++) { +++ int32_t qual = (int32_t)barqual[i] - '!'; // Phred + 33 +++ if (qual >= 0 && qual < stats->nquals) { +++ quals[i * stats->nquals + qual]++; +++ if (qual > 
*maxqual) +++ *maxqual = qual; +++ } +++ } +++ } else { +++ if (stats->error_number++ < ERROR_LIMIT) { +++ fprintf(stderr, "%s length and %s length don't match for sequence '%s'\n", barcode_tag, qual_tag, bam_get_qname(bam_line)); +++ } +++ } +++ } +++} +++ ++ // These stats should only be calculated for the original reads ignoring ++ // supplementary artificial reads otherwise we'll accidentally double count ++ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out) ++@@ -698,42 +851,48 @@ ++ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; ++ if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++; ++ +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; +++ ++ // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored ++ uint8_t *seq = bam_get_seq(bam_line); ++- int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line), is_first = IS_READ1(bam_line); ++- for (i=0; iacgtno_cycles_1st[ read_cycle ].a++ : stats->acgtno_cycles_2nd[ read_cycle ].a++; ++- break; ++- case 2: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].c++ : stats->acgtno_cycles_2nd[ read_cycle ].c++; ++- gc_count++; ++- break; ++- case 4: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].g++ : stats->acgtno_cycles_2nd[ read_cycle ].g++; ++- gc_count++; ++- break; ++- case 8: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].t++ : stats->acgtno_cycles_2nd[ read_cycle ].t++; ++- break; ++- case 15: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].n++ : stats->acgtno_cycles_2nd[ read_cycle ].n++; ++- break; ++- default: ++- /* ++- * count "=" sequences in "other" along ++- * with MRSVWYHKDB ambiguity codes ++- */ ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].other++ : stats->acgtno_cycles_2nd[ read_cycle ].other++; ++- break; +++ acgtno_count_t *acgtno_cycles = (order == READ_ORDER_FIRST) ? 
stats->acgtno_cycles_1st : (order == READ_ORDER_LAST) ? stats->acgtno_cycles_2nd : NULL ; +++ if (acgtno_cycles) { +++ for (i=0; ingc-1)/seq_len; ++@@ -743,38 +902,48 @@ ++ // Determine which array (1st or 2nd read) will these stats go to, ++ // trim low quality bases from end the same way BWA does, ++ // fill GC histogram ++- uint64_t *quals; +++ uint64_t *quals = NULL; ++ uint8_t *bam_quals = bam_get_qual(bam_line); ++- if ( IS_READ2(bam_line) ) ++- { ++- quals = stats->quals_2nd; ++- stats->nreads_2nd++; ++- stats->total_len_2nd += seq_len; ++- for (i=gc_idx_min; igc_2nd[i]++; ++- } ++- else ++- { +++ +++ switch (order) { +++ case READ_ORDER_FIRST: ++ quals = stats->quals_1st; ++ stats->nreads_1st++; ++ stats->total_len_1st += seq_len; ++ for (i=gc_idx_min; igc_1st[i]++; +++ break; +++ case READ_ORDER_LAST: +++ quals = stats->quals_2nd; +++ stats->nreads_2nd++; +++ stats->total_len_2nd += seq_len; +++ for (i=gc_idx_min; igc_2nd[i]++; +++ break; +++ default: +++ stats->nreads_other++; ++ } ++ if ( stats->info->trim_qual>0 ) ++ stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse); ++ ++ // Quality histogram and average quality. Clipping is neglected. 
++- for (i=0; i=stats->nquals ) ++- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); ++- if ( qual>stats->max_qual ) ++- stats->max_qual = qual; +++ if (quals) { +++ for (i=0; i=stats->nquals ) +++ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); +++ if ( qual>stats->max_qual ) +++ stats->max_qual = qual; +++ +++ quals[ i*stats->nquals+qual ]++; +++ stats->sum_qual += qual; +++ } +++ } ++ ++- quals[ i*stats->nquals+qual ]++; ++- stats->sum_qual += qual; +++ // Barcode statistics +++ if (order == READ_ORDER_FIRST) { +++ collect_barcode_stats(bam_line, stats); ++ } ++ ++ // Look at the flags and increment appropriate counters (mapped, paired, etc) ++@@ -803,7 +972,7 @@ ++ *gc_count_out = gc_count; ++ } ++ ++-static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, int max) { +++static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, hts_pos_t max) { ++ if ( !read_pairs ) ++ return 0; ++ ++@@ -814,7 +983,7 @@ ++ char *key = (char *)kh_key(read_pairs, k); ++ pair_t *val = kh_val(read_pairs, k); ++ if ( val && val->chunks ) { ++- if ( val->chunks[val->n-1].to < max ) { +++ if ( val->chunks[val->n-1].end < max ) { ++ free(val->chunks); ++ free(val); ++ free(key); ++@@ -828,29 +997,32 @@ ++ } ++ } ++ } ++- if ( max == INT_MAX ) +++ if ( max == INT64_MAX ) ++ kh_destroy(qn2pair, read_pairs); ++ ++ return count; ++ } ++ ++-static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, int pmin, int pmax) { +++/** +++ * [pmin, pmax) - 0 based half-open +++ */ +++static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, hts_pos_t pmin, hts_pos_t pmax) { ++ if ( !bam_line || !read_pairs || !stats ) ++ return; ++ ++- uint32_t first = 
(IS_READ1(bam_line) > 0 ? 1 : 0) + (IS_READ2(bam_line) > 0 ? 2 : 0) ; +++ uint32_t order = (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0); ++ if ( !(bam_line->core.flag & BAM_FPAIRED) || ++ (bam_line->core.flag & BAM_FMUNMAP) || ++- (abs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || ++- (first != 1 && first != 2) ) { +++ (llabs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || +++ (order != READ_ORDER_FIRST && order != READ_ORDER_LAST) ) { ++ if ( pmin >= 0 ) ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ return; ++ } ++ ++ char *qname = bam_get_qname(bam_line); ++ if ( !qname ) { ++- fprintf(stderr, "Error retrieving qname for line starting at pos %d\n", bam_line->core.pos); +++ fprintf(stderr, "Error retrieving qname for line starting at pos %"PRIhts_pos"\n", bam_line->core.pos); ++ return; ++ } ++ ++@@ -868,8 +1040,7 @@ ++ ++ k = kh_put(qn2pair, read_pairs, s, &ret); ++ if ( -1 == ret ) { ++- fprintf(stderr, "Error inserting read '%s' in pair hash table\n", qname); ++- return; +++ error("Error inserting read '%s' in pair hash table\n", qname); ++ } ++ ++ pair_t *pc = calloc(1, sizeof(pair_t)); ++@@ -879,16 +1050,16 @@ ++ } ++ ++ pc->m = DEFAULT_CHUNK_NO; ++- pc->chunks = calloc(pc->m, sizeof(pos_t)); +++ pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); ++ if ( !pc->chunks ) { ++ fprintf(stderr, "Error allocating memory\n"); ++ return; ++ } ++ ++- pc->chunks[0].from = pmin; ++- pc->chunks[0].to = pmax; +++ pc->chunks[0].beg = pmin; +++ pc->chunks[0].end = pmax; ++ pc->n = 1; ++- pc->first = first; +++ pc->first = order; ++ ++ kh_val(read_pairs, k) = pc; ++ stats->pair_count++; ++@@ -899,12 +1070,12 @@ ++ return; ++ } ++ ++- if ( first == pc->first ) { //chunk from an existing line +++ if ( order == pc->first ) { //chunk from an existing line ++ if ( pmin == -1 ) ++ return; ++ ++ if ( pc->n == pc->m ) { ++- pos_t *tmp = 
realloc(pc->chunks, (pc->m<<1)*sizeof(pos_t)); +++ hts_pair_pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(hts_pair_pos_t)); ++ if ( !tmp ) { ++ fprintf(stderr, "Error allocating memory\n"); ++ return; ++@@ -913,8 +1084,8 @@ ++ pc->m<<=1; ++ } ++ ++- pc->chunks[pc->n].from = pmin; ++- pc->chunks[pc->n].to = pmax; +++ pc->chunks[pc->n].beg = pmin; +++ pc->chunks[pc->n].end = pmax; ++ pc->n++; ++ } else { //the other line, check for overlapping ++ if ( pmin == -1 && kh_exist(read_pairs, k) ) { //job done, delete entry ++@@ -932,28 +1103,28 @@ ++ ++ int i; ++ for (i=0; in; i++) { ++- if ( pmin >= pc->chunks[i].to ) +++ if ( pmin >= pc->chunks[i].end ) ++ continue; ++ ++- if ( pmax <= pc->chunks[i].from ) //no overlap +++ if ( pmax <= pc->chunks[i].beg ) //no overlap ++ break; ++ ++- if ( pmin < pc->chunks[i].from ) { //overlap at the beginning ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].from-1); ++- pmin = pc->chunks[i].from; +++ if ( pmin < pc->chunks[i].beg ) { //overlap at the beginning +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].beg); +++ pmin = pc->chunks[i].beg; ++ } ++ ++- if ( pmax <= pc->chunks[i].to ) { //completely contained +++ if ( pmax <= pc->chunks[i].end ) { //completely contained ++ stats->nbases_mapped_cigar -= (pmax - pmin); ++ return; ++ } else { //overlap at the end ++- stats->nbases_mapped_cigar -= (pc->chunks[i].to - pmin); ++- pmin = pc->chunks[i].to; +++ stats->nbases_mapped_cigar -= (pc->chunks[i].end - pmin); +++ pmin = pc->chunks[i].end; ++ } ++ } ++ } ++ } ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ } ++ ++ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) ++@@ -998,15 +1169,17 @@ ++ stats->nreads_dup++; ++ } ++ +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? 
READ_ORDER_LAST : 0) : READ_ORDER_FIRST; +++ ++ int read_len = unclipped_length(bam_line); ++ if ( read_len >= stats->nbases ) ++ realloc_buffers(stats,read_len); ++ // Update max_len observed ++ if ( stats->max_lenmax_len = read_len; ++- if ( IS_READ1(bam_line) && stats->max_len_1st < read_len ) +++ if ( order == READ_ORDER_FIRST && stats->max_len_1st < read_len ) ++ stats->max_len_1st = read_len; ++- if ( IS_READ2(bam_line) && stats->max_len_2nd < read_len ) +++ if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) ++ stats->max_len_2nd = read_len; ++ ++ int i; ++@@ -1017,8 +1190,8 @@ ++ if ( IS_ORIGINAL(bam_line) ) ++ { ++ stats->read_lengths[read_len]++; ++- if ( IS_READ1(bam_line) ) stats->read_lengths_1st[read_len]++; ++- if ( IS_READ2(bam_line) ) stats->read_lengths_2nd[read_len]++; +++ if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; +++ if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; ++ collect_orig_read_stats(bam_line, stats, &gc_count); ++ } ++ ++@@ -1039,7 +1212,7 @@ ++ isize = stats->info->nisize; ++ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) ++ { ++- int pos_fst = bam_line->core.mpos - bam_line->core.pos; +++ hts_pos_t pos_fst = bam_line->core.mpos - bam_line->core.pos; ++ int is_fst = IS_READ1(bam_line) ? 1 : -1; ++ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; ++ int is_mfwd = IS_MATE_REVERSE(bam_line) ? 
-1 : 1; ++@@ -1075,7 +1248,7 @@ ++ if ( stats->regions ) ++ { ++ // Count only on-target bases ++- int iref = bam_line->core.pos + 1; +++ hts_pos_t iref = bam_line->core.pos + 1; ++ for (i=0; icore.n_cigar; i++) ++ { ++ int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]); ++@@ -1129,7 +1302,7 @@ ++ } ++ ++ if ( stats->last_pair_tid != bam_line->core.tid) { ++- stats->pair_count -= cleanup_overlaps(read_pairs, INT_MAX-1); +++ stats->pair_count -= cleanup_overlaps(read_pairs, INT64_MAX-1); ++ stats->last_pair_tid = bam_line->core.tid; ++ stats->last_read_flush = 0; ++ } ++@@ -1181,8 +1354,9 @@ ++ // Coverage distribution graph ++ round_buffer_flush(stats,bam_line->core.pos); ++ if ( stats->regions ) { ++- uint32_t p = bam_line->core.pos, pnew, pmin, pmax, j; ++- pmin = pmax = i = j = 0; +++ hts_pos_t p = bam_line->core.pos, pnew, pmin = 0, pmax = 0; +++ uint32_t j = 0; +++ i = 0; ++ while ( j < bam_line->core.n_cigar && i < stats->nchunks ) { ++ int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); ++ int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); ++@@ -1190,13 +1364,13 @@ ++ case BAM_CMATCH: ++ case BAM_CEQUAL: ++ case BAM_CDIFF: ++- pmin = MAX(p, stats->chunks[i].from-1); ++- pmax = MIN(p+oplen, stats->chunks[i].to); ++- if ( pmax >= pmin ) { +++ pmin = MAX(p, stats->chunks[i].beg-1); // 0 based +++ pmax = MIN(p+oplen, stats->chunks[i].end); // 1 based +++ if ( pmax > pmin ) { ++ if ( stats->info->remove_overlaps ) ++ remove_overlaps(bam_line, read_pairs, stats, pmin, pmax); ++ else ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ } ++ break; ++ case BAM_CDEL: ++@@ -1204,7 +1378,7 @@ ++ } ++ pnew = p + (bam_cigar_type(op)&2 ? 
oplen : 0); // consumes reference ++ ++- if ( pnew >= stats->chunks[i].to ) { +++ if ( pnew >= stats->chunks[i].end ) { ++ // go to the next chunk ++ i++; ++ } else { ++@@ -1214,7 +1388,8 @@ ++ } ++ } ++ } else { ++- uint32_t p = bam_line->core.pos, j; +++ hts_pos_t p = bam_line->core.pos; +++ uint32_t j; ++ for (j = 0; j < bam_line->core.n_cigar; j++) { ++ int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); ++ int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); ++@@ -1225,7 +1400,7 @@ ++ if ( stats->info->remove_overlaps ) ++ remove_overlaps(bam_line, read_pairs, stats, p, p+oplen); ++ else ++- round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen); ++ break; ++ case BAM_CDEL: ++ break; ++@@ -1234,7 +1409,7 @@ ++ } ++ } ++ if ( stats->info->remove_overlaps ) ++- remove_overlaps(bam_line, read_pairs, stats, -1, -1); //remove the line from the hash table +++ remove_overlaps(bam_line, read_pairs, stats, -1LL, -1LL); //remove the line from the hash table ++ } ++ } ++ ++@@ -1255,7 +1430,7 @@ ++ float n,d; ++ int k; ++ ++- n = p*(N+1)/100; +++ n = (float)p*(N+1)/100; ++ k = n; ++ if ( k<=0 ) ++ return gcd[0].depth; ++@@ -1320,9 +1495,9 @@ ++ fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); ++ fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); ++ fprintf(to, "# Summary Numbers. 
Use `grep ^SN | cut -f 2-` to extract this part.\n"); ++- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below) +++ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) ++ fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); ++- fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); +++ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); ++ fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); ++ fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); ++ fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); ++@@ -1344,7 +1519,7 @@ ++ fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); ++ fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); ++ fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); ++- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; +++ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; ++ fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); ++ fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); ++ fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? 
(float)stats->total_len_2nd/stats->nreads_2nd:0); ++@@ -1358,7 +1533,7 @@ ++ fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); ++ fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); ++ fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); ++- fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd):0); +++ fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); ++ if ( stats->target_count ) { ++ fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); ++ for (icov=stats->info->cov_threshold+1; icovncov; icov++) ++@@ -1439,11 +1614,18 @@ ++ 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); ++ ++ } +++ +++ uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; ++ fprintf(to, "# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); ++ for (ibase=0; ibasemax_len; ibase++) ++ { ++ acgtno_count_t *acgtno_count_1st = &(stats->acgtno_cycles_1st[ibase]); ++ uint64_t acgt_sum_1st = acgtno_count_1st->a + acgtno_count_1st->c + acgtno_count_1st->g + acgtno_count_1st->t; +++ tA += acgtno_count_1st->a; +++ tC += acgtno_count_1st->c; +++ tG += acgtno_count_1st->g; +++ tT += acgtno_count_1st->t; +++ tN += acgtno_count_1st->n; ++ ++ if ( acgt_sum_1st ) ++ fprintf(to, "FBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, ++@@ -1455,11 +1637,19 @@ ++ 100.*acgtno_count_1st->other/acgt_sum_1st); ++ ++ } +++ fprintf(to, "# ACGT raw counters for first fragments. 
Use `grep ^FTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); +++ fprintf(to, "FTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); +++ tA=0, tC=0, tG=0, tT=0, tN=0; ++ fprintf(to, "# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); ++ for (ibase=0; ibasemax_len; ibase++) ++ { ++ acgtno_count_t *acgtno_count_2nd = &(stats->acgtno_cycles_2nd[ibase]); ++ uint64_t acgt_sum_2nd = acgtno_count_2nd->a + acgtno_count_2nd->c + acgtno_count_2nd->g + acgtno_count_2nd->t; +++ tA += acgtno_count_2nd->a; +++ tC += acgtno_count_2nd->c; +++ tG += acgtno_count_2nd->g; +++ tT += acgtno_count_2nd->t; +++ tN += acgtno_count_2nd->n; ++ ++ if ( acgt_sum_2nd ) ++ fprintf(to, "LBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, ++@@ -1471,6 +1661,52 @@ ++ 100.*acgtno_count_2nd->other/acgt_sum_2nd); ++ ++ } +++ fprintf(to, "# ACGT raw counters for last fragments. Use `grep ^LTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); +++ fprintf(to, "LTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); +++ +++ int tag; +++ for (tag=0; tagntags; tag++) { +++ if (stats->tags_barcode[tag].nbases) { +++ fprintf(to, "# ACGT content per cycle for barcodes. Use `grep ^%sC | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N counts as a percentage of all A/C/G/T bases [%%]\n", +++ stats->tags_barcode[tag].tag_name); +++ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) +++ { +++ if (ibase == stats->tags_barcode[tag].tag_sep) +++ continue; +++ +++ acgtno_count_t *acgtno_count = stats->acgtno_barcode + stats->tags_barcode[tag].offset + ibase; +++ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; +++ +++ if ( acgt_sum ) +++ fprintf(to, "%sC%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", stats->tags_barcode[tag].tag_name, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep, +++ 100.*acgtno_count->a/acgt_sum, +++ 100.*acgtno_count->c/acgt_sum, +++ 100.*acgtno_count->g/acgt_sum, +++ 100.*acgtno_count->t/acgt_sum, +++ 100.*acgtno_count->n/acgt_sum); +++ } +++ +++ fprintf(to, "# Barcode Qualities. Use `grep ^%sQ | cut -f 2-` to extract this part.\n", stats->tags_barcode[tag].qual_name); +++ fprintf(to, "# Columns correspond to qualities and rows to barcode cycles. First column is the cycle number.\n"); +++ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) +++ { +++ if (ibase == stats->tags_barcode[tag].tag_sep) +++ continue; +++ +++ fprintf(to, "%sQ%d\t%d", stats->tags_barcode[tag].qual_name, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 
ibase+1 : ibase-stats->tags_barcode[tag].tag_sep); +++ for (iqual=0; iqual<=stats->tags_barcode[tag].max_qual; iqual++) +++ { +++ fprintf(to, "\t%ld", (long)stats->quals_barcode[(stats->tags_barcode[tag].offset + ibase)*stats->nquals+iqual]); +++ } +++ fprintf(to, "\n"); +++ } +++ } +++ } +++ ++ fprintf(to, "# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); ++ for (isize=0; isizeisize->inward(stats->isize->data, isize)); ++@@ -1564,14 +1800,15 @@ ++ } ++ } ++ ++-void init_regions(stats_t *stats, const char *file) +++static void init_regions(stats_t *stats, const char *file) ++ { ++ FILE *fp = fopen(file,"r"); ++ if ( !fp ) error("%s: %s\n",file,strerror(errno)); ++ ++ kstring_t line = { 0, 0, NULL }; ++ int warned = 0, r, p, new_p; ++- int prev_tid=-1, prev_pos=-1; +++ int prev_tid=-1; +++ hts_pos_t prev_pos=-1LL; ++ while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) ++ { ++ if ( line.s[0] == '#' ) continue; ++@@ -1592,30 +1829,33 @@ ++ ++ if ( tid >= stats->nregions ) ++ { ++- stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); +++ if(!(stats->regions = realloc(stats->regions,sizeof(regions_t)*(tid+REG_INC)))) +++ error("Could not allocate memory for region.\n"); +++ ++ int j; ++- for (j=stats->nregions; jnregions+100; j++) +++ for (j=stats->nregions; jregions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; ++ stats->regions[j].pos = NULL; ++ } ++- stats->nregions += 100; +++ stats->nregions = tid+REG_INC; ++ } ++ int npos = stats->regions[tid].npos; ++ if ( npos >= stats->regions[tid].mpos ) ++ { ++- stats->regions[tid].mpos += 1000; ++- stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); +++ stats->regions[tid].mpos = npos+POS_INC; +++ if (!(stats->regions[tid].pos = realloc(stats->regions[tid].pos, 
sizeof(hts_pair_pos_t)*stats->regions[tid].mpos))) +++ error("Could not allocate memory for interval.\n"); ++ } ++ ++- if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); +++ if ( (sscanf(&line.s[i+1],"%"SCNd64" %"SCNd64, &stats->regions[tid].pos[npos].beg, &stats->regions[tid].pos[npos].end))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); ++ if ( prev_tid==-1 || prev_tid!=tid ) ++ { ++ prev_tid = tid; ++- prev_pos = stats->regions[tid].pos[npos].from; +++ prev_pos = stats->regions[tid].pos[npos].beg; ++ } ++- if ( prev_pos>stats->regions[tid].pos[npos].from ) ++- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos); +++ if ( prev_pos>stats->regions[tid].pos[npos].beg ) +++ error("The positions are not in chromosomal order (%s:%"PRIhts_pos" comes after %"PRIhts_pos")\n", line.s, stats->regions[tid].pos[npos].beg, prev_pos); ++ stats->regions[tid].npos++; ++ if ( stats->regions[tid].npos > stats->nchunks ) ++ stats->nchunks = stats->regions[tid].npos; ++@@ -1628,20 +1868,21 @@ ++ for (r = 0; r < stats->nregions; r++) { ++ regions_t *reg = &stats->regions[r]; ++ if ( reg->npos > 1 ) { ++- qsort(reg->pos, reg->npos, sizeof(pos_t), regions_lt); +++ qsort(reg->pos, reg->npos, sizeof(hts_pair_pos_t), regions_lt); ++ for (new_p = 0, p = 1; p < reg->npos; p++) { ++- if ( reg->pos[new_p].to < reg->pos[p].from ) +++ if ( reg->pos[new_p].end < reg->pos[p].beg ) ++ reg->pos[++new_p] = reg->pos[p]; ++- else if ( reg->pos[new_p].to < reg->pos[p].to ) ++- reg->pos[new_p].to = reg->pos[p].to; +++ else if ( reg->pos[new_p].end < reg->pos[p].end ) +++ reg->pos[new_p].end = reg->pos[p].end; ++ } ++ reg->npos = ++new_p; ++ } ++ for (p = 0; p < reg->npos; p++) ++- stats->target_count += (reg->pos[p].to - reg->pos[p].from + 1); +++ stats->target_count += (reg->pos[p].end - 
reg->pos[p].beg + 1); ++ } ++ ++- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); +++ if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) +++ error("Could not allocate memory for chunk.\n"); ++ } ++ ++ void destroy_regions(stats_t *stats) ++@@ -1676,22 +1917,22 @@ ++ // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, ++ // even small overlap is enough to include the read in the stats. ++ int i = reg->cpos; ++- while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; +++ while ( inpos && reg->pos[i].end<=bam_line->core.pos ) i++; ++ if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } ++ int64_t endpos = bam_endpos(bam_line); ++- if ( endpos < reg->pos[i].from ) return 0; +++ if ( endpos < reg->pos[i].beg ) return 0; ++ ++ //found a read overlapping a region ++ reg->cpos = i; ++- stats->reg_from = reg->pos[i].from; ++- stats->reg_to = reg->pos[i].to; +++ stats->reg_from = reg->pos[i].beg; +++ stats->reg_to = reg->pos[i].end; ++ ++ //now find all the overlapping chunks ++ stats->nchunks = 0; ++ while (i < reg->npos) { ++- if (bam_line->core.pos < reg->pos[i].to && endpos >= reg->pos[i].from) { ++- stats->chunks[stats->nchunks].from = MAX(bam_line->core.pos+1, reg->pos[i].from); ++- stats->chunks[stats->nchunks].to = MIN(endpos, reg->pos[i].to); +++ if (bam_line->core.pos < reg->pos[i].end && endpos >= reg->pos[i].beg) { +++ stats->chunks[stats->nchunks].beg = MAX(bam_line->core.pos+1, reg->pos[i].beg); +++ stats->chunks[stats->nchunks].end = MIN(endpos, reg->pos[i].end); ++ stats->nchunks++; ++ } ++ i++; ++@@ -1707,7 +1948,7 @@ ++ int i, j, tid; ++ stats->nregions = iter->n_reg; ++ stats->regions = calloc(stats->nregions, sizeof(regions_t)); ++- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); +++ stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)); ++ if ( !stats->regions || !stats->chunks ) ++ return 1; ++ ++@@ -1727,15 +1968,15 @@ ++ } ++ ++ 
stats->regions[tid].mpos = stats->regions[tid].npos = iter->reg_list[i].count; ++- stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(pos_t)); +++ stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(hts_pair_pos_t)); ++ if ( !stats->regions[tid].pos ) ++ return 1; ++ ++ for (j = 0; j < stats->regions[tid].npos; j++) { ++- stats->regions[tid].pos[j].from = iter->reg_list[i].intervals[j].beg+1; ++- stats->regions[tid].pos[j].to = iter->reg_list[i].intervals[j].end; +++ stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; +++ stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; ++ ++- stats->target_count += (stats->regions[tid].pos[j].to - stats->regions[tid].pos[j].from + 1); +++ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); ++ } ++ } ++ ++@@ -1773,7 +2014,7 @@ ++ } ++ ++ ++-static void error(const char *format, ...) +++static void HTS_NORETURN error(const char *format, ...) ++ { ++ if ( !format ) ++ { ++@@ -1783,13 +2024,14 @@ ++ printf("Options:\n"); ++ printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); ++ printf(" -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); +++ printf(" -X, --customized-index-file Use a customized index file\n"); ++ printf(" -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); ++ printf(" -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); ++ printf(" --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); ++ printf(" -h, --help This help message\n"); ++ printf(" -i, --insert-size Maximum insert size [8000]\n"); ++ printf(" -I, --id Include only listed read group or sample name\n"); ++- printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); +++ printf(" -l, --read-length Include in the statistics only reads with the given read length [-1]\n"); ++ printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); ++ printf(" -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); ++ printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); ++@@ -1799,8 +2041,8 @@ ++ printf(" -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); ++ printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); ++ printf(" -p, --remove-overlaps Remove overlaps of paired-end reads from coverage and base count computations.\n"); ++- printf(" -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation.\n"); ++- sam_global_opt_help(stdout, "-.--.@"); +++ printf(" -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation [0]\n"); +++ sam_global_opt_help(stdout, "-.--.@-."); ++ printf("\n"); ++ } ++ else ++@@ -1840,6 +2082,9 @@ ++ free(stats->ins_cycles_2nd); ++ free(stats->del_cycles_1st); ++ free(stats->del_cycles_2nd); +++ if (stats->acgtno_barcode) free(stats->acgtno_barcode); +++ if (stats->quals_barcode) free(stats->quals_barcode); +++ free(stats->tags_barcode); ++ destroy_regions(stats); ++ if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); ++ free(stats->split_name); ++@@ -1878,6 +2123,9 @@ ++ ++ void destroy_split_stats(khash_t(c2stats) 
*split_hash) ++ { +++ if (!split_hash) +++ return; +++ ++ int i = 0; ++ stats_t *curr_stats = NULL; ++ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){ ++@@ -1891,6 +2139,10 @@ ++ stats_info_t* stats_info_init(int argc, char *argv[]) ++ { ++ stats_info_t* info = calloc(1, sizeof(stats_info_t)); +++ if (!info) { +++ return NULL; +++ } +++ ++ info->nisize = 8000; ++ info->isize_main_bulk = 0.99; // There are always outliers at the far end ++ info->gcd_bin_size = 20e3; ++@@ -1926,11 +2178,15 @@ ++ stats_t* stats_init() ++ { ++ stats_t *stats = calloc(1,sizeof(stats_t)); +++ if (!stats) +++ return NULL; +++ ++ stats->ngc = 200; ++ stats->nquals = 256; ++ stats->nbases = 300; ++ stats->rseq_pos = -1; ++- stats->tid = stats->gcd_pos = -1; +++ stats->tid = -1; +++ stats->gcd_pos = -1LL; ++ stats->igcd = 0; ++ stats->is_sorted = 1; ++ stats->nindels = stats->nbases; ++@@ -1944,6 +2200,18 @@ ++ return stats; ++ } ++ +++static int init_barcode_tags(stats_t* stats) { +++ stats->ntags = 4; +++ stats->tags_barcode = calloc(stats->ntags, sizeof(barcode_info_t)); +++ if (!stats->tags_barcode) +++ return -1; +++ stats->tags_barcode[0] = (barcode_info_t){"BC", "QT", 0, -1, -1, 0}; +++ stats->tags_barcode[1] = (barcode_info_t){"CR", "CY", 0, -1, -1, 0}; +++ stats->tags_barcode[2] = (barcode_info_t){"OX", "BZ", 0, -1, -1, 0}; +++ stats->tags_barcode[3] = (barcode_info_t){"RX", "QX", 0, -1, -1, 0}; +++ return 0; +++} +++ ++ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets) ++ { ++ // Give stats_t a pointer to the info struct ++@@ -1961,32 +2229,60 @@ ++ stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step; ++ info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1; ++ stats->cov = calloc(sizeof(uint64_t),stats->ncov); +++ if (!stats->cov) goto nomem; ++ stats->cov_rbuf.size = stats->nbases*5; ++ stats->cov_rbuf.buffer = 
calloc(sizeof(int32_t),stats->cov_rbuf.size); ++- +++ if (!stats->cov_rbuf.buffer) goto nomem; ++ if ( group_id ) init_group_id(stats, group_id); ++ // .. arrays ++ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->quals_1st) goto nomem; ++ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->quals_2nd) goto nomem; ++ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); +++ if (!stats->gc_1st) goto nomem; ++ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); +++ if (!stats->gc_2nd) goto nomem; ++ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); +++ if (!stats->isize) goto nomem; ++ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); ++- stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; +++ if (!stats->gcd) goto nomem; +++ if (info->fai) { +++ stats->mpc_buf = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->mpc_buf) goto nomem; +++ } else { +++ stats->mpc_buf = NULL; +++ } ++ stats->acgtno_cycles_1st = calloc(stats->nbases,sizeof(acgtno_count_t)); +++ if (!stats->acgtno_cycles_1st) goto nomem; ++ stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); +++ if (!stats->acgtno_cycles_2nd) goto nomem; ++ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths) goto nomem; ++ stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths_1st) goto nomem; ++ stats->read_lengths_2nd = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths_2nd) goto nomem; ++ stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->insertions) goto nomem; ++ stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->deletions) goto nomem; ++ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->ins_cycles_1st) goto nomem; ++ stats->ins_cycles_2nd = 
calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->ins_cycles_2nd) goto nomem; ++ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->del_cycles_1st) goto nomem; ++ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->del_cycles_2nd) goto nomem; +++ if (init_barcode_tags(stats) < 0) +++ goto nomem; ++ realloc_rseq_buffer(stats); ++ if ( targets ) ++ init_regions(stats, targets); +++ return; +++ nomem: +++ error("Out of memory"); ++ } ++ ++ static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets) ++@@ -2002,6 +2298,9 @@ ++ khiter_t k = kh_get(c2stats, split_hash, split_name); ++ if(k == kh_end(split_hash)){ ++ curr_stats = stats_init(); // mallocs new instance +++ if (!curr_stats) { +++ error("Couldn't allocate split stats"); +++ } ++ init_stat_structs(curr_stats, info, NULL, targets); ++ curr_stats->split_name = split_name; ++ ++@@ -2024,11 +2323,16 @@ ++ { ++ char *targets = NULL; ++ char *bam_fname = NULL; +++ char *bam_idx_fname = NULL; ++ char *group_id = NULL; ++- int sparse = 0; +++ int sparse = 0, has_index_file = 0, ret = 1; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ stats_info_t *info = stats_info_init(argc, argv); +++ if (!info) { +++ fprintf(stderr, "Could not allocate memory for info.\n"); +++ return 1; +++ } ++ ++ static const struct option loptions[] = ++ { ++@@ -2036,6 +2340,7 @@ ++ {"help", no_argument, NULL, 'h'}, ++ {"remove-dups", no_argument, NULL, 'd'}, ++ {"sam", no_argument, NULL, 's'}, +++ {"customized-index-file", required_argument, NULL, 'X'}, ++ {"ref-seq", required_argument, NULL, 'r'}, ++ {"coverage", required_argument, NULL, 'c'}, ++ {"read-length", required_argument, NULL, 'l'}, ++@@ -2056,13 +2361,14 @@ ++ }; ++ int opt; ++ ++- while ( (opt=getopt_long(argc,argv,"?hdsxpr:c:l:i:t:m:q:f:F:g:I:1:S:P:@:",loptions,NULL))>0 ) +++ while ( 
(opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 ) ++ { ++ switch (opt) ++ { ++ case 'f': info->flag_require = bam_str2flag(optarg); break; ++ case 'F': info->flag_filter |= bam_str2flag(optarg); break; ++ case 'd': info->flag_filter |= BAM_FDUP; break; +++ case 'X': has_index_file = 1; break; ++ case 's': break; ++ case 'r': info->fai = fai_load(optarg); ++ if (info->fai==NULL) ++@@ -2088,15 +2394,15 @@ ++ break; ++ case '?': ++ case 'h': error(NULL); +++ /* no break */ ++ default: ++ if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0) ++ error("Unknown argument: %s\n", optarg); ++ break; ++ } ++ } ++- if ( optind 0) ++ hts_set_threads(info->sam, ga.nthreads); ++ ++ stats_t *all_stats = stats_init(); +++ if (!all_stats) { +++ fprintf(stderr, "Could not allocate memory for stats.\n"); +++ cleanup_stats_info(info); +++ return 1; +++ } ++ stats_t *curr_stats = NULL; ++ init_stat_structs(all_stats, info, group_id, targets); ++ // Init ++ // .. hash ++ khash_t(c2stats)* split_hash = kh_init(c2stats); +++ if (!split_hash) goto cleanup_all_stats; ++ ++ khash_t(qn2pair)* read_pairs = kh_init(qn2pair); +++ if (!read_pairs) goto cleanup_split_hash; ++ ++ // Collect statistics ++ bam1_t *bam_line = bam_init1(); ++- if ( optindsam,bam_fname); ++- if (bam_idx) { ++- ++- int regcount = 0; ++- hts_reglist_t *reglist = bed_reglist(region_hash, ALL, ®count); ++- if (reglist) { ++- ++- hts_itr_multi_t *iter = sam_itr_regions(bam_idx, info->sam_header, reglist, regcount); ++- if (iter) { ++- ++- if (!targets) { ++- all_stats->nchunks = argc-optind; ++- if ( replicate_regions(all_stats, iter) ) ++- fprintf(stderr, "Replications of the regions failed."); ++- } +++ if (!bam_line) goto cleanup_read_pairs; +++ +++ if (optind < argc) { +++ // Region:interval arguments in the command line +++ hts_idx_t *bam_idx = NULL; +++ if (has_index_file) { +++ bam_idx = sam_index_load2(info->sam, bam_fname, bam_idx_fname); +++ } else { +++ // If an index 
filename has not been specified, look alongside the alignment file +++ bam_idx = sam_index_load(info->sam, bam_fname); +++ } +++ +++ if (bam_idx) { +++ hts_itr_multi_t *iter = sam_itr_regarray(bam_idx, info->sam_header, &argv[optind], argc - optind); +++ if (iter) { +++ if (!targets) { +++ all_stats->nchunks = argc-optind; +++ if (replicate_regions(all_stats, iter)) +++ fprintf(stderr, "Replications of the regions failed\n"); +++ } ++ ++- if ( all_stats->nregions && all_stats->regions ) { ++- while (sam_itr_multi_next(info->sam, iter, bam_line) >= 0) { ++- if (info->split_tag) { ++- curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); ++- collect_stats(bam_line, curr_stats, read_pairs); ++- } ++- collect_stats(bam_line, all_stats, read_pairs); ++- } +++ if ( all_stats->nregions && all_stats->regions ) { +++ while ((ret = sam_itr_next(info->sam, iter, bam_line)) >= 0) { +++ if (info->split_tag) { +++ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); +++ collect_stats(bam_line, curr_stats, read_pairs); ++ } +++ collect_stats(bam_line, all_stats, read_pairs); +++ } ++ +++ if (ret < -1) { +++ fprintf(stderr, "Failure while running the iterator\n"); ++ hts_itr_multi_destroy(iter); ++- } else { ++- fprintf(stderr, "Creation of the region iterator failed."); ++- hts_reglist_free(reglist, regcount); +++ hts_idx_destroy(bam_idx); +++ goto cleanup; ++ } ++- } else { ++- fprintf(stderr, "Creation of the region list failed."); ++ } ++- ++- hts_idx_destroy(bam_idx); +++ hts_itr_multi_destroy(iter); ++ } else { ++- fprintf(stderr, "Random alignment retrieval only works for indexed BAM files.\n"); +++ fprintf(stderr, "Multi-region iterator could not be created\n"); +++ hts_idx_destroy(bam_idx); +++ goto cleanup; ++ } ++- ++- bed_destroy(region_hash); +++ hts_idx_destroy(bam_idx); ++ } else { ++- fprintf(stderr, "Creation of the region hash table failed.\n"); +++ if (has_index_file) +++ fprintf(stderr, "Invalid index file '%s'\n", 
bam_idx_fname); +++ fprintf(stderr, "Random alignment retrieval only works for indexed files\n"); +++ goto cleanup; ++ } ++- } ++- else ++- { +++ } else { ++ if ( info->cov_threshold > 0 && !targets ) { ++- fprintf(stderr, "Coverage percentage calcuation requires a list of target regions\n"); +++ fprintf(stderr, "Coverage percentage calculation requires a list of target regions\n"); ++ goto cleanup; ++ } ++ ++ // Stream through the entire BAM ignoring off-target regions if -t is given ++- int ret; ++ while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) { ++ if (info->split_tag) { ++ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); ++@@ -2194,7 +2509,7 @@ ++ ++ if (ret < -1) { ++ fprintf(stderr, "Failure while decoding file\n"); ++- return 1; +++ goto cleanup; ++ } ++ } ++ ++@@ -2203,15 +2518,19 @@ ++ if (info->split_tag) ++ output_split_stats(split_hash, bam_fname, sparse); ++ +++ ret = 0; ++ cleanup: ++ bam_destroy1(bam_line); ++- bam_hdr_destroy(info->sam_header); +++ sam_hdr_destroy(info->sam_header); ++ sam_global_args_free(&ga); ++ +++cleanup_read_pairs: +++ cleanup_overlaps(read_pairs, INT64_MAX); +++cleanup_split_hash: +++ destroy_split_stats(split_hash); +++cleanup_all_stats: ++ cleanup_stats(all_stats); ++ cleanup_stats_info(info); ++- destroy_split_stats(split_hash); ++- cleanup_overlaps(read_pairs, INT_MAX); ++ ++- return 0; +++ return ret; ++ } ++--- python-pysam.orig/samtools/stats.c.pysam.c +++++ python-pysam/samtools/stats.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. ++ ++- Copyright (C) 2012-2015 Genome Research Ltd. +++ Copyright (C) 2012-2019 Genome Research Ltd. 
++ ++ Author: Petr Danecek ++ Author: Sam Nicholls ++@@ -48,6 +48,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -55,7 +56,7 @@ ++ #include ++ #include ++ #include ++-#include "sam_header.h" +++#include ++ #include ++ #include "samtools.h" ++ #include ++@@ -67,8 +68,10 @@ ++ #define BWA_MIN_RDLEN 35 ++ #define DEFAULT_CHUNK_NO 8 ++ #define DEFAULT_PAIR_MAX 10000 +++#define ERROR_LIMIT 200 ++ // From the spec ++ // If 0x4 is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, bits 0x2, 0x10, 0x100 and 0x800, and the bit 0x20 of the previous read in the template. +++#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) ++ #define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) ++ #define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) ++ #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) ++@@ -79,6 +82,14 @@ ++ #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) ++ #define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0) ++ +++#define READ_ORDER_NONE 0 +++#define READ_ORDER_FIRST 1 +++#define READ_ORDER_LAST 2 +++#define READ_ORDER_MIDDLE 3 +++ +++#define REG_INC 100 +++#define POS_INC 1000 +++ ++ // The GC-depth graph works as follows: split the reference sequence into ++ // segments and calculate GC content and depth in each bin. 
Then sort ++ // these segments by their GC and plot the depth distribution by means ++@@ -93,17 +104,16 @@ ++ // For coverage distribution, a simple pileup ++ typedef struct ++ { ++- int64_t pos; +++ hts_pos_t pos; ++ int size, start; ++ int *buffer; ++ } ++ round_buffer_t; ++ ++-typedef struct { uint32_t from, to; } pos_t; ++ typedef struct ++ { ++- int npos,mpos,cpos; ++- pos_t *pos; +++ int npos, mpos, cpos; +++ hts_pair_pos_t *pos; ++ } ++ regions_t; ++ ++@@ -120,6 +130,17 @@ ++ ++ typedef struct ++ { +++ char tag_name[3]; +++ char qual_name[3]; +++ uint32_t nbases; +++ int32_t tag_sep; // Index of the separator (if present) +++ int32_t max_qual; +++ uint32_t offset; // Where the tag stats info is located in the allocated memory +++} +++barcode_info_t; +++ +++typedef struct +++{ ++ // Auxiliary data ++ int flag_require, flag_filter; ++ faidx_t *fai; // Reference sequence for GC-depth graph ++@@ -131,7 +152,7 @@ ++ float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part ++ int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins ++ samFile* sam; ++- bam_hdr_t* sam_header; +++ sam_hdr_t* sam_header; ++ ++ // Filters ++ int filter_readlen; ++@@ -177,6 +198,7 @@ ++ uint64_t total_len_dup; ++ uint64_t nreads_1st; ++ uint64_t nreads_2nd; +++ uint64_t nreads_other; ++ uint64_t nreads_filtered; ++ uint64_t nreads_dup; ++ uint64_t nreads_unmapped; ++@@ -198,8 +220,8 @@ ++ // GC-depth related data ++ uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin ++ gc_depth_t *gcd; // The GC-depth bins holder ++- int32_t tid, gcd_pos; // Position of the current bin ++- int32_t pos; // Position of the last read +++ int32_t tid; // Position of the current bin +++ hts_pos_t gcd_pos, pos; // Position of the last read ++ ++ // Coverage distribution related data ++ int ncov; // The number of coverage bins ++@@ -209,12 +231,13 @@ ++ // Mismatches by read cycle ++ 
uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against ++ int mrseq_buf; // The size of the buffer ++- int32_t rseq_pos; // The coordinate of the first base in the buffer ++- int32_t nrseq_buf; // The used part of the buffer +++ hts_pos_t rseq_pos; // The coordinate of the first base in the buffer +++ int64_t nrseq_buf; // The used part of the buffer ++ uint64_t *mpc_buf; // Mismatches per cycle ++ ++ // Target regions ++- int nregions, reg_from, reg_to; +++ int nregions; +++ hts_pos_t reg_from, reg_to; ++ regions_t *regions; ++ ++ // Auxiliary data ++@@ -225,13 +248,20 @@ ++ char* split_name; ++ ++ stats_info_t* info; // Pointer to options and settings struct ++- pos_t *chunks; +++ hts_pair_pos_t *chunks; ++ uint32_t nchunks; ++ ++ uint32_t pair_count; // Number of active pairs in the pairing hash table ++ uint32_t target_count; // Number of bases covered by the target file ++ uint32_t last_pair_tid; ++ uint32_t last_read_flush; +++ +++ // Barcode statistics +++ acgtno_count_t *acgtno_barcode; +++ uint64_t *quals_barcode; +++ barcode_info_t *tags_barcode; +++ uint32_t ntags; +++ uint32_t error_number; ++ } ++ stats_t; ++ KHASH_MAP_INIT_STR(c2stats, stats_t*) ++@@ -239,18 +269,18 @@ ++ typedef struct { ++ uint32_t first; // 1 - first read, 2 - second read ++ uint32_t n, m; // number of chunks, allocated chunks ++- pos_t *chunks; // chunk array of size m +++ hts_pair_pos_t *chunks; // chunk array of size m ++ } pair_t; ++ KHASH_MAP_INIT_STR(qn2pair, pair_t*) ++ ++ ++-static void error(const char *format, ...); +++static void HTS_NORETURN error(const char *format, ...); ++ int is_in_regions(bam1_t *bam_line, stats_t *stats); ++ void realloc_buffers(stats_t *stats, int seq_len); ++ ++ static int regions_lt(const void *r1, const void *r2) { ++- int64_t from_diff = (int64_t)((pos_t *)r1)->from - (int64_t)((pos_t *)r2)->from; ++- int64_t to_diff = (int64_t)((pos_t *)r1)->to - (int64_t)((pos_t *)r2)->to; +++ int64_t from_diff = 
((hts_pair_pos_t *)r1)->beg - ((hts_pair_pos_t *)r2)->beg; +++ int64_t to_diff = ((hts_pair_pos_t *)r1)->end - ((hts_pair_pos_t *)r2)->end; ++ ++ return from_diff > 0 ? 1 : from_diff < 0 ? -1 : to_diff > 0 ? 1 : to_diff < 0 ? -1 : 0; ++ } ++@@ -267,19 +297,19 @@ ++ return 1 + (depth - min) / step; ++ } ++ ++-static inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) +++static inline int round_buffer_lidx2ridx(int offset, int size, hts_pos_t refpos, hts_pos_t pos) ++ { ++ return (offset + (pos-refpos) % size) % size; ++ } ++ ++-void round_buffer_flush(stats_t *stats, int64_t pos) +++void round_buffer_flush(stats_t *stats, hts_pos_t pos) ++ { ++ int ibuf,idp; ++ ++ if ( pos==stats->cov_rbuf.pos ) ++ return; ++ ++- int64_t new_pos = pos; +++ hts_pos_t new_pos = pos; ++ if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) ++ { ++ // Flush the whole buffer, but in sequential order, ++@@ -287,10 +317,10 @@ ++ } ++ ++ if ( pos < stats->cov_rbuf.pos ) ++- error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); +++ error("Expected coordinates in ascending order, got %"PRIhts_pos" after %"PRIhts_pos"\n", pos, stats->cov_rbuf.pos); ++ ++ int ifrom = stats->cov_rbuf.start; ++- int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); +++ int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos-1); ++ if ( ifrom>ito ) ++ { ++ for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) ++@@ -311,27 +341,30 @@ ++ stats->cov[idp]++; ++ stats->cov_rbuf.buffer[ibuf] = 0; ++ } ++- stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); +++ stats->cov_rbuf.start = (new_pos==-1) ? 
0 : round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos); ++ stats->cov_rbuf.pos = new_pos; ++ } ++ ++-void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) +++/** +++ * [from, to) - 0 based half-open +++ */ +++static void round_buffer_insert_read(round_buffer_t *rbuf, hts_pos_t from, hts_pos_t to) ++ { ++- if ( to-from >= rbuf->size ) ++- error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); +++ if ( to-from > rbuf->size ) +++ error("The read length too big (%"PRIhts_pos"), please increase the buffer length (currently %d)\n", to-from, rbuf->size); ++ if ( from < rbuf->pos ) ++- error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); +++ error("The reads are not sorted (%"PRIhts_pos" comes after %"PRIhts_pos").\n", from, rbuf->pos); ++ ++- int ifrom,ito,ibuf; ++- ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); ++- ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); +++ int ifrom, ito, ibuf; +++ ifrom = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, from); +++ ito = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, to); ++ if ( ifrom>ito ) ++ { ++ for (ibuf=ifrom; ibufsize; ibuf++) ++ rbuf->buffer[ibuf]++; ++ ifrom = 0; ++ } ++- for (ibuf=ifrom; ibuf<=ito; ibuf++) +++ for (ibuf=ifrom; ibufbuffer[ibuf]++; ++ } ++ ++@@ -364,7 +397,7 @@ ++ void count_indels(stats_t *stats,bam1_t *bam_line) ++ { ++ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; ++- int is_1st = IS_READ1(bam_line) ? 1 : 0; +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; ++ int icig; ++ int icycle = 0; ++ int read_len = bam_line->core.l_qseq; ++@@ -379,10 +412,10 @@ ++ int idx = is_fwd ? 
icycle : read_len-icycle-ncig; ++ if ( idx<0 ) ++ error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); ++- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); ++- if ( is_1st ) +++ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%"PRIhts_pos" %s\n", idx, stats->nbases, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); +++ if ( order == READ_ORDER_FIRST ) ++ stats->ins_cycles_1st[idx]++; ++- else +++ if ( order == READ_ORDER_LAST ) ++ stats->ins_cycles_2nd[idx]++; ++ icycle += ncig; ++ if ( ncig<=stats->nindels ) ++@@ -394,9 +427,9 @@ ++ int idx = is_fwd ? icycle-1 : read_len-icycle-1; ++ if ( idx<0 ) continue; // discard meaningless deletions ++ if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); ++- if ( is_1st ) +++ if ( order == READ_ORDER_FIRST ) ++ stats->del_cycles_1st[idx]++; ++- else +++ if ( order == READ_ORDER_LAST ) ++ stats->del_cycles_2nd[idx]++; ++ if ( ncig<=stats->nindels ) ++ stats->deletions[ncig-1]++; ++@@ -422,8 +455,8 @@ ++ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) ++ { ++ int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; ++- int icig,iread=0,icycle=0; ++- int iref = bam_line->core.pos - stats->rseq_pos; +++ int icig, iread=0, icycle=0; +++ hts_pos_t iref = bam_line->core.pos - stats->rseq_pos; ++ uint8_t *read = bam_get_seq(bam_line); ++ uint8_t *quals = bam_get_qual(bam_line); ++ uint64_t *mpc_buf = stats->mpc_buf; ++@@ -456,13 +489,13 @@ ++ continue; ++ } ++ // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large ++- // chunk of refseq in memory. Not very frequent and not noticable in the stats. +++ // chunk of refseq in memory. Not very frequent and not noticeable in the stats. 
++ if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue; ++ if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs ++- error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("TODO: cigar %d, %s:%"PRIhts_pos" %s\n", cig, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ if ( ncig+iref > stats->nrseq_buf ) ++- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1); +++ error("FIXME: %d+%"PRIhts_pos" > %"PRId64", %s, %s:%"PRIhts_pos"\n", ncig, iref, stats->nrseq_buf, bam_get_qname(bam_line), sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1); ++ ++ int im; ++ for (im=0; im=stats->nquals ) ++- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ int idx = is_fwd ? 
icycle : read_len-icycle-1; ++ if ( idx>stats->max_len ) ++- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); +++ error("mpc: %d>%d (%s %"PRIhts_pos" %s)\n", idx, stats->max_len, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); ++ ++ idx = idx*stats->nquals + qual; ++ if ( idx>=stats->nquals*stats->nbases ) ++@@ -505,11 +538,12 @@ ++ } ++ } ++ ++-void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos) +++void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) ++ { ++- int i, fai_ref_len; ++- char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len); ++- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]); +++ int i; +++ hts_pos_t fai_ref_len; +++ char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); +++ if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); ++ ++ uint8_t *ptr = stats->rseq_buf; ++ for (i=0; itid = tid; ++ } ++ ++-float fai_gc_content(stats_t *stats, int pos, int len) +++float fai_gc_content(stats_t *stats, hts_pos_t pos, int len) ++ { ++ uint32_t gc,count,c; ++- int i = pos - stats->rseq_pos, ito = i + len; +++ hts_pos_t i = pos - stats->rseq_pos, ito = i + len; ++ assert( i>=0 ); ++ ++ if ( ito > stats->nrseq_buf ) ito = stats->nrseq_buf; ++@@ -570,6 +604,9 @@ ++ if ( stats->mrseq_bufrseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); +++ if (!stats->rseq_buf) { +++ error("Could not reallocate reference sequence buffer"); +++ } ++ stats->mrseq_buf = n; ++ } ++ } ++@@ -661,6 +698,9 @@ ++ ++ // Realloc the coverage distribution buffer ++ int *rbuffer = calloc(sizeof(int),seq_len*5); +++ 
if (!rbuffer) { +++ error("Could not allocate coverage distribution buffer"); +++ } ++ n = stats->cov_rbuf.size-stats->cov_rbuf.start; ++ memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); ++ if ( stats->cov_rbuf.start>1 ) ++@@ -690,6 +730,119 @@ ++ stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); ++ } ++ +++// Collect statistics about the barcode tags specified by init_barcode_tags method +++static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) { +++ uint32_t nbases, tag, i; +++ acgtno_count_t *acgtno; +++ uint64_t *quals; +++ int32_t *separator, *maxqual; +++ +++ for (tag = 0; tag < stats->ntags; tag++) { +++ const char *barcode_tag = stats->tags_barcode[tag].tag_name, *qual_tag = stats->tags_barcode[tag].qual_name; +++ uint8_t* bc = bam_aux_get(bam_line, barcode_tag); +++ if (!bc) +++ continue; +++ +++ char* barcode = bam_aux2Z(bc); +++ if (!barcode) +++ continue; +++ +++ uint32_t barcode_len = strlen(barcode); +++ if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time +++ uint32_t offset = 0; +++ for (i = 0; i < stats->ntags; i++) +++ offset += stats->tags_barcode[i].nbases; +++ +++ stats->tags_barcode[tag].offset = offset; +++ stats->tags_barcode[tag].nbases = barcode_len; +++ stats->acgtno_barcode = realloc(stats->acgtno_barcode, (offset + barcode_len) * sizeof(acgtno_count_t)); +++ stats->quals_barcode = realloc(stats->quals_barcode, (offset + barcode_len) * stats->nquals * sizeof(uint64_t)); +++ +++ if (!stats->acgtno_barcode || !stats->quals_barcode) +++ error("Error allocating memory. 
Aborting!\n"); +++ +++ memset(stats->acgtno_barcode + offset, 0, barcode_len*sizeof(acgtno_count_t)); +++ memset(stats->quals_barcode + offset*stats->nquals, 0, barcode_len*stats->nquals*sizeof(uint64_t)); +++ } +++ +++ nbases = stats->tags_barcode[tag].nbases; +++ if (barcode_len > nbases) { +++ fprintf(samtools_stderr, "Barcodes with tag %s differ in length at sequence '%s'\n", barcode_tag, bam_get_qname(bam_line)); +++ continue; +++ } +++ +++ acgtno = stats->acgtno_barcode + stats->tags_barcode[tag].offset; +++ quals = stats->quals_barcode + stats->tags_barcode[tag].offset*stats->nquals; +++ maxqual = &stats->tags_barcode[tag].max_qual; +++ separator = &stats->tags_barcode[tag].tag_sep; +++ int error_flag = 0; +++ +++ for (i = 0; i < barcode_len; i++) { +++ switch (barcode[i]) { +++ case 'A': +++ acgtno[i].a++; +++ break; +++ case 'C': +++ acgtno[i].c++; +++ break; +++ case 'G': +++ acgtno[i].g++; +++ break; +++ case 'T': +++ acgtno[i].t++; +++ break; +++ case 'N': +++ acgtno[i].n++; +++ break; +++ default: +++ if (*separator >= 0) { +++ if (*separator != i) { +++ if (stats->error_number < ERROR_LIMIT) { +++ fprintf(samtools_stderr, "Barcode separator for tag %s is in a different position or wrong barcode content('%s') at sequence '%s'\n", barcode_tag, barcode, bam_get_qname(bam_line)); +++ stats->error_number++; +++ } +++ error_flag = 1; +++ } +++ } else { +++ *separator = i; +++ } +++ } +++ +++ /* don't process the rest of the tag bases */ +++ if (error_flag) +++ break; +++ } +++ +++ /* skip to the next tag */ +++ if (error_flag) +++ continue; +++ +++ uint8_t* qt = bam_aux_get(bam_line, qual_tag); +++ if (!qt) +++ continue; +++ +++ char* barqual = bam_aux2Z(qt); +++ if (!barqual) +++ continue; +++ +++ uint32_t barqual_len = strlen(barqual); +++ if (barqual_len == barcode_len) { +++ for (i = 0; i < barcode_len; i++) { +++ int32_t qual = (int32_t)barqual[i] - '!'; // Phred + 33 +++ if (qual >= 0 && qual < stats->nquals) { +++ quals[i * stats->nquals + qual]++; 
+++ if (qual > *maxqual) +++ *maxqual = qual; +++ } +++ } +++ } else { +++ if (stats->error_number++ < ERROR_LIMIT) { +++ fprintf(samtools_stderr, "%s length and %s length don't match for sequence '%s'\n", barcode_tag, qual_tag, bam_get_qname(bam_line)); +++ } +++ } +++ } +++} +++ ++ // These stats should only be calculated for the original reads ignoring ++ // supplementary artificial reads otherwise we'll accidentally double count ++ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out) ++@@ -700,42 +853,48 @@ ++ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; ++ if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++; ++ +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; +++ ++ // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored ++ uint8_t *seq = bam_get_seq(bam_line); ++- int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line), is_first = IS_READ1(bam_line); ++- for (i=0; iacgtno_cycles_1st[ read_cycle ].a++ : stats->acgtno_cycles_2nd[ read_cycle ].a++; ++- break; ++- case 2: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].c++ : stats->acgtno_cycles_2nd[ read_cycle ].c++; ++- gc_count++; ++- break; ++- case 4: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].g++ : stats->acgtno_cycles_2nd[ read_cycle ].g++; ++- gc_count++; ++- break; ++- case 8: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].t++ : stats->acgtno_cycles_2nd[ read_cycle ].t++; ++- break; ++- case 15: ++- is_first ? stats->acgtno_cycles_1st[ read_cycle ].n++ : stats->acgtno_cycles_2nd[ read_cycle ].n++; ++- break; ++- default: ++- /* ++- * count "=" sequences in "other" along ++- * with MRSVWYHKDB ambiguity codes ++- */ ++- is_first ? 
stats->acgtno_cycles_1st[ read_cycle ].other++ : stats->acgtno_cycles_2nd[ read_cycle ].other++; ++- break; +++ acgtno_count_t *acgtno_cycles = (order == READ_ORDER_FIRST) ? stats->acgtno_cycles_1st : (order == READ_ORDER_LAST) ? stats->acgtno_cycles_2nd : NULL ; +++ if (acgtno_cycles) { +++ for (i=0; ingc-1)/seq_len; ++@@ -745,38 +904,48 @@ ++ // Determine which array (1st or 2nd read) will these stats go to, ++ // trim low quality bases from end the same way BWA does, ++ // fill GC histogram ++- uint64_t *quals; +++ uint64_t *quals = NULL; ++ uint8_t *bam_quals = bam_get_qual(bam_line); ++- if ( IS_READ2(bam_line) ) ++- { ++- quals = stats->quals_2nd; ++- stats->nreads_2nd++; ++- stats->total_len_2nd += seq_len; ++- for (i=gc_idx_min; igc_2nd[i]++; ++- } ++- else ++- { +++ +++ switch (order) { +++ case READ_ORDER_FIRST: ++ quals = stats->quals_1st; ++ stats->nreads_1st++; ++ stats->total_len_1st += seq_len; ++ for (i=gc_idx_min; igc_1st[i]++; +++ break; +++ case READ_ORDER_LAST: +++ quals = stats->quals_2nd; +++ stats->nreads_2nd++; +++ stats->total_len_2nd += seq_len; +++ for (i=gc_idx_min; igc_2nd[i]++; +++ break; +++ default: +++ stats->nreads_other++; ++ } ++ if ( stats->info->trim_qual>0 ) ++ stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse); ++ ++ // Quality histogram and average quality. Clipping is neglected. 
++- for (i=0; i=stats->nquals ) ++- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); ++- if ( qual>stats->max_qual ) ++- stats->max_qual = qual; +++ if (quals) { +++ for (i=0; i=stats->nquals ) +++ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); +++ if ( qual>stats->max_qual ) +++ stats->max_qual = qual; +++ +++ quals[ i*stats->nquals+qual ]++; +++ stats->sum_qual += qual; +++ } +++ } ++ ++- quals[ i*stats->nquals+qual ]++; ++- stats->sum_qual += qual; +++ // Barcode statistics +++ if (order == READ_ORDER_FIRST) { +++ collect_barcode_stats(bam_line, stats); ++ } ++ ++ // Look at the flags and increment appropriate counters (mapped, paired, etc) ++@@ -805,7 +974,7 @@ ++ *gc_count_out = gc_count; ++ } ++ ++-static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, int max) { +++static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, hts_pos_t max) { ++ if ( !read_pairs ) ++ return 0; ++ ++@@ -816,7 +985,7 @@ ++ char *key = (char *)kh_key(read_pairs, k); ++ pair_t *val = kh_val(read_pairs, k); ++ if ( val && val->chunks ) { ++- if ( val->chunks[val->n-1].to < max ) { +++ if ( val->chunks[val->n-1].end < max ) { ++ free(val->chunks); ++ free(val); ++ free(key); ++@@ -830,29 +999,32 @@ ++ } ++ } ++ } ++- if ( max == INT_MAX ) +++ if ( max == INT64_MAX ) ++ kh_destroy(qn2pair, read_pairs); ++ ++ return count; ++ } ++ ++-static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, int pmin, int pmax) { +++/** +++ * [pmin, pmax) - 0 based half-open +++ */ +++static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, hts_pos_t pmin, hts_pos_t pmax) { ++ if ( !bam_line || !read_pairs || !stats ) ++ return; ++ ++- uint32_t first = 
(IS_READ1(bam_line) > 0 ? 1 : 0) + (IS_READ2(bam_line) > 0 ? 2 : 0) ; +++ uint32_t order = (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0); ++ if ( !(bam_line->core.flag & BAM_FPAIRED) || ++ (bam_line->core.flag & BAM_FMUNMAP) || ++- (abs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || ++- (first != 1 && first != 2) ) { +++ (llabs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || +++ (order != READ_ORDER_FIRST && order != READ_ORDER_LAST) ) { ++ if ( pmin >= 0 ) ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ return; ++ } ++ ++ char *qname = bam_get_qname(bam_line); ++ if ( !qname ) { ++- fprintf(samtools_stderr, "Error retrieving qname for line starting at pos %d\n", bam_line->core.pos); +++ fprintf(samtools_stderr, "Error retrieving qname for line starting at pos %"PRIhts_pos"\n", bam_line->core.pos); ++ return; ++ } ++ ++@@ -870,8 +1042,7 @@ ++ ++ k = kh_put(qn2pair, read_pairs, s, &ret); ++ if ( -1 == ret ) { ++- fprintf(samtools_stderr, "Error inserting read '%s' in pair hash table\n", qname); ++- return; +++ error("Error inserting read '%s' in pair hash table\n", qname); ++ } ++ ++ pair_t *pc = calloc(1, sizeof(pair_t)); ++@@ -881,16 +1052,16 @@ ++ } ++ ++ pc->m = DEFAULT_CHUNK_NO; ++- pc->chunks = calloc(pc->m, sizeof(pos_t)); +++ pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); ++ if ( !pc->chunks ) { ++ fprintf(samtools_stderr, "Error allocating memory\n"); ++ return; ++ } ++ ++- pc->chunks[0].from = pmin; ++- pc->chunks[0].to = pmax; +++ pc->chunks[0].beg = pmin; +++ pc->chunks[0].end = pmax; ++ pc->n = 1; ++- pc->first = first; +++ pc->first = order; ++ ++ kh_val(read_pairs, k) = pc; ++ stats->pair_count++; ++@@ -901,12 +1072,12 @@ ++ return; ++ } ++ ++- if ( first == pc->first ) { //chunk from an existing line +++ if ( order == pc->first ) { //chunk from an existing line ++ if ( pmin == -1 ) ++ return; ++ ++ if ( pc->n == 
pc->m ) { ++- pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(pos_t)); +++ hts_pair_pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(hts_pair_pos_t)); ++ if ( !tmp ) { ++ fprintf(samtools_stderr, "Error allocating memory\n"); ++ return; ++@@ -915,8 +1086,8 @@ ++ pc->m<<=1; ++ } ++ ++- pc->chunks[pc->n].from = pmin; ++- pc->chunks[pc->n].to = pmax; +++ pc->chunks[pc->n].beg = pmin; +++ pc->chunks[pc->n].end = pmax; ++ pc->n++; ++ } else { //the other line, check for overlapping ++ if ( pmin == -1 && kh_exist(read_pairs, k) ) { //job done, delete entry ++@@ -934,28 +1105,28 @@ ++ ++ int i; ++ for (i=0; in; i++) { ++- if ( pmin >= pc->chunks[i].to ) +++ if ( pmin >= pc->chunks[i].end ) ++ continue; ++ ++- if ( pmax <= pc->chunks[i].from ) //no overlap +++ if ( pmax <= pc->chunks[i].beg ) //no overlap ++ break; ++ ++- if ( pmin < pc->chunks[i].from ) { //overlap at the beginning ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].from-1); ++- pmin = pc->chunks[i].from; +++ if ( pmin < pc->chunks[i].beg ) { //overlap at the beginning +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].beg); +++ pmin = pc->chunks[i].beg; ++ } ++ ++- if ( pmax <= pc->chunks[i].to ) { //completely contained +++ if ( pmax <= pc->chunks[i].end ) { //completely contained ++ stats->nbases_mapped_cigar -= (pmax - pmin); ++ return; ++ } else { //overlap at the end ++- stats->nbases_mapped_cigar -= (pc->chunks[i].to - pmin); ++- pmin = pc->chunks[i].to; +++ stats->nbases_mapped_cigar -= (pc->chunks[i].end - pmin); +++ pmin = pc->chunks[i].end; ++ } ++ } ++ } ++ } ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ } ++ ++ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) ++@@ -1000,15 +1171,17 @@ ++ stats->nreads_dup++; ++ } ++ +++ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? 
READ_ORDER_LAST : 0) : READ_ORDER_FIRST; +++ ++ int read_len = unclipped_length(bam_line); ++ if ( read_len >= stats->nbases ) ++ realloc_buffers(stats,read_len); ++ // Update max_len observed ++ if ( stats->max_lenmax_len = read_len; ++- if ( IS_READ1(bam_line) && stats->max_len_1st < read_len ) +++ if ( order == READ_ORDER_FIRST && stats->max_len_1st < read_len ) ++ stats->max_len_1st = read_len; ++- if ( IS_READ2(bam_line) && stats->max_len_2nd < read_len ) +++ if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) ++ stats->max_len_2nd = read_len; ++ ++ int i; ++@@ -1019,8 +1192,8 @@ ++ if ( IS_ORIGINAL(bam_line) ) ++ { ++ stats->read_lengths[read_len]++; ++- if ( IS_READ1(bam_line) ) stats->read_lengths_1st[read_len]++; ++- if ( IS_READ2(bam_line) ) stats->read_lengths_2nd[read_len]++; +++ if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; +++ if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; ++ collect_orig_read_stats(bam_line, stats, &gc_count); ++ } ++ ++@@ -1041,7 +1214,7 @@ ++ isize = stats->info->nisize; ++ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) ++ { ++- int pos_fst = bam_line->core.mpos - bam_line->core.pos; +++ hts_pos_t pos_fst = bam_line->core.mpos - bam_line->core.pos; ++ int is_fst = IS_READ1(bam_line) ? 1 : -1; ++ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; ++ int is_mfwd = IS_MATE_REVERSE(bam_line) ? 
-1 : 1; ++@@ -1077,7 +1250,7 @@ ++ if ( stats->regions ) ++ { ++ // Count only on-target bases ++- int iref = bam_line->core.pos + 1; +++ hts_pos_t iref = bam_line->core.pos + 1; ++ for (i=0; icore.n_cigar; i++) ++ { ++ int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]); ++@@ -1131,7 +1304,7 @@ ++ } ++ ++ if ( stats->last_pair_tid != bam_line->core.tid) { ++- stats->pair_count -= cleanup_overlaps(read_pairs, INT_MAX-1); +++ stats->pair_count -= cleanup_overlaps(read_pairs, INT64_MAX-1); ++ stats->last_pair_tid = bam_line->core.tid; ++ stats->last_read_flush = 0; ++ } ++@@ -1183,8 +1356,9 @@ ++ // Coverage distribution graph ++ round_buffer_flush(stats,bam_line->core.pos); ++ if ( stats->regions ) { ++- uint32_t p = bam_line->core.pos, pnew, pmin, pmax, j; ++- pmin = pmax = i = j = 0; +++ hts_pos_t p = bam_line->core.pos, pnew, pmin = 0, pmax = 0; +++ uint32_t j = 0; +++ i = 0; ++ while ( j < bam_line->core.n_cigar && i < stats->nchunks ) { ++ int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); ++ int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); ++@@ -1192,13 +1366,13 @@ ++ case BAM_CMATCH: ++ case BAM_CEQUAL: ++ case BAM_CDIFF: ++- pmin = MAX(p, stats->chunks[i].from-1); ++- pmax = MIN(p+oplen, stats->chunks[i].to); ++- if ( pmax >= pmin ) { +++ pmin = MAX(p, stats->chunks[i].beg-1); // 0 based +++ pmax = MIN(p+oplen, stats->chunks[i].end); // 1 based +++ if ( pmax > pmin ) { ++ if ( stats->info->remove_overlaps ) ++ remove_overlaps(bam_line, read_pairs, stats, pmin, pmax); ++ else ++- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); ++ } ++ break; ++ case BAM_CDEL: ++@@ -1206,7 +1380,7 @@ ++ } ++ pnew = p + (bam_cigar_type(op)&2 ? 
oplen : 0); // consumes reference ++ ++- if ( pnew >= stats->chunks[i].to ) { +++ if ( pnew >= stats->chunks[i].end ) { ++ // go to the next chunk ++ i++; ++ } else { ++@@ -1216,7 +1390,8 @@ ++ } ++ } ++ } else { ++- uint32_t p = bam_line->core.pos, j; +++ hts_pos_t p = bam_line->core.pos; +++ uint32_t j; ++ for (j = 0; j < bam_line->core.n_cigar; j++) { ++ int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); ++ int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); ++@@ -1227,7 +1402,7 @@ ++ if ( stats->info->remove_overlaps ) ++ remove_overlaps(bam_line, read_pairs, stats, p, p+oplen); ++ else ++- round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen-1); +++ round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen); ++ break; ++ case BAM_CDEL: ++ break; ++@@ -1236,7 +1411,7 @@ ++ } ++ } ++ if ( stats->info->remove_overlaps ) ++- remove_overlaps(bam_line, read_pairs, stats, -1, -1); //remove the line from the hash table +++ remove_overlaps(bam_line, read_pairs, stats, -1LL, -1LL); //remove the line from the hash table ++ } ++ } ++ ++@@ -1257,7 +1432,7 @@ ++ float n,d; ++ int k; ++ ++- n = p*(N+1)/100; +++ n = (float)p*(N+1)/100; ++ k = n; ++ if ( k<=0 ) ++ return gcd[0].depth; ++@@ -1322,9 +1497,9 @@ ++ fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); ++ fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); ++ fprintf(to, "# Summary Numbers. 
Use `grep ^SN | cut -f 2-` to extract this part.\n"); ++- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below) +++ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) ++ fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); ++- fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); +++ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); ++ fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); ++ fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); ++ fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); ++@@ -1346,7 +1521,7 @@ ++ fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); ++ fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); ++ fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); ++- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; +++ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; ++ fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); ++ fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); ++ fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? 
(float)stats->total_len_2nd/stats->nreads_2nd:0); ++@@ -1360,7 +1535,7 @@ ++ fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); ++ fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); ++ fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); ++- fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd):0); +++ fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); ++ if ( stats->target_count ) { ++ fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); ++ for (icov=stats->info->cov_threshold+1; icovncov; icov++) ++@@ -1441,11 +1616,18 @@ ++ 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); ++ ++ } +++ +++ uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; ++ fprintf(to, "# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); ++ for (ibase=0; ibasemax_len; ibase++) ++ { ++ acgtno_count_t *acgtno_count_1st = &(stats->acgtno_cycles_1st[ibase]); ++ uint64_t acgt_sum_1st = acgtno_count_1st->a + acgtno_count_1st->c + acgtno_count_1st->g + acgtno_count_1st->t; +++ tA += acgtno_count_1st->a; +++ tC += acgtno_count_1st->c; +++ tG += acgtno_count_1st->g; +++ tT += acgtno_count_1st->t; +++ tN += acgtno_count_1st->n; ++ ++ if ( acgt_sum_1st ) ++ fprintf(to, "FBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, ++@@ -1457,11 +1639,19 @@ ++ 100.*acgtno_count_1st->other/acgt_sum_1st); ++ ++ } +++ fprintf(to, "# ACGT raw counters for first fragments. 
Use `grep ^FTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); +++ fprintf(to, "FTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); +++ tA=0, tC=0, tG=0, tT=0, tN=0; ++ fprintf(to, "# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); ++ for (ibase=0; ibasemax_len; ibase++) ++ { ++ acgtno_count_t *acgtno_count_2nd = &(stats->acgtno_cycles_2nd[ibase]); ++ uint64_t acgt_sum_2nd = acgtno_count_2nd->a + acgtno_count_2nd->c + acgtno_count_2nd->g + acgtno_count_2nd->t; +++ tA += acgtno_count_2nd->a; +++ tC += acgtno_count_2nd->c; +++ tG += acgtno_count_2nd->g; +++ tT += acgtno_count_2nd->t; +++ tN += acgtno_count_2nd->n; ++ ++ if ( acgt_sum_2nd ) ++ fprintf(to, "LBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, ++@@ -1473,6 +1663,52 @@ ++ 100.*acgtno_count_2nd->other/acgt_sum_2nd); ++ ++ } +++ fprintf(to, "# ACGT raw counters for last fragments. Use `grep ^LTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); +++ fprintf(to, "LTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); +++ +++ int tag; +++ for (tag=0; tagntags; tag++) { +++ if (stats->tags_barcode[tag].nbases) { +++ fprintf(to, "# ACGT content per cycle for barcodes. Use `grep ^%sC | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N counts as a percentage of all A/C/G/T bases [%%]\n", +++ stats->tags_barcode[tag].tag_name); +++ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) +++ { +++ if (ibase == stats->tags_barcode[tag].tag_sep) +++ continue; +++ +++ acgtno_count_t *acgtno_count = stats->acgtno_barcode + stats->tags_barcode[tag].offset + ibase; +++ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; +++ +++ if ( acgt_sum ) +++ fprintf(to, "%sC%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", stats->tags_barcode[tag].tag_name, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep, +++ 100.*acgtno_count->a/acgt_sum, +++ 100.*acgtno_count->c/acgt_sum, +++ 100.*acgtno_count->g/acgt_sum, +++ 100.*acgtno_count->t/acgt_sum, +++ 100.*acgtno_count->n/acgt_sum); +++ } +++ +++ fprintf(to, "# Barcode Qualities. Use `grep ^%sQ | cut -f 2-` to extract this part.\n", stats->tags_barcode[tag].qual_name); +++ fprintf(to, "# Columns correspond to qualities and rows to barcode cycles. First column is the cycle number.\n"); +++ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) +++ { +++ if (ibase == stats->tags_barcode[tag].tag_sep) +++ continue; +++ +++ fprintf(to, "%sQ%d\t%d", stats->tags_barcode[tag].qual_name, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, +++ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 
ibase+1 : ibase-stats->tags_barcode[tag].tag_sep); +++ for (iqual=0; iqual<=stats->tags_barcode[tag].max_qual; iqual++) +++ { +++ fprintf(to, "\t%ld", (long)stats->quals_barcode[(stats->tags_barcode[tag].offset + ibase)*stats->nquals+iqual]); +++ } +++ fprintf(to, "\n"); +++ } +++ } +++ } +++ ++ fprintf(to, "# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); ++ for (isize=0; isizeisize->inward(stats->isize->data, isize)); ++@@ -1566,14 +1802,15 @@ ++ } ++ } ++ ++-void init_regions(stats_t *stats, const char *file) +++static void init_regions(stats_t *stats, const char *file) ++ { ++ FILE *fp = fopen(file,"r"); ++ if ( !fp ) error("%s: %s\n",file,strerror(errno)); ++ ++ kstring_t line = { 0, 0, NULL }; ++ int warned = 0, r, p, new_p; ++- int prev_tid=-1, prev_pos=-1; +++ int prev_tid=-1; +++ hts_pos_t prev_pos=-1LL; ++ while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) ++ { ++ if ( line.s[0] == '#' ) continue; ++@@ -1594,30 +1831,33 @@ ++ ++ if ( tid >= stats->nregions ) ++ { ++- stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); +++ if(!(stats->regions = realloc(stats->regions,sizeof(regions_t)*(tid+REG_INC)))) +++ error("Could not allocate memory for region.\n"); +++ ++ int j; ++- for (j=stats->nregions; jnregions+100; j++) +++ for (j=stats->nregions; jregions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; ++ stats->regions[j].pos = NULL; ++ } ++- stats->nregions += 100; +++ stats->nregions = tid+REG_INC; ++ } ++ int npos = stats->regions[tid].npos; ++ if ( npos >= stats->regions[tid].mpos ) ++ { ++- stats->regions[tid].mpos += 1000; ++- stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); +++ stats->regions[tid].mpos = npos+POS_INC; +++ if (!(stats->regions[tid].pos = realloc(stats->regions[tid].pos, 
sizeof(hts_pair_pos_t)*stats->regions[tid].mpos))) +++ error("Could not allocate memory for interval.\n"); ++ } ++ ++- if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); +++ if ( (sscanf(&line.s[i+1],"%"SCNd64" %"SCNd64, &stats->regions[tid].pos[npos].beg, &stats->regions[tid].pos[npos].end))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); ++ if ( prev_tid==-1 || prev_tid!=tid ) ++ { ++ prev_tid = tid; ++- prev_pos = stats->regions[tid].pos[npos].from; +++ prev_pos = stats->regions[tid].pos[npos].beg; ++ } ++- if ( prev_pos>stats->regions[tid].pos[npos].from ) ++- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos); +++ if ( prev_pos>stats->regions[tid].pos[npos].beg ) +++ error("The positions are not in chromosomal order (%s:%"PRIhts_pos" comes after %"PRIhts_pos")\n", line.s, stats->regions[tid].pos[npos].beg, prev_pos); ++ stats->regions[tid].npos++; ++ if ( stats->regions[tid].npos > stats->nchunks ) ++ stats->nchunks = stats->regions[tid].npos; ++@@ -1630,20 +1870,21 @@ ++ for (r = 0; r < stats->nregions; r++) { ++ regions_t *reg = &stats->regions[r]; ++ if ( reg->npos > 1 ) { ++- qsort(reg->pos, reg->npos, sizeof(pos_t), regions_lt); +++ qsort(reg->pos, reg->npos, sizeof(hts_pair_pos_t), regions_lt); ++ for (new_p = 0, p = 1; p < reg->npos; p++) { ++- if ( reg->pos[new_p].to < reg->pos[p].from ) +++ if ( reg->pos[new_p].end < reg->pos[p].beg ) ++ reg->pos[++new_p] = reg->pos[p]; ++- else if ( reg->pos[new_p].to < reg->pos[p].to ) ++- reg->pos[new_p].to = reg->pos[p].to; +++ else if ( reg->pos[new_p].end < reg->pos[p].end ) +++ reg->pos[new_p].end = reg->pos[p].end; ++ } ++ reg->npos = ++new_p; ++ } ++ for (p = 0; p < reg->npos; p++) ++- stats->target_count += (reg->pos[p].to - reg->pos[p].from + 1); +++ stats->target_count += (reg->pos[p].end - 
reg->pos[p].beg + 1); ++ } ++ ++- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); +++ if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) +++ error("Could not allocate memory for chunk.\n"); ++ } ++ ++ void destroy_regions(stats_t *stats) ++@@ -1678,22 +1919,22 @@ ++ // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, ++ // even small overlap is enough to include the read in the stats. ++ int i = reg->cpos; ++- while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; +++ while ( inpos && reg->pos[i].end<=bam_line->core.pos ) i++; ++ if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } ++ int64_t endpos = bam_endpos(bam_line); ++- if ( endpos < reg->pos[i].from ) return 0; +++ if ( endpos < reg->pos[i].beg ) return 0; ++ ++ //found a read overlapping a region ++ reg->cpos = i; ++- stats->reg_from = reg->pos[i].from; ++- stats->reg_to = reg->pos[i].to; +++ stats->reg_from = reg->pos[i].beg; +++ stats->reg_to = reg->pos[i].end; ++ ++ //now find all the overlapping chunks ++ stats->nchunks = 0; ++ while (i < reg->npos) { ++- if (bam_line->core.pos < reg->pos[i].to && endpos >= reg->pos[i].from) { ++- stats->chunks[stats->nchunks].from = MAX(bam_line->core.pos+1, reg->pos[i].from); ++- stats->chunks[stats->nchunks].to = MIN(endpos, reg->pos[i].to); +++ if (bam_line->core.pos < reg->pos[i].end && endpos >= reg->pos[i].beg) { +++ stats->chunks[stats->nchunks].beg = MAX(bam_line->core.pos+1, reg->pos[i].beg); +++ stats->chunks[stats->nchunks].end = MIN(endpos, reg->pos[i].end); ++ stats->nchunks++; ++ } ++ i++; ++@@ -1709,7 +1950,7 @@ ++ int i, j, tid; ++ stats->nregions = iter->n_reg; ++ stats->regions = calloc(stats->nregions, sizeof(regions_t)); ++- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); +++ stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)); ++ if ( !stats->regions || !stats->chunks ) ++ return 1; ++ ++@@ -1729,15 +1970,15 @@ ++ } ++ ++ 
stats->regions[tid].mpos = stats->regions[tid].npos = iter->reg_list[i].count; ++- stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(pos_t)); +++ stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(hts_pair_pos_t)); ++ if ( !stats->regions[tid].pos ) ++ return 1; ++ ++ for (j = 0; j < stats->regions[tid].npos; j++) { ++- stats->regions[tid].pos[j].from = iter->reg_list[i].intervals[j].beg+1; ++- stats->regions[tid].pos[j].to = iter->reg_list[i].intervals[j].end; +++ stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; +++ stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; ++ ++- stats->target_count += (stats->regions[tid].pos[j].to - stats->regions[tid].pos[j].from + 1); +++ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); ++ } ++ } ++ ++@@ -1775,7 +2016,7 @@ ++ } ++ ++ ++-static void error(const char *format, ...) +++static void HTS_NORETURN error(const char *format, ...) ++ { ++ if ( !format ) ++ { ++@@ -1785,13 +2026,14 @@ ++ fprintf(samtools_stdout, "Options:\n"); ++ fprintf(samtools_stdout, " -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); ++ fprintf(samtools_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); +++ fprintf(samtools_stdout, " -X, --customized-index-file Use a customized index file\n"); ++ fprintf(samtools_stdout, " -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); ++ fprintf(samtools_stdout, " -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); ++ fprintf(samtools_stdout, " --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); ++ fprintf(samtools_stdout, " -h, --help This help message\n"); ++ fprintf(samtools_stdout, " -i, --insert-size Maximum insert size [8000]\n"); ++ fprintf(samtools_stdout, " -I, --id Include only listed read group or sample name\n"); ++- fprintf(samtools_stdout, " -l, --read-length Include in the statistics only reads with the given read length []\n"); +++ fprintf(samtools_stdout, " -l, --read-length Include in the statistics only reads with the given read length [-1]\n"); ++ fprintf(samtools_stdout, " -m, --most-inserts Report only the main part of inserts [0.99]\n"); ++ fprintf(samtools_stdout, " -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); ++ fprintf(samtools_stdout, " -q, --trim-quality The BWA trimming parameter [0]\n"); ++@@ -1801,8 +2043,8 @@ ++ fprintf(samtools_stdout, " -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); ++ fprintf(samtools_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); ++ fprintf(samtools_stdout, " -p, --remove-overlaps Remove overlaps of paired-end reads from coverage and base count computations.\n"); ++- fprintf(samtools_stdout, " -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation.\n"); ++- sam_global_opt_help(samtools_stdout, "-.--.@"); +++ fprintf(samtools_stdout, " -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation [0]\n"); +++ sam_global_opt_help(samtools_stdout, "-.--.@-."); ++ fprintf(samtools_stdout, "\n"); ++ } ++ else ++@@ -1842,6 +2084,9 @@ ++ free(stats->ins_cycles_2nd); ++ free(stats->del_cycles_1st); ++ free(stats->del_cycles_2nd); +++ if (stats->acgtno_barcode) free(stats->acgtno_barcode); +++ if (stats->quals_barcode) free(stats->quals_barcode); +++ free(stats->tags_barcode); ++ destroy_regions(stats); ++ if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); ++ free(stats->split_name); ++@@ -1880,6 +2125,9 @@ ++ ++ void destroy_split_stats(khash_t(c2stats) *split_hash) ++ { +++ if (!split_hash) +++ return; +++ ++ int i = 0; ++ stats_t *curr_stats = NULL; ++ for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){ ++@@ -1893,6 +2141,10 @@ ++ stats_info_t* stats_info_init(int argc, char *argv[]) ++ { ++ stats_info_t* info = calloc(1, sizeof(stats_info_t)); +++ if (!info) { +++ return NULL; +++ } +++ ++ info->nisize = 8000; ++ info->isize_main_bulk = 0.99; // There are always outliers at the far end ++ info->gcd_bin_size = 20e3; ++@@ -1928,11 +2180,15 @@ ++ stats_t* stats_init() ++ { ++ stats_t *stats = calloc(1,sizeof(stats_t)); +++ if (!stats) +++ return NULL; +++ ++ stats->ngc = 200; ++ stats->nquals = 256; ++ stats->nbases = 300; ++ stats->rseq_pos = -1; ++- stats->tid = stats->gcd_pos = -1; +++ stats->tid = -1; 
+++ stats->gcd_pos = -1LL; ++ stats->igcd = 0; ++ stats->is_sorted = 1; ++ stats->nindels = stats->nbases; ++@@ -1946,6 +2202,18 @@ ++ return stats; ++ } ++ +++static int init_barcode_tags(stats_t* stats) { +++ stats->ntags = 4; +++ stats->tags_barcode = calloc(stats->ntags, sizeof(barcode_info_t)); +++ if (!stats->tags_barcode) +++ return -1; +++ stats->tags_barcode[0] = (barcode_info_t){"BC", "QT", 0, -1, -1, 0}; +++ stats->tags_barcode[1] = (barcode_info_t){"CR", "CY", 0, -1, -1, 0}; +++ stats->tags_barcode[2] = (barcode_info_t){"OX", "BZ", 0, -1, -1, 0}; +++ stats->tags_barcode[3] = (barcode_info_t){"RX", "QX", 0, -1, -1, 0}; +++ return 0; +++} +++ ++ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets) ++ { ++ // Give stats_t a pointer to the info struct ++@@ -1963,32 +2231,60 @@ ++ stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step; ++ info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1; ++ stats->cov = calloc(sizeof(uint64_t),stats->ncov); +++ if (!stats->cov) goto nomem; ++ stats->cov_rbuf.size = stats->nbases*5; ++ stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); ++- +++ if (!stats->cov_rbuf.buffer) goto nomem; ++ if ( group_id ) init_group_id(stats, group_id); ++ // .. arrays ++ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->quals_1st) goto nomem; ++ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->quals_2nd) goto nomem; ++ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); +++ if (!stats->gc_1st) goto nomem; ++ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); +++ if (!stats->gc_2nd) goto nomem; ++ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); +++ if (!stats->isize) goto nomem; ++ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); ++- stats->mpc_buf = info->fai ? 
calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; +++ if (!stats->gcd) goto nomem; +++ if (info->fai) { +++ stats->mpc_buf = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); +++ if (!stats->mpc_buf) goto nomem; +++ } else { +++ stats->mpc_buf = NULL; +++ } ++ stats->acgtno_cycles_1st = calloc(stats->nbases,sizeof(acgtno_count_t)); +++ if (!stats->acgtno_cycles_1st) goto nomem; ++ stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); +++ if (!stats->acgtno_cycles_2nd) goto nomem; ++ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths) goto nomem; ++ stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths_1st) goto nomem; ++ stats->read_lengths_2nd = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->read_lengths_2nd) goto nomem; ++ stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->insertions) goto nomem; ++ stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); +++ if (!stats->deletions) goto nomem; ++ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->ins_cycles_1st) goto nomem; ++ stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->ins_cycles_2nd) goto nomem; ++ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->del_cycles_1st) goto nomem; ++ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); +++ if (!stats->del_cycles_2nd) goto nomem; +++ if (init_barcode_tags(stats) < 0) +++ goto nomem; ++ realloc_rseq_buffer(stats); ++ if ( targets ) ++ init_regions(stats, targets); +++ return; +++ nomem: +++ error("Out of memory"); ++ } ++ ++ static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets) ++@@ -2004,6 +2300,9 @@ ++ khiter_t k = kh_get(c2stats, split_hash, split_name); ++ if(k == kh_end(split_hash)){ ++ curr_stats = stats_init(); // mallocs new instance 
+++ if (!curr_stats) { +++ error("Couldn't allocate split stats"); +++ } ++ init_stat_structs(curr_stats, info, NULL, targets); ++ curr_stats->split_name = split_name; ++ ++@@ -2026,11 +2325,16 @@ ++ { ++ char *targets = NULL; ++ char *bam_fname = NULL; +++ char *bam_idx_fname = NULL; ++ char *group_id = NULL; ++- int sparse = 0; +++ int sparse = 0, has_index_file = 0, ret = 1; ++ sam_global_args ga = SAM_GLOBAL_ARGS_INIT; ++ ++ stats_info_t *info = stats_info_init(argc, argv); +++ if (!info) { +++ fprintf(samtools_stderr, "Could not allocate memory for info.\n"); +++ return 1; +++ } ++ ++ static const struct option loptions[] = ++ { ++@@ -2038,6 +2342,7 @@ ++ {"help", no_argument, NULL, 'h'}, ++ {"remove-dups", no_argument, NULL, 'd'}, ++ {"sam", no_argument, NULL, 's'}, +++ {"customized-index-file", required_argument, NULL, 'X'}, ++ {"ref-seq", required_argument, NULL, 'r'}, ++ {"coverage", required_argument, NULL, 'c'}, ++ {"read-length", required_argument, NULL, 'l'}, ++@@ -2058,13 +2363,14 @@ ++ }; ++ int opt; ++ ++- while ( (opt=getopt_long(argc,argv,"?hdsxpr:c:l:i:t:m:q:f:F:g:I:1:S:P:@:",loptions,NULL))>0 ) +++ while ( (opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 ) ++ { ++ switch (opt) ++ { ++ case 'f': info->flag_require = bam_str2flag(optarg); break; ++ case 'F': info->flag_filter |= bam_str2flag(optarg); break; ++ case 'd': info->flag_filter |= BAM_FDUP; break; +++ case 'X': has_index_file = 1; break; ++ case 's': break; ++ case 'r': info->fai = fai_load(optarg); ++ if (info->fai==NULL) ++@@ -2090,15 +2396,15 @@ ++ break; ++ case '?': ++ case 'h': error(NULL); +++ /* no break */ ++ default: ++ if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0) ++ error("Unknown argument: %s\n", optarg); ++ break; ++ } ++ } ++- if ( optind 0) ++ hts_set_threads(info->sam, ga.nthreads); ++ ++ stats_t *all_stats = stats_init(); +++ if (!all_stats) { +++ fprintf(samtools_stderr, "Could not allocate memory for stats.\n"); +++ 
cleanup_stats_info(info); +++ return 1; +++ } ++ stats_t *curr_stats = NULL; ++ init_stat_structs(all_stats, info, group_id, targets); ++ // Init ++ // .. hash ++ khash_t(c2stats)* split_hash = kh_init(c2stats); +++ if (!split_hash) goto cleanup_all_stats; ++ ++ khash_t(qn2pair)* read_pairs = kh_init(qn2pair); +++ if (!read_pairs) goto cleanup_split_hash; ++ ++ // Collect statistics ++ bam1_t *bam_line = bam_init1(); ++- if ( optindsam,bam_fname); ++- if (bam_idx) { ++- ++- int regcount = 0; ++- hts_reglist_t *reglist = bed_reglist(region_hash, ALL, ®count); ++- if (reglist) { ++- ++- hts_itr_multi_t *iter = sam_itr_regions(bam_idx, info->sam_header, reglist, regcount); ++- if (iter) { ++- ++- if (!targets) { ++- all_stats->nchunks = argc-optind; ++- if ( replicate_regions(all_stats, iter) ) ++- fprintf(samtools_stderr, "Replications of the regions failed."); ++- } +++ if (!bam_line) goto cleanup_read_pairs; +++ +++ if (optind < argc) { +++ // Region:interval arguments in the command line +++ hts_idx_t *bam_idx = NULL; +++ if (has_index_file) { +++ bam_idx = sam_index_load2(info->sam, bam_fname, bam_idx_fname); +++ } else { +++ // If an index filename has not been specified, look alongside the alignment file +++ bam_idx = sam_index_load(info->sam, bam_fname); +++ } +++ +++ if (bam_idx) { +++ hts_itr_multi_t *iter = sam_itr_regarray(bam_idx, info->sam_header, &argv[optind], argc - optind); +++ if (iter) { +++ if (!targets) { +++ all_stats->nchunks = argc-optind; +++ if (replicate_regions(all_stats, iter)) +++ fprintf(samtools_stderr, "Replications of the regions failed\n"); +++ } ++ ++- if ( all_stats->nregions && all_stats->regions ) { ++- while (sam_itr_multi_next(info->sam, iter, bam_line) >= 0) { ++- if (info->split_tag) { ++- curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); ++- collect_stats(bam_line, curr_stats, read_pairs); ++- } ++- collect_stats(bam_line, all_stats, read_pairs); ++- } +++ if ( all_stats->nregions && all_stats->regions 
) { +++ while ((ret = sam_itr_next(info->sam, iter, bam_line)) >= 0) { +++ if (info->split_tag) { +++ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); +++ collect_stats(bam_line, curr_stats, read_pairs); ++ } +++ collect_stats(bam_line, all_stats, read_pairs); +++ } ++ +++ if (ret < -1) { +++ fprintf(samtools_stderr, "Failure while running the iterator\n"); ++ hts_itr_multi_destroy(iter); ++- } else { ++- fprintf(samtools_stderr, "Creation of the region iterator failed."); ++- hts_reglist_free(reglist, regcount); +++ hts_idx_destroy(bam_idx); +++ goto cleanup; ++ } ++- } else { ++- fprintf(samtools_stderr, "Creation of the region list failed."); ++ } ++- ++- hts_idx_destroy(bam_idx); +++ hts_itr_multi_destroy(iter); ++ } else { ++- fprintf(samtools_stderr, "Random alignment retrieval only works for indexed BAM files.\n"); +++ fprintf(samtools_stderr, "Multi-region iterator could not be created\n"); +++ hts_idx_destroy(bam_idx); +++ goto cleanup; ++ } ++- ++- bed_destroy(region_hash); +++ hts_idx_destroy(bam_idx); ++ } else { ++- fprintf(samtools_stderr, "Creation of the region hash table failed.\n"); +++ if (has_index_file) +++ fprintf(samtools_stderr, "Invalid index file '%s'\n", bam_idx_fname); +++ fprintf(samtools_stderr, "Random alignment retrieval only works for indexed files\n"); +++ goto cleanup; ++ } ++- } ++- else ++- { +++ } else { ++ if ( info->cov_threshold > 0 && !targets ) { ++- fprintf(samtools_stderr, "Coverage percentage calcuation requires a list of target regions\n"); +++ fprintf(samtools_stderr, "Coverage percentage calculation requires a list of target regions\n"); ++ goto cleanup; ++ } ++ ++ // Stream through the entire BAM ignoring off-target regions if -t is given ++- int ret; ++ while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) { ++ if (info->split_tag) { ++ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); ++@@ -2196,7 +2511,7 @@ ++ ++ if (ret < -1) { ++ 
fprintf(samtools_stderr, "Failure while decoding file\n"); ++- return 1; +++ goto cleanup; ++ } ++ } ++ ++@@ -2205,15 +2520,19 @@ ++ if (info->split_tag) ++ output_split_stats(split_hash, bam_fname, sparse); ++ +++ ret = 0; ++ cleanup: ++ bam_destroy1(bam_line); ++- bam_hdr_destroy(info->sam_header); +++ sam_hdr_destroy(info->sam_header); ++ sam_global_args_free(&ga); ++ +++cleanup_read_pairs: +++ cleanup_overlaps(read_pairs, INT64_MAX); +++cleanup_split_hash: +++ destroy_split_stats(split_hash); +++cleanup_all_stats: ++ cleanup_stats(all_stats); ++ cleanup_stats_info(info); ++- destroy_split_stats(split_hash); ++- cleanup_overlaps(read_pairs, INT_MAX); ++ ++- return 0; +++ return ret; ++ } ++--- python-pysam.orig/samtools/stats_isize.c +++++ python-pysam/samtools/stats_isize.c ++@@ -1,6 +1,6 @@ ++ /* stats_isize.c -- generalised insert size calculation for samtools stats. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014, 2018 Genome Research Ltd. ++ ++ Author: Nicholas Clarke ++ ++@@ -162,12 +162,23 @@ ++ if (bound <= 0) { ++ // Use sparse data structure. 
++ isize_sparse_data_t *data = (isize_sparse_data_t *) malloc(sizeof(isize_sparse_data_t)); +++ if (!data) +++ return NULL; ++ ++ // Initialise ++ data->max = 0; ++ data->array = kh_init(m32); +++ if (!data->array) { +++ free(data); +++ return NULL; +++ } ++ ++ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); +++ if (!isize) { +++ kh_destroy(m32, data->array); +++ free(data); +++ return NULL; +++ } ++ ++ isize->data.sparse = data; ++ isize->nitems = & sparse_nitems; ++@@ -192,13 +203,20 @@ ++ uint64_t* out = calloc(bound,sizeof(uint64_t)); ++ uint64_t* other = calloc(bound,sizeof(uint64_t)); ++ isize_dense_data_t *rec = (isize_dense_data_t *)malloc(sizeof(isize_dense_data_t)); +++ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); +++ if (!in || !out || !other || !rec || !isize) { +++ free(in); +++ free(out); +++ free(other); +++ free(rec); +++ free(isize); +++ return NULL; +++ } ++ rec->isize_inward = in; ++ rec->isize_outward = out; ++ rec->isize_other = other; ++ rec->total=bound; ++ ++- isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); ++- ++ isize->data.dense = rec; ++ isize->nitems = & dense_nitems; ++ ++--- python-pysam.orig/samtools/stats_isize.c.pysam.c +++++ python-pysam/samtools/stats_isize.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* stats_isize.c -- generalised insert size calculation for samtools stats. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014, 2018 Genome Research Ltd. ++ ++ Author: Nicholas Clarke ++ ++@@ -164,12 +164,23 @@ ++ if (bound <= 0) { ++ // Use sparse data structure. 
++ isize_sparse_data_t *data = (isize_sparse_data_t *) malloc(sizeof(isize_sparse_data_t)); +++ if (!data) +++ return NULL; ++ ++ // Initialise ++ data->max = 0; ++ data->array = kh_init(m32); +++ if (!data->array) { +++ free(data); +++ return NULL; +++ } ++ ++ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); +++ if (!isize) { +++ kh_destroy(m32, data->array); +++ free(data); +++ return NULL; +++ } ++ ++ isize->data.sparse = data; ++ isize->nitems = & sparse_nitems; ++@@ -194,13 +205,20 @@ ++ uint64_t* out = calloc(bound,sizeof(uint64_t)); ++ uint64_t* other = calloc(bound,sizeof(uint64_t)); ++ isize_dense_data_t *rec = (isize_dense_data_t *)malloc(sizeof(isize_dense_data_t)); +++ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); +++ if (!in || !out || !other || !rec || !isize) { +++ free(in); +++ free(out); +++ free(other); +++ free(rec); +++ free(isize); +++ return NULL; +++ } ++ rec->isize_inward = in; ++ rec->isize_outward = out; ++ rec->isize_other = other; ++ rec->total=bound; ++ ++- isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); ++- ++ isize->data.dense = rec; ++ isize->nitems = & dense_nitems; ++ ++--- python-pysam.orig/samtools/test/merge/test_bam_translate.c +++++ python-pysam/samtools/test/merge/test_bam_translate.c ++@@ -31,10 +31,11 @@ ++ #include ++ #include ++ #include +++#include ++ ++ void dump_read(bam1_t* b) { ++ printf("->core.tid:(%d)\n", b->core.tid); ++- printf("->core.pos:(%d)\n", b->core.pos); +++ printf("->core.pos:(%"PRId64")\n", (int64_t) b->core.pos); ++ printf("->core.bin:(%d)\n", b->core.bin); ++ printf("->core.qual:(%d)\n", b->core.qual); ++ printf("->core.l_qname:(%d)\n", b->core.l_qname); ++@@ -42,8 +43,8 @@ ++ printf("->core.n_cigar:(%d)\n", b->core.n_cigar); ++ printf("->core.l_qseq:(%d)\n", b->core.l_qseq); ++ printf("->core.mtid:(%d)\n", b->core.mtid); ++- printf("->core.mpos:(%d)\n", b->core.mpos); ++- printf("->core.isize:(%d)\n", b->core.isize); +++ printf("->core.mpos:(%"PRId64")\n", (int64_t) b->core.mpos); 
+++ printf("->core.isize:(%"PRId64")\n", (int64_t) b->core.isize); ++ if (b->data) { ++ printf("->data:"); ++ int i; ++@@ -146,7 +147,7 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->rg_trans, iter) = strdup("goodbye"); +++ kh_value(tbl->rg_trans, iter) = "goodbye"; ++ ++ b->core.tid = 0; ++ b->core.pos = 1334; ++@@ -186,7 +187,7 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter = kh_put(c2c, tbl->pg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->pg_trans,iter) = strdup("goodbye"); +++ kh_value(tbl->pg_trans,iter) = "goodbye"; ++ ++ ++ b->core.tid = 0; ++@@ -302,9 +303,9 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter_rg = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->rg_trans, iter_rg) = strdup("goodbye"); +++ kh_value(tbl->rg_trans, iter_rg) = "goodbye"; ++ khiter_t iter_pg = kh_put(c2c, tbl->pg_trans, strdup("quail"), &in_there); ++- kh_value(tbl->pg_trans, iter_pg) = strdup("bird"); +++ kh_value(tbl->pg_trans, iter_pg) = "bird"; ++ ++ ++ b->core.tid = 0; ++--- python-pysam.orig/samtools/test/merge/test_bam_translate.c.pysam.c +++++ python-pysam/samtools/test/merge/test_bam_translate.c.pysam.c ++@@ -33,10 +33,11 @@ ++ #include ++ #include ++ #include +++#include ++ ++ void dump_read(bam1_t* b) { ++ fprintf(samtools_stdout, "->core.tid:(%d)\n", b->core.tid); ++- fprintf(samtools_stdout, "->core.pos:(%d)\n", b->core.pos); +++ fprintf(samtools_stdout, "->core.pos:(%"PRId64")\n", (int64_t) b->core.pos); ++ fprintf(samtools_stdout, "->core.bin:(%d)\n", b->core.bin); ++ fprintf(samtools_stdout, "->core.qual:(%d)\n", b->core.qual); ++ fprintf(samtools_stdout, "->core.l_qname:(%d)\n", b->core.l_qname); ++@@ -44,8 +45,8 @@ ++ fprintf(samtools_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar); ++ fprintf(samtools_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq); ++ fprintf(samtools_stdout, 
"->core.mtid:(%d)\n", b->core.mtid); ++- fprintf(samtools_stdout, "->core.mpos:(%d)\n", b->core.mpos); ++- fprintf(samtools_stdout, "->core.isize:(%d)\n", b->core.isize); +++ fprintf(samtools_stdout, "->core.mpos:(%"PRId64")\n", (int64_t) b->core.mpos); +++ fprintf(samtools_stdout, "->core.isize:(%"PRId64")\n", (int64_t) b->core.isize); ++ if (b->data) { ++ fprintf(samtools_stdout, "->data:"); ++ int i; ++@@ -148,7 +149,7 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->rg_trans, iter) = strdup("goodbye"); +++ kh_value(tbl->rg_trans, iter) = "goodbye"; ++ ++ b->core.tid = 0; ++ b->core.pos = 1334; ++@@ -188,7 +189,7 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter = kh_put(c2c, tbl->pg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->pg_trans,iter) = strdup("goodbye"); +++ kh_value(tbl->pg_trans,iter) = "goodbye"; ++ ++ ++ b->core.tid = 0; ++@@ -304,9 +305,9 @@ ++ tbl->tid_trans[3] = 8; ++ int in_there = 0; ++ khiter_t iter_rg = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); ++- kh_value(tbl->rg_trans, iter_rg) = strdup("goodbye"); +++ kh_value(tbl->rg_trans, iter_rg) = "goodbye"; ++ khiter_t iter_pg = kh_put(c2c, tbl->pg_trans, strdup("quail"), &in_there); ++- kh_value(tbl->pg_trans, iter_pg) = strdup("bird"); +++ kh_value(tbl->pg_trans, iter_pg) = "bird"; ++ ++ ++ b->core.tid = 0; ++--- python-pysam.orig/samtools/test/merge/test_trans_tbl_init.c +++++ python-pysam/samtools/test/merge/test_trans_tbl_init.c ++@@ -1,6 +1,6 @@ ++ /* test/merge/test_trans_tbl_init.c -- merge test harness. ++ ++- Copyright (C) 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2013-2016, 2019 Genome Research Ltd. ++ ++ Author: Martin O. 
Pollard ++ ++@@ -27,18 +27,19 @@ ++ #include "../../bam_sort.c" ++ #include ++ #include +++#include ++ ++ typedef struct refseq_info { ++ const char *name; ++ uint32_t len; ++ } refseq_info_t; ++ ++-void dump_header(bam_hdr_t* hdr) { ++- printf("->n_targets:(%d)\n", hdr->n_targets); +++void dump_header(sam_hdr_t* hdr) { +++ printf("->n_targets:(%d)\n", sam_hdr_nref(hdr)); ++ int i; ++- for (i = 0; i < hdr->n_targets; ++i) { ++- printf("->target_name[%d]:(%s)\n",i,hdr->target_name[i]); ++- printf("->target_len[%d]:(%d)\n",i,hdr->target_len[i]); +++ for (i = 0; i < sam_hdr_nref(hdr); ++i) { +++ printf("->target_name[%d]:(%s)\n", i, sam_hdr_tid2name(hdr, i)); +++ printf("->target_len[%d]:(%"PRId64")\n", i, (int64_t) sam_hdr_tid2len(hdr, i)); ++ } ++ ++ printf("->text:("); ++@@ -46,7 +47,7 @@ ++ printf(")\n"); ++ } ++ ++-static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { +++static int populate_merged_header(sam_hdr_t *hdr, merged_header_t *merged_hdr) { ++ trans_tbl_t dummy; ++ int res; ++ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); ++@@ -56,55 +57,35 @@ ++ ++ /* ++ * Populate merged_hdr with data from bam0_header_text and bam0_refseqs. ++- * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs. +++ * Return sam_hdr_t based on the content in bam1_header_text and bam1_refseqs. 
++ */ ++ ++-bam_hdr_t * setup_test(const char *bam0_header_text, +++sam_hdr_t * setup_test(const char *bam0_header_text, ++ const refseq_info_t *bam0_refseqs, ++ int32_t bam0_n_refseqs, ++ const char *bam1_header_text, ++ const refseq_info_t *bam1_refseqs, ++ int32_t bam1_n_refseqs, ++ merged_header_t *merged_hdr) { ++- bam_hdr_t* bam0 = NULL; ++- bam_hdr_t* bam1 = NULL; ++- int32_t i; ++- ++- bam0 = bam_hdr_init(); ++- bam0->text = strdup(bam0_header_text); ++- if (!bam0->text) goto fail; ++- bam0->l_text = strlen(bam0_header_text); ++- bam0->n_targets = 1; ++- bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*)); ++- bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t)); ++- for (i = 0; i < bam0_n_refseqs; i++) { ++- bam0->target_name[i] = strdup(bam0_refseqs[i].name); ++- if (!bam0->target_name[i]) goto fail; ++- bam0->target_len[i] = bam0_refseqs[i].len; ++- } +++ sam_hdr_t* bam0 = NULL; +++ sam_hdr_t* bam1 = NULL; +++ +++ bam0 = sam_hdr_init(); +++ if (!bam0 || -1 == sam_hdr_add_lines(bam0, bam0_header_text, strlen(bam0_header_text))) +++ goto fail; ++ ++ if (populate_merged_header(bam0, merged_hdr)) goto fail; ++ ++- bam1 = bam_hdr_init(); ++- if (!bam1) goto fail; ++- bam1->text = strdup(bam1_header_text); ++- if (!bam1->text) goto fail; ++- bam1->l_text = strlen(bam1_header_text); ++- bam1->n_targets = bam1_n_refseqs; ++- bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*)); ++- bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t)); ++- for (i = 0; i < bam1_n_refseqs; i++) { ++- bam1->target_name[i] = strdup(bam1_refseqs[i].name); ++- if (!bam1->target_name[i]) goto fail; ++- bam1->target_len[i] = bam1_refseqs[i].len; ++- } +++ bam1 = sam_hdr_init(); +++ if (!bam1 || -1 == sam_hdr_add_lines(bam1, bam1_header_text, strlen(bam1_header_text))) +++ goto fail; ++ ++- bam_hdr_destroy(bam0); +++ sam_hdr_destroy(bam0); ++ return bam1; ++ ++ fail: ++- bam_hdr_destroy(bam1); ++- bam_hdr_destroy(bam0); +++ 
sam_hdr_destroy(bam1); +++ sam_hdr_destroy(bam0); ++ return NULL; ++ } ++ ++@@ -126,18 +107,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_1_trans_text, test_1_refs, NELE(test_1_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_1(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_1_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen( test_1_trans_text) ++- || translate->n_targets != 1 +++ strncmp(test_1_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen( test_1_trans_text) +++ || sam_hdr_nref(translate) != 1 ++ ) return false; ++ ++ // Check output header ++@@ -148,7 +129,7 @@ ++ regex_t check_regex; ++ regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); ++ ++- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 1 ) return false; +++ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 1 ) return false; ++ ++ regfree(&check_regex); ++ ++@@ -161,25 +142,24 @@ ++ static const char test_2_trans_text[] = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ "@SQ\tSN:donkey\tLN:133\n" ++-"@SQ\tSN:fish\tLN:133"; +++"@SQ\tSN:fish\tLN:133\n"; ++ ++ static const refseq_info_t test_2_refs[2] = { ++ { "donkey", 133 }, ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_2_trans_text, test_2_refs, NELE(test_2_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_2(sam_hdr_t* translate, sam_hdr_t* out, 
trans_tbl_t* tbl) { ++ // Check input is unchanged ++- if ( ++- strncmp(test_2_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_2_trans_text) ++- || translate->n_targets != 2 +++ if (sam_hdr_length(translate) != strlen(test_2_trans_text) +++ || strncmp(test_2_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ ++ // Check output header ++@@ -191,7 +171,7 @@ ++ regex_t check_regex; ++ regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); ++ ++- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 2 ) return false; +++ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 2 ) return false; ++ ++ regfree(&check_regex); ++ ++@@ -212,18 +192,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_3_trans_text, test_3_refs, NELE(test_3_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_3(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_3_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_3_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_3_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_3_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -239,7 +219,7 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { ++ const char* t4_init_text = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ "@SQ\tSN:fish\tLN:133\tSP:frog\n" ++@@ -250,12 +230,12 @@ ++ merged_hdr); ++ } ++ ++-bool 
check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_4(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_4_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_4_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_4_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_4_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -273,7 +253,7 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { ++ const char* t5_init_text = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ "@SQ\tSN:fish\tLN:133\tSP:frog\n" ++@@ -286,12 +266,12 @@ ++ merged_hdr); ++ } ++ ++-bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_5(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_5_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_5_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_5_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_5_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -309,18 +289,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_6_trans_text, test_6_refs, NELE(test_6_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_6(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_6_trans_text, 
translate->text, translate->l_text) ++- || translate->l_text != strlen(test_5_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_6_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_5_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -346,8 +326,8 @@ ++ const long GIMMICK_SEED = 0x1234330e; ++ srand48(GIMMICK_SEED); ++ ++- bam_hdr_t* out; ++- bam_hdr_t* translate; +++ sam_hdr_t* out; +++ sam_hdr_t* translate; ++ ++ if (verbose) printf("BEGIN test 1\n"); ++ // setup ++@@ -362,7 +342,8 @@ ++ } ++ if (verbose) printf("RUN test 1\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -380,8 +361,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_1); ++ if (verbose) printf("END test 1\n"); ++ ++@@ -399,7 +380,8 @@ ++ } ++ if (verbose) printf("RUN test 2\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 2\n"); ++ if (verbose > 1) { ++@@ -417,8 +399,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_2); ++ if (verbose) printf("END test 2\n"); ++ ++@@ -435,7 +417,8 @@ ++ } ++ if (verbose) printf("RUN test 3\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ 
finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 3\n"); ++ if (verbose > 1) { ++@@ -453,8 +436,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_3); ++ if (verbose) printf("END test 3\n"); ++ ++@@ -471,7 +454,8 @@ ++ } ++ if (verbose) printf("RUN test 4\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 4\n"); ++ if (verbose > 1) { ++@@ -489,8 +473,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_4); ++ if (verbose) printf("END test 4\n"); ++ ++@@ -508,7 +492,8 @@ ++ } ++ if (verbose) printf("RUN test 5\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 5\n"); ++ if (verbose > 1) { ++@@ -526,8 +511,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_5); ++ if (verbose) printf("END test 5\n"); ++ ++@@ -544,7 +529,8 @@ ++ } ++ if (verbose) printf("RUN test 6\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) printf("END RUN test 6\n"); ++ if (verbose > 1) { ++@@ -562,8 +548,8 @@ ++ ++failure; 
++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_6); ++ if (verbose) printf("END test 6\n"); ++ ++--- python-pysam.orig/samtools/test/merge/test_trans_tbl_init.c.pysam.c +++++ python-pysam/samtools/test/merge/test_trans_tbl_init.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* test/merge/test_trans_tbl_init.c -- merge test harness. ++ ++- Copyright (C) 2013, 2014 Genome Research Ltd. +++ Copyright (C) 2013-2016, 2019 Genome Research Ltd. ++ ++ Author: Martin O. Pollard ++ ++@@ -29,18 +29,19 @@ ++ #include "../../bam_sort.c" ++ #include ++ #include +++#include ++ ++ typedef struct refseq_info { ++ const char *name; ++ uint32_t len; ++ } refseq_info_t; ++ ++-void dump_header(bam_hdr_t* hdr) { ++- fprintf(samtools_stdout, "->n_targets:(%d)\n", hdr->n_targets); +++void dump_header(sam_hdr_t* hdr) { +++ fprintf(samtools_stdout, "->n_targets:(%d)\n", sam_hdr_nref(hdr)); ++ int i; ++- for (i = 0; i < hdr->n_targets; ++i) { ++- fprintf(samtools_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]); ++- fprintf(samtools_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]); +++ for (i = 0; i < sam_hdr_nref(hdr); ++i) { +++ fprintf(samtools_stdout, "->target_name[%d]:(%s)\n", i, sam_hdr_tid2name(hdr, i)); +++ fprintf(samtools_stdout, "->target_len[%d]:(%"PRId64")\n", i, (int64_t) sam_hdr_tid2len(hdr, i)); ++ } ++ ++ fprintf(samtools_stdout, "->text:("); ++@@ -48,7 +49,7 @@ ++ fprintf(samtools_stdout, ")\n"); ++ } ++ ++-static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { +++static int populate_merged_header(sam_hdr_t *hdr, merged_header_t *merged_hdr) { ++ trans_tbl_t dummy; ++ int res; ++ res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); ++@@ -58,55 +59,35 @@ ++ ++ /* ++ * Populate merged_hdr with data from bam0_header_text and bam0_refseqs. ++- * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs. 
+++ * Return sam_hdr_t based on the content in bam1_header_text and bam1_refseqs. ++ */ ++ ++-bam_hdr_t * setup_test(const char *bam0_header_text, +++sam_hdr_t * setup_test(const char *bam0_header_text, ++ const refseq_info_t *bam0_refseqs, ++ int32_t bam0_n_refseqs, ++ const char *bam1_header_text, ++ const refseq_info_t *bam1_refseqs, ++ int32_t bam1_n_refseqs, ++ merged_header_t *merged_hdr) { ++- bam_hdr_t* bam0 = NULL; ++- bam_hdr_t* bam1 = NULL; ++- int32_t i; ++- ++- bam0 = bam_hdr_init(); ++- bam0->text = strdup(bam0_header_text); ++- if (!bam0->text) goto fail; ++- bam0->l_text = strlen(bam0_header_text); ++- bam0->n_targets = 1; ++- bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*)); ++- bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t)); ++- for (i = 0; i < bam0_n_refseqs; i++) { ++- bam0->target_name[i] = strdup(bam0_refseqs[i].name); ++- if (!bam0->target_name[i]) goto fail; ++- bam0->target_len[i] = bam0_refseqs[i].len; ++- } +++ sam_hdr_t* bam0 = NULL; +++ sam_hdr_t* bam1 = NULL; +++ +++ bam0 = sam_hdr_init(); +++ if (!bam0 || -1 == sam_hdr_add_lines(bam0, bam0_header_text, strlen(bam0_header_text))) +++ goto fail; ++ ++ if (populate_merged_header(bam0, merged_hdr)) goto fail; ++ ++- bam1 = bam_hdr_init(); ++- if (!bam1) goto fail; ++- bam1->text = strdup(bam1_header_text); ++- if (!bam1->text) goto fail; ++- bam1->l_text = strlen(bam1_header_text); ++- bam1->n_targets = bam1_n_refseqs; ++- bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*)); ++- bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t)); ++- for (i = 0; i < bam1_n_refseqs; i++) { ++- bam1->target_name[i] = strdup(bam1_refseqs[i].name); ++- if (!bam1->target_name[i]) goto fail; ++- bam1->target_len[i] = bam1_refseqs[i].len; ++- } +++ bam1 = sam_hdr_init(); +++ if (!bam1 || -1 == sam_hdr_add_lines(bam1, bam1_header_text, strlen(bam1_header_text))) +++ goto fail; ++ ++- bam_hdr_destroy(bam0); +++ sam_hdr_destroy(bam0); ++ 
return bam1; ++ ++ fail: ++- bam_hdr_destroy(bam1); ++- bam_hdr_destroy(bam0); +++ sam_hdr_destroy(bam1); +++ sam_hdr_destroy(bam0); ++ return NULL; ++ } ++ ++@@ -128,18 +109,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_1_trans_text, test_1_refs, NELE(test_1_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_1(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_1_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen( test_1_trans_text) ++- || translate->n_targets != 1 +++ strncmp(test_1_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen( test_1_trans_text) +++ || sam_hdr_nref(translate) != 1 ++ ) return false; ++ ++ // Check output header ++@@ -150,7 +131,7 @@ ++ regex_t check_regex; ++ regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); ++ ++- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 1 ) return false; +++ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 1 ) return false; ++ ++ regfree(&check_regex); ++ ++@@ -163,25 +144,24 @@ ++ static const char test_2_trans_text[] = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ "@SQ\tSN:donkey\tLN:133\n" ++-"@SQ\tSN:fish\tLN:133"; +++"@SQ\tSN:fish\tLN:133\n"; ++ ++ static const refseq_info_t test_2_refs[2] = { ++ { "donkey", 133 }, ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_2_trans_text, test_2_refs, NELE(test_2_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, 
trans_tbl_t* tbl) { +++bool check_test_2(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++- if ( ++- strncmp(test_2_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_2_trans_text) ++- || translate->n_targets != 2 +++ if (sam_hdr_length(translate) != strlen(test_2_trans_text) +++ || strncmp(test_2_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ ++ // Check output header ++@@ -193,7 +173,7 @@ ++ regex_t check_regex; ++ regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); ++ ++- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 2 ) return false; +++ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 2 ) return false; ++ ++ regfree(&check_regex); ++ ++@@ -214,18 +194,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_3_trans_text, test_3_refs, NELE(test_3_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_3(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_3_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_3_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_3_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_3_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -241,7 +221,7 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { ++ const char* t4_init_text = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ 
"@SQ\tSN:fish\tLN:133\tSP:frog\n" ++@@ -252,12 +232,12 @@ ++ merged_hdr); ++ } ++ ++-bool check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_4(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_4_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_4_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_4_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_4_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -275,7 +255,7 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { ++ const char* t5_init_text = ++ "@HD\tVN:1.4\tSO:unknown\n" ++ "@SQ\tSN:fish\tLN:133\tSP:frog\n" ++@@ -288,12 +268,12 @@ ++ merged_hdr); ++ } ++ ++-bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_5(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_5_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_5_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_5_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_5_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -311,18 +291,18 @@ ++ { "fish", 133 } ++ }; ++ ++-bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { +++sam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { ++ return setup_test(init_text, init_refs, NELE(init_refs), ++ test_6_trans_text, test_6_refs, NELE(test_6_refs), ++ merged_hdr); ++ } ++ ++-bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { +++bool check_test_6(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* 
tbl) { ++ // Check input is unchanged ++ if ( ++- strncmp(test_6_trans_text, translate->text, translate->l_text) ++- || translate->l_text != strlen(test_5_trans_text) ++- || translate->n_targets != 2 +++ strncmp(test_6_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) +++ || sam_hdr_length(translate) != strlen(test_5_trans_text) +++ || sam_hdr_nref(translate) != 2 ++ ) return false; ++ return true; ++ } ++@@ -348,8 +328,8 @@ ++ const long GIMMICK_SEED = 0x1234330e; ++ srand48(GIMMICK_SEED); ++ ++- bam_hdr_t* out; ++- bam_hdr_t* translate; +++ sam_hdr_t* out; +++ sam_hdr_t* translate; ++ ++ if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); ++ // setup ++@@ -364,7 +344,8 @@ ++ } ++ if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -382,8 +363,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_1); ++ if (verbose) fprintf(samtools_stdout, "END test 1\n"); ++ ++@@ -401,7 +382,8 @@ ++ } ++ if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); ++ if (verbose > 1) { ++@@ -419,8 +401,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_2); ++ if (verbose) fprintf(samtools_stdout, "END test 2\n"); ++ ++@@ -437,7 
+419,8 @@ ++ } ++ if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); ++ if (verbose > 1) { ++@@ -455,8 +438,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_3); ++ if (verbose) fprintf(samtools_stdout, "END test 3\n"); ++ ++@@ -473,7 +456,8 @@ ++ } ++ if (verbose) fprintf(samtools_stdout, "RUN test 4\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 4\n"); ++ if (verbose > 1) { ++@@ -491,8 +475,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_4); ++ if (verbose) fprintf(samtools_stdout, "END test 4\n"); ++ ++@@ -510,7 +494,8 @@ ++ } ++ if (verbose) fprintf(samtools_stdout, "RUN test 5\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 5\n"); ++ if (verbose > 1) { ++@@ -528,8 +513,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_5); ++ if (verbose) fprintf(samtools_stdout, "END test 5\n"); ++ ++@@ -546,7 +531,8 @@ ++ } ++ if (verbose) 
fprintf(samtools_stdout, "RUN test 6\n"); ++ trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); ++- out = finish_merged_header(merged_hdr); +++ finish_merged_header(merged_hdr); +++ out = merged_hdr->hdr; ++ free_merged_header(merged_hdr); ++ if (verbose) fprintf(samtools_stdout, "END RUN test 6\n"); ++ if (verbose > 1) { ++@@ -564,8 +550,8 @@ ++ ++failure; ++ } ++ // teardown ++- bam_hdr_destroy(translate); ++- bam_hdr_destroy(out); +++ sam_hdr_destroy(translate); +++ sam_hdr_destroy(out); ++ trans_tbl_destroy(&tbl_6); ++ if (verbose) fprintf(samtools_stdout, "END test 6\n"); ++ ++--- python-pysam.orig/samtools/test/split/test_count_rg.c +++++ python-pysam/samtools/test/split/test_count_rg.c ++@@ -1,6 +1,6 @@ ++ /* test/split/test_count_rg.c -- split test cases. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014, 2019 Genome Research Ltd. ++ ++ Author: Martin O. Pollard ++ ++@@ -29,15 +29,14 @@ ++ #include ++ #include ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:150\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++ int main(int argc, char**argv) ++@@ -66,13 +65,14 @@ ++ ++ // Setup stderr redirect ++ kstring_t res = { 0, 0, NULL }; ++- FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr +++ int orig_stderr = dup(STDERR_FILENO); // Save stderr +++ int redirected_stderr; ++ char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; ++ FILE* check = NULL; ++ ++ // setup ++ if (verbose) printf("BEGIN test 1\n"); // TID test ++- bam_hdr_t* hdr1; +++ sam_hdr_t* hdr1; ++ size_t count; ++ char** output; ++ setup_test_1(&hdr1); ++@@ -83,9 +83,9 @@ ++ if (verbose) printf("RUN test 1\n"); ++ ++ // test ++- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe +++ redirected_stderr = redirect_stderr(tempfname); ++ bool result_1 = count_RG(hdr1, &count, &output); ++- fclose(stderr); +++ flush_and_restore_stderr(orig_stderr, redirected_stderr); ++ ++ if (verbose) printf("END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -111,15 +111,15 @@ ++ free(output[i]); ++ } ++ free(output); ++- bam_hdr_destroy(hdr1); +++ sam_hdr_destroy(hdr1); ++ if (verbose) printf("END test 1\n"); ++ ++ // Cleanup ++ free(res.s); ++ remove(tempfname); ++ if (failure > 0) ++- fprintf(orig_stderr, "%d failures %d successes\n", failure, success); ++- fclose(orig_stderr); +++ fprintf(stderr, "%d failures %d successes\n", failure, success); +++ close(orig_stderr); ++ ++ return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/test/split/test_count_rg.c.pysam.c +++++ python-pysam/samtools/test/split/test_count_rg.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* test/split/test_count_rg.c -- split test cases. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014, 2019 Genome Research Ltd. ++ ++ Author: Martin O. 
Pollard ++ ++@@ -31,15 +31,14 @@ ++ #include ++ #include ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:150\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++ int samtools_test_count_rg_main(int argc, char**argv) ++@@ -68,13 +67,14 @@ ++ ++ // Setup samtools_stderr redirect ++ kstring_t res = { 0, 0, NULL }; ++- FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr +++ int orig_samtools_stderr = dup(STDERR_FILENO); // Save samtools_stderr +++ int redirected_samtools_stderr; ++ char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp"; ++ FILE* check = NULL; ++ ++ // setup ++ if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // TID test ++- bam_hdr_t* hdr1; +++ sam_hdr_t* hdr1; ++ size_t count; ++ char** output; ++ setup_test_1(&hdr1); ++@@ -85,9 +85,9 @@ ++ if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); ++ ++ // test ++- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe +++ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); ++ bool result_1 = count_RG(hdr1, &count, &output); ++- fclose(samtools_stderr); +++ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); ++ ++ if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -113,15 +113,15 @@ ++ free(output[i]); ++ } ++ free(output); ++- bam_hdr_destroy(hdr1); +++ sam_hdr_destroy(hdr1); ++ if (verbose) fprintf(samtools_stdout, "END test 1\n"); ++ ++ // Cleanup ++ free(res.s); ++ remove(tempfname); ++ if (failure > 0) ++- fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); ++- fclose(orig_samtools_stderr); +++ fprintf(samtools_stderr, "%d failures 
%d successes\n", failure, success); +++ close(orig_samtools_stderr); ++ ++ return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/test/split/test_expand_format_string.c +++++ python-pysam/samtools/test/split/test_expand_format_string.c ++@@ -29,15 +29,14 @@ ++ #include ++ #include ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++ "@SQ\tSN:blah\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++ int main(int argc, char**argv) ++--- python-pysam.orig/samtools/test/split/test_expand_format_string.c.pysam.c +++++ python-pysam/samtools/test/split/test_expand_format_string.c.pysam.c ++@@ -31,15 +31,14 @@ ++ #include ++ #include ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++ "@SQ\tSN:blah\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++ int samtools_test_expand_format_string_main(int argc, char**argv) ++--- python-pysam.orig/samtools/test/split/test_filter_header_rg.c +++++ python-pysam/samtools/test/split/test_filter_header_rg.c ++@@ -1,6 +1,6 @@ ++ /* test/split/test_filter_header_rg.c -- split test cases. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: Martin O. 
Pollard ++ ++@@ -24,61 +24,133 @@ ++ ++ #include ++ ++-#include "../../bam_split.c" ++ #include "../test.h" ++ #include +++#include +++#include "samtools.h" +++#include +++#include +++#include "htslib/kstring.h" +++ +++int line_cmp(const void *av, const void *bv) { +++ const char *a = *(const char **) av; +++ const char *b = *(const char **) bv; +++ size_t al = strcspn(a, "\n"); +++ size_t bl = strcspn(b, "\n"); +++ size_t min = al < bl ? al : bl; +++ int m = memcmp(a, b, min); +++ if (m != 0) return m; +++ if (al < bl) return -1; +++ return al == bl ? 0 : 1; +++} +++ +++bool hdrcmp(const char *hdr1, const char *hdr2) { +++ size_t nl1, nl2, count1 = 0, count2 = 0, i; +++ const char *l; +++ const char **lines1, **lines2; +++ int res = 0; +++ +++ // First line should be @HD +++ if (strncmp(hdr1, "@HD\t", 4) != 0) return false; +++ if (strncmp(hdr2, "@HD\t", 4) != 0) return false; +++ nl1 = strcspn(hdr1, "\n"); +++ nl2 = strcspn(hdr2, "\n"); +++ if (nl1 != nl2 || memcmp(hdr1, hdr2, nl1) != 0) return false; +++ +++ // Count lines. 
+++ for (l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count1; +++ for (l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count2; +++ if (count1 != count2) return false; +++ +++ lines1 = malloc(count1 * sizeof(*lines1)); +++ if (!lines1) return false; +++ lines2 = malloc(count2 * sizeof(*lines2)); +++ if (!lines2) { free(lines1); return false; } +++ +++ for (i = 0, l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) +++ lines1[i++] = ++l; +++ for (i = 0, l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) +++ lines2[i++] = ++l; +++ +++ qsort(lines1, count1, sizeof(*lines1), line_cmp); +++ qsort(lines2, count2, sizeof(*lines2), line_cmp); +++ +++ for (i = 0; i < count1; i++) { +++ res = line_cmp(&lines1[i], &lines2[i]); +++ if (res != 0) break; +++ } +++ +++ free(lines1); +++ free(lines2); +++ +++ return res?false:true; +++} ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++-bool check_test_1(const bam_hdr_t* hdr) { +++bool check_test_1(sam_hdr_t* hdr) { ++ const char *test1_res = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; ++ ++- if (strcmp(hdr->text, test1_res)) { ++- return false; ++- } ++- return true; +++ return hdrcmp(sam_hdr_str(hdr), test1_res); ++ } ++ ++-void setup_test_2(bam_hdr_t** hdr_in) +++void setup_test_2(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test2 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test2); ++- (*hdr_in)->l_text = strlen(test2); +++ 
sam_hdr_add_lines(*hdr_in, test2, 0); ++ } ++ ++-bool check_test_2(const bam_hdr_t* hdr) { +++bool check_test_2(sam_hdr_t* hdr) { ++ const char *test2_res = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n" ++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; ++ ++- if (strcmp(hdr->text, test2_res)) { ++- return false; ++- } ++- return true; +++ return hdrcmp(sam_hdr_str(hdr), test2_res); +++} +++ +++void setup_test_3(sam_hdr_t** hdr_in) +++{ +++ *hdr_in = sam_hdr_init(); +++ const char *test3 = +++ "@HD\tVN:1.4\n" +++ "@SQ\tSN:blah\tLN:1\n" +++ "@RG\tID:fish1\n" +++ "@RG\tID:fish2\n" +++ "@RG\tID:fish3\n" +++ "@RG\tID:fish4\n"; +++ sam_hdr_add_lines(*hdr_in, test3, 0); +++} +++ +++bool check_test_3(sam_hdr_t* hdr) { +++ const char *test3_res = +++ "@HD\tVN:1.4\n" +++ "@SQ\tSN:blah\tLN:1\n" +++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; +++ +++ return hdrcmp(sam_hdr_str(hdr), test3_res); ++ } ++ ++ int main(int argc, char *argv[]) ++ { ++ // test state ++- const int NUM_TESTS = 2; +++ const int NUM_TESTS = 3; ++ int verbose = 0; ++ int success = 0; ++ int failure = 0; ++@@ -103,13 +175,14 @@ ++ ++ // Setup stderr redirect ++ kstring_t res = { 0, 0, NULL }; ++- FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr +++ int orig_stderr = dup(STDERR_FILENO); // Save stderr +++ int redirected_stderr; ++ char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; ++ FILE* check = NULL; ++ ++ // setup ++ if (verbose) printf("BEGIN test 1\n"); // test eliminating a tag that isn't there ++- bam_hdr_t* hdr1; +++ sam_hdr_t* hdr1; ++ const char* id_to_keep_1 = "1#2.3"; ++ setup_test_1(&hdr1); ++ if (verbose > 1) { ++@@ -119,9 +192,13 @@ ++ if (verbose) printf("RUN test 1\n"); ++ ++ // test ++- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe ++- bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); ++- fclose(stderr); +++ redirected_stderr = redirect_stderr(tempfname); +++ bool result_1 = (!sam_hdr_remove_except(hdr1, "RG", "ID", id_to_keep_1) && +++ !sam_hdr_add_pg(hdr1, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)); +++ flush_and_restore_stderr(orig_stderr, redirected_stderr); ++ ++ if (verbose) printf("END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -144,11 +221,11 @@ ++ fclose(check); ++ ++ // teardown ++- bam_hdr_destroy(hdr1); +++ sam_hdr_destroy(hdr1); ++ if (verbose) printf("END test 1\n"); ++ ++ if (verbose) printf("BEGIN test 2\n"); // test eliminating a tag that is there ++- bam_hdr_t* hdr2; +++ sam_hdr_t* hdr2; ++ const char* id_to_keep_2 = "fish"; ++ setup_test_2(&hdr2); ++ if (verbose > 1) { ++@@ -158,9 +235,13 @@ ++ if (verbose) printf("RUN test 2\n"); ++ ++ // test ++- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe ++- bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); ++- fclose(stderr); +++ redirected_stderr = redirect_stderr(tempfname); +++ bool result_2 = (!sam_hdr_remove_except(hdr2, "RG", "ID", id_to_keep_2) && +++ !sam_hdr_add_pg(hdr2, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)); +++ flush_and_restore_stderr(orig_stderr, redirected_stderr); ++ ++ if (verbose) printf("END RUN test 2\n"); ++ if (verbose > 1) { ++@@ -183,17 +264,58 @@ ++ fclose(check); ++ ++ // teardown ++- bam_hdr_destroy(hdr2); +++ sam_hdr_destroy(hdr2); ++ if (verbose) printf("END test 2\n"); ++ +++ if (verbose) printf("BEGIN test 3\n"); // test eliminating a tag that is there +++ sam_hdr_t* hdr3; +++ setup_test_3(&hdr3); +++ if (verbose > 1) { +++ printf("hdr3\n"); +++ dump_hdr(hdr3); +++ } +++ if (verbose) printf("RUN test 3\n"); +++ +++ // test +++ redirected_stderr = redirect_stderr(tempfname); +++ bool result_3 = (!sam_hdr_remove_except(hdr3, "RG", NULL, NULL) && +++ !sam_hdr_add_pg(hdr3, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)); +++ flush_and_restore_stderr(orig_stderr, redirected_stderr); +++ +++ if (verbose) printf("END RUN test 3\n"); +++ if (verbose > 1) { +++ printf("hdr3\n"); +++ dump_hdr(hdr3); +++ } +++ +++ // check result +++ res.l = 0; +++ check = fopen(tempfname, "r"); +++ if ( result_3 +++ && check_test_3(hdr3) +++ && kgetline(&res, (kgets_func *)fgets, check) < 0 +++ && (feof(check) || res.l == 0)) { +++ ++success; +++ } else { +++ ++failure; +++ if (verbose) printf("FAIL test 3\n"); +++ } +++ fclose(check); +++ +++ // teardown +++ sam_hdr_destroy(hdr3); +++ if (verbose) printf("END test 3\n"); ++ ++ // Cleanup ++ free(res.s); ++ free(arg_list); ++ remove(tempfname); ++ if (failure > 0) ++- fprintf(orig_stderr, "%d failures %d successes\n", failure, success); ++- fclose(orig_stderr); +++ fprintf(stderr, "%d failures %d successes\n", failure, success); +++ close(orig_stderr); ++ ++ return (success == NUM_TESTS)? 
EXIT_SUCCESS : EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/test/split/test_filter_header_rg.c.pysam.c +++++ python-pysam/samtools/test/split/test_filter_header_rg.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* test/split/test_filter_header_rg.c -- split test cases. ++ ++- Copyright (C) 2014 Genome Research Ltd. +++ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. ++ ++ Author: Martin O. Pollard ++ ++@@ -26,61 +26,133 @@ ++ ++ #include ++ ++-#include "../../bam_split.c" ++ #include "../test.h" ++ #include +++#include +++#include "samtools.h" +++#include +++#include +++#include "htslib/kstring.h" +++ +++int line_cmp(const void *av, const void *bv) { +++ const char *a = *(const char **) av; +++ const char *b = *(const char **) bv; +++ size_t al = strcspn(a, "\n"); +++ size_t bl = strcspn(b, "\n"); +++ size_t min = al < bl ? al : bl; +++ int m = memcmp(a, b, min); +++ if (m != 0) return m; +++ if (al < bl) return -1; +++ return al == bl ? 0 : 1; +++} +++ +++bool hdrcmp(const char *hdr1, const char *hdr2) { +++ size_t nl1, nl2, count1 = 0, count2 = 0, i; +++ const char *l; +++ const char **lines1, **lines2; +++ int res = 0; +++ +++ // First line should be @HD +++ if (strncmp(hdr1, "@HD\t", 4) != 0) return false; +++ if (strncmp(hdr2, "@HD\t", 4) != 0) return false; +++ nl1 = strcspn(hdr1, "\n"); +++ nl2 = strcspn(hdr2, "\n"); +++ if (nl1 != nl2 || memcmp(hdr1, hdr2, nl1) != 0) return false; +++ +++ // Count lines. 
+++ for (l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count1; +++ for (l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count2; +++ if (count1 != count2) return false; +++ +++ lines1 = malloc(count1 * sizeof(*lines1)); +++ if (!lines1) return false; +++ lines2 = malloc(count2 * sizeof(*lines2)); +++ if (!lines2) { free(lines1); return false; } +++ +++ for (i = 0, l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) +++ lines1[i++] = ++l; +++ for (i = 0, l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) +++ lines2[i++] = ++l; +++ +++ qsort(lines1, count1, sizeof(*lines1), line_cmp); +++ qsort(lines2, count2, sizeof(*lines2), line_cmp); +++ +++ for (i = 0; i < count1; i++) { +++ res = line_cmp(&lines1[i], &lines2[i]); +++ if (res != 0) break; +++ } +++ +++ free(lines1); +++ free(lines2); +++ +++ return res?false:true; +++} ++ ++-void setup_test_1(bam_hdr_t** hdr_in) +++void setup_test_1(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test1 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test1); ++- (*hdr_in)->l_text = strlen(test1); +++ sam_hdr_add_lines(*hdr_in, test1, 0); ++ } ++ ++-bool check_test_1(const bam_hdr_t* hdr) { +++bool check_test_1(sam_hdr_t* hdr) { ++ const char *test1_res = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; ++ ++- if (strcmp(hdr->text, test1_res)) { ++- return false; ++- } ++- return true; +++ return hdrcmp(sam_hdr_str(hdr), test1_res); ++ } ++ ++-void setup_test_2(bam_hdr_t** hdr_in) +++void setup_test_2(sam_hdr_t** hdr_in) ++ { ++- *hdr_in = bam_hdr_init(); +++ *hdr_in = sam_hdr_init(); ++ const char *test2 = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n"; ++- (*hdr_in)->text = strdup(test2); ++- (*hdr_in)->l_text = strlen(test2); +++ 
sam_hdr_add_lines(*hdr_in, test2, 0); ++ } ++ ++-bool check_test_2(const bam_hdr_t* hdr) { +++bool check_test_2(sam_hdr_t* hdr) { ++ const char *test2_res = ++ "@HD\tVN:1.4\n" ++- "@SQ\tSN:blah\n" +++ "@SQ\tSN:blah\tLN:1\n" ++ "@RG\tID:fish\n" ++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; ++ ++- if (strcmp(hdr->text, test2_res)) { ++- return false; ++- } ++- return true; +++ return hdrcmp(sam_hdr_str(hdr), test2_res); +++} +++ +++void setup_test_3(sam_hdr_t** hdr_in) +++{ +++ *hdr_in = sam_hdr_init(); +++ const char *test3 = +++ "@HD\tVN:1.4\n" +++ "@SQ\tSN:blah\tLN:1\n" +++ "@RG\tID:fish1\n" +++ "@RG\tID:fish2\n" +++ "@RG\tID:fish3\n" +++ "@RG\tID:fish4\n"; +++ sam_hdr_add_lines(*hdr_in, test3, 0); +++} +++ +++bool check_test_3(sam_hdr_t* hdr) { +++ const char *test3_res = +++ "@HD\tVN:1.4\n" +++ "@SQ\tSN:blah\tLN:1\n" +++ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; +++ +++ return hdrcmp(sam_hdr_str(hdr), test3_res); ++ } ++ ++ int samtools_test_filter_header_rg_main(int argc, char *argv[]) ++ { ++ // test state ++- const int NUM_TESTS = 2; +++ const int NUM_TESTS = 3; ++ int verbose = 0; ++ int success = 0; ++ int failure = 0; ++@@ -105,13 +177,14 @@ ++ ++ // Setup samtools_stderr redirect ++ kstring_t res = { 0, 0, NULL }; ++- FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr +++ int orig_samtools_stderr = dup(STDERR_FILENO); // Save samtools_stderr +++ int redirected_samtools_stderr; ++ char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; ++ FILE* check = NULL; ++ ++ // setup ++ if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there ++- bam_hdr_t* hdr1; +++ sam_hdr_t* hdr1; ++ const char* id_to_keep_1 = "1#2.3"; ++ setup_test_1(&hdr1); ++ if (verbose > 1) { ++@@ -121,9 +194,13 @@ ++ if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); ++ ++ // test ++- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe ++- bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); ++- fclose(samtools_stderr); +++ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); +++ bool result_1 = (!sam_hdr_remove_except(hdr1, "RG", "ID", id_to_keep_1) && +++ !sam_hdr_add_pg(hdr1, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)); +++ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); ++ ++ if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); ++ if (verbose > 1) { ++@@ -146,11 +223,11 @@ ++ fclose(check); ++ ++ // teardown ++- bam_hdr_destroy(hdr1); +++ sam_hdr_destroy(hdr1); ++ if (verbose) fprintf(samtools_stdout, "END test 1\n"); ++ ++ if (verbose) fprintf(samtools_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there ++- bam_hdr_t* hdr2; +++ sam_hdr_t* hdr2; ++ const char* id_to_keep_2 = "fish"; ++ setup_test_2(&hdr2); ++ if (verbose > 1) { ++@@ -160,9 +237,13 @@ ++ if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); ++ ++ // test ++- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe ++- bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); ++- fclose(samtools_stderr); +++ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); +++ bool result_2 = (!sam_hdr_remove_except(hdr2, "RG", "ID", id_to_keep_2) && +++ !sam_hdr_add_pg(hdr2, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? 
arg_list : NULL, +++ NULL)); +++ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); ++ ++ if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); ++ if (verbose > 1) { ++@@ -185,17 +266,58 @@ ++ fclose(check); ++ ++ // teardown ++- bam_hdr_destroy(hdr2); +++ sam_hdr_destroy(hdr2); ++ if (verbose) fprintf(samtools_stdout, "END test 2\n"); ++ +++ if (verbose) fprintf(samtools_stdout, "BEGIN test 3\n"); // test eliminating a tag that is there +++ sam_hdr_t* hdr3; +++ setup_test_3(&hdr3); +++ if (verbose > 1) { +++ fprintf(samtools_stdout, "hdr3\n"); +++ dump_hdr(hdr3); +++ } +++ if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); +++ +++ // test +++ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); +++ bool result_3 = (!sam_hdr_remove_except(hdr3, "RG", NULL, NULL) && +++ !sam_hdr_add_pg(hdr3, "samtools", "VN", samtools_version(), +++ arg_list ? "CL": NULL, +++ arg_list ? arg_list : NULL, +++ NULL)); +++ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); +++ +++ if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); +++ if (verbose > 1) { +++ fprintf(samtools_stdout, "hdr3\n"); +++ dump_hdr(hdr3); +++ } +++ +++ // check result +++ res.l = 0; +++ check = fopen(tempfname, "r"); +++ if ( result_3 +++ && check_test_3(hdr3) +++ && kgetline(&res, (kgets_func *)fgets, check) < 0 +++ && (feof(check) || res.l == 0)) { +++ ++success; +++ } else { +++ ++failure; +++ if (verbose) fprintf(samtools_stdout, "FAIL test 3\n"); +++ } +++ fclose(check); +++ +++ // teardown +++ sam_hdr_destroy(hdr3); +++ if (verbose) fprintf(samtools_stdout, "END test 3\n"); ++ ++ // Cleanup ++ free(res.s); ++ free(arg_list); ++ remove(tempfname); ++ if (failure > 0) ++- fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); ++- fclose(orig_samtools_stderr); +++ fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); +++ close(orig_samtools_stderr); ++ ++ return (success 
== NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; ++ } ++--- python-pysam.orig/samtools/test/test.c +++++ python-pysam/samtools/test/test.c ++@@ -1,6 +1,6 @@ ++ /* test/test.c -- test harness utility routines. ++ ++- Copyright (C) 2014, 2016 Genome Research Ltd. +++ Copyright (C) 2014, 2016, 2019 Genome Research Ltd. ++ ++ Author: Martin O. Pollard ++ ++@@ -28,6 +28,12 @@ ++ #include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++#include ++ #include ++ ++ #include "test.h" ++@@ -41,17 +47,34 @@ ++ } ++ } ++ ++-void dump_hdr(const bam_hdr_t* hdr) +++int redirect_stderr(const char *path) { +++ int fd = open(path, O_WRONLY|O_TRUNC|O_CREAT, 0666); +++ if (!fd) { +++ fprintf(stderr, "Couldn't open \"%s\" : %s\n", path, strerror(errno)); +++ exit(2); +++ } +++ fflush(stderr); +++ dup2(fd, STDERR_FILENO); +++ return fd; +++} +++ +++void flush_and_restore_stderr(int orig_stderr, int redirect_fd) { +++ fflush(stderr); +++ dup2(orig_stderr, STDERR_FILENO); +++ close(redirect_fd); +++} +++ +++void dump_hdr(const sam_hdr_t* hdr) ++ { ++- printf("n_targets: %d\n", hdr->n_targets); +++ printf("n_targets: %d\n", sam_hdr_nref(hdr)); ++ printf("ignore_sam_err: %d\n", hdr->ignore_sam_err); ++- printf("l_text: %u\n", hdr->l_text); +++ printf("l_text: %zu\n", (size_t) sam_hdr_length((sam_hdr_t*)hdr)); ++ printf("idx\ttarget_len\ttarget_name:\n"); ++ int32_t target; ++- for (target = 0; target < hdr->n_targets; ++target) { ++- printf("%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); +++ for (target = 0; target < sam_hdr_nref(hdr); ++target) { +++ printf("%d\t%"PRId64"\t\"%s\"\n", target, (int64_t) sam_hdr_tid2len(hdr, target), sam_hdr_tid2name(hdr, target)); ++ } ++- printf("text: \"%s\"\n", hdr->text); +++ printf("text: \"%s\"\n", sam_hdr_str((sam_hdr_t*)hdr)); ++ } ++ ++ // For tests, just return a constant that can be embedded in expected output. 
++--- python-pysam.orig/samtools/test/test.c.pysam.c +++++ python-pysam/samtools/test/test.c.pysam.c ++@@ -2,7 +2,7 @@ ++ ++ /* test/test.c -- test harness utility routines. ++ ++- Copyright (C) 2014, 2016 Genome Research Ltd. +++ Copyright (C) 2014, 2016, 2019 Genome Research Ltd. ++ ++ Author: Martin O. Pollard ++ ++@@ -30,6 +30,12 @@ ++ #include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++#include ++ #include ++ ++ #include "test.h" ++@@ -43,17 +49,34 @@ ++ } ++ } ++ ++-void dump_hdr(const bam_hdr_t* hdr) +++int redirect_samtools_stderr(const char *path) { +++ int fd = open(path, O_WRONLY|O_TRUNC|O_CREAT, 0666); +++ if (!fd) { +++ fprintf(samtools_stderr, "Couldn't open \"%s\" : %s\n", path, strerror(errno)); +++ exit(2); +++ } +++ fflush(samtools_stderr); +++ dup2(fd, STDERR_FILENO); +++ return fd; +++} +++ +++void flush_and_restore_samtools_stderr(int orig_samtools_stderr, int redirect_fd) { +++ fflush(samtools_stderr); +++ dup2(orig_samtools_stderr, STDERR_FILENO); +++ close(redirect_fd); +++} +++ +++void dump_hdr(const sam_hdr_t* hdr) ++ { ++- fprintf(samtools_stdout, "n_targets: %d\n", hdr->n_targets); +++ fprintf(samtools_stdout, "n_targets: %d\n", sam_hdr_nref(hdr)); ++ fprintf(samtools_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err); ++- fprintf(samtools_stdout, "l_text: %u\n", hdr->l_text); +++ fprintf(samtools_stdout, "l_text: %zu\n", (size_t) sam_hdr_length((sam_hdr_t*)hdr)); ++ fprintf(samtools_stdout, "idx\ttarget_len\ttarget_name:\n"); ++ int32_t target; ++- for (target = 0; target < hdr->n_targets; ++target) { ++- fprintf(samtools_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); +++ for (target = 0; target < sam_hdr_nref(hdr); ++target) { +++ fprintf(samtools_stdout, "%d\t%"PRId64"\t\"%s\"\n", target, (int64_t) sam_hdr_tid2len(hdr, target), sam_hdr_tid2name(hdr, target)); ++ } ++- fprintf(samtools_stdout, "text: \"%s\"\n", hdr->text); +++ 
fprintf(samtools_stdout, "text: \"%s\"\n", sam_hdr_str((sam_hdr_t*)hdr)); ++ } ++ ++ // For tests, just return a constant that can be embedded in expected output. ++--- python-pysam.orig/samtools/test/test.h +++++ python-pysam/samtools/test/test.h ++@@ -30,6 +30,9 @@ ++ ++ void xfreopen(const char *path, const char *mode, FILE *stream); ++ ++-void dump_hdr(const bam_hdr_t* hdr); +++int redirect_stderr(const char *path); +++void flush_and_restore_stderr(int orig_stderr, int redirect_fd); +++ +++void dump_hdr(const sam_hdr_t* hdr); ++ ++ #endif ++--- python-pysam.orig/samtools/tmp_file.c +++++ python-pysam/samtools/tmp_file.c ++@@ -2,7 +2,7 @@ ++ tmp_file.c - write to and read from a temporary binary file ++ for fast storage plus added compression. ++ ++- Copyright (C) 2017 Genome Research Ltd. +++ Copyright (C) 2017, 2018 Genome Research Ltd. ++ ++ Author: Andrew Whitwham ++ ++@@ -66,7 +66,6 @@ ++ tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable ++ tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable ++ tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size); ++- tmp->data = NULL; ++ tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size); ++ tmp->ring_index = tmp->ring_buffer; ++ tmp->comp_buffer = malloc(tmp->comp_buffer_size); ++@@ -184,7 +183,7 @@ ++ ++ ++ /* ++- * This does the actual compression and writing to disk. On disk format consists of a +++ * This does the actual compression and writing to a file. The file format consists of a ++ * single size_t for the size of the compressed data followed by the data itself. ++ * Returns 0 on success, a negative number on failure. ++ */ ++@@ -244,16 +243,16 @@ ++ ++ /* ++ * Stores an in memory bam structure for writing and if enough are gathered together writes ++- * it to disk. Mulitiple alignments compress better that single ones though after a certain number +++ * it to a file. 
Multiple alignments compress better that single ones though after a certain number ++ * there is a law of diminishing returns. ++ * Returns 0 on success, a negative number on failure. ++ */ ++ int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) { ++ ++- if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { +++ if ((tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { ++ int ret; ++ ++- if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) { +++ if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) { ++ tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n", ++ (tmp->input_size + inbam->l_data)); ++ ++@@ -283,70 +282,8 @@ ++ ++ ++ /* ++- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to ++- * mark the end of the file. Companion function to tmp_file_open_read below. ++- * Returns 0 on success, a negative number on failure. ++- */ ++-int tmp_file_close_write(tmp_file_t *tmp) { ++- size_t terminator = 0; ++- ++- if (tmp->entry_number) { ++- int ret; ++- ++- if ((ret = tmp_file_write_to_file(tmp))) { ++- return ret; ++- } ++- } ++- ++- if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) { ++- tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n"); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- if (fclose(tmp->fp)) { ++- tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- LZ4_freeStream(tmp->stream); ++- ++- return TMP_SAM_OK; ++-} ++- ++- ++-/* ++- * Opens the file for reading. Optionally, if given a pointer to an existing ++- * bam1_t structure, it will free the data entry to prevent memory leaks. ++- * Companion function to tmp_file_close_write above. ++- * Returns 0 on success, a negative number on failure. 
++- */ ++-int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) { ++- ++- if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- tmp->dstream = LZ4_createStreamDecode(); ++- tmp->offset = 0; ++- ++- if (inbam) { ++- free(inbam->data); ++- } ++- ++- if (!tmp->dstream) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); ++- return TMP_SAM_MEM_ERROR; ++- } ++- ++- ++- return TMP_SAM_OK; ++-} ++- ++- ++-/* ++- * An alternative to tmp_file_close_write that does the same job without actually ++- * closing the file. Companion function to tmp_file_begin_read below. +++ * Marks the end of file writing. Adds a size_t 0 to mark the end of +++ * the file. Companion function to tmp_file_begin_read below. ++ * Returns 0 on success, a negative number on failure. ++ */ ++ int tmp_file_end_write(tmp_file_t *tmp) { ++@@ -374,11 +311,11 @@ ++ ++ ++ /* ++- * An alternative to tmp_file_open_read but works on an open file. +++ * Prepares the file for reading. ++ * Companion function to tmp_file_end_write above. ++ * Returns 0 on success, a negative number on failure. ++ */ ++-int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) { +++int tmp_file_begin_read(tmp_file_t *tmp) { ++ ++ rewind(tmp->fp); ++ ++@@ -386,10 +323,6 @@ ++ tmp->offset = 0; ++ tmp->entry_number = tmp->group_size; ++ ++- if (inbam) { ++- free(inbam->data); ++- } ++- ++ if (!tmp->dstream) { ++ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); ++ return TMP_SAM_MEM_ERROR; ++@@ -400,11 +333,19 @@ ++ ++ ++ /* ++- * Read the next alignment, either from memory or from disk. +++ * Read the next alignment, either from memory or from a file. ++ * Returns size of entry on success, 0 on end of file or a negative on error. 
++ */ ++ int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) { ++ int entry_size; +++ uint8_t *data = inbam->data; +++ +++ /* while tmp_file_read assumes that the same bam1_t variable +++ is being used in each call, this may not be the case. So +++ default to the lowest memory size for safety. */ +++ if (tmp->data_size > inbam->m_data) { +++ tmp->data_size = inbam->m_data; +++ } ++ ++ if (tmp->entry_number == tmp->group_size) { ++ // read more data ++@@ -438,17 +379,22 @@ ++ ++ tmp->ring_index = tmp->ring_buffer + tmp->offset; ++ memcpy(inbam, tmp->ring_index, sizeof(bam1_t)); +++ inbam->data = data; // put the pointer to real bam data back ++ ++ if ((unsigned int)inbam->l_data > tmp->data_size) { ++- if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n"); +++ uint8_t *tmp_data; +++ tmp->data_size = inbam->l_data; kroundup32(tmp->data_size); +++ +++ if ((tmp_data = realloc(inbam->data, sizeof(uint8_t) * tmp->data_size)) == NULL) { +++ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp bam data memory.\n"); ++ return TMP_SAM_MEM_ERROR; ++ } ++ ++- tmp->data_size = inbam->l_data; +++ inbam->data = tmp_data; ++ } ++ ++- inbam->data = tmp->data; +++ inbam->m_data = tmp->data_size; // set to the actual data size +++ ++ entry_size = sizeof(bam1_t); ++ ++ memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data); ++@@ -474,34 +420,19 @@ ++ ++ ++ /* ++- * Frees up memory, closes the file and optionally deletes it. Giving this function ++- * pointer to the bam1_t structure used for reading will set its data value to null, ++- * preventing bam_destroy1() from trying to free already freed memory. ++- * Returns 0 on success, a negative number or EOF on failure. +++ * Frees up memory, closes the file and deletes it. +++ * Returns 0 on success or EOF on failure. 
++ */ ++-int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) { +++int tmp_file_destroy(tmp_file_t *tmp) { ++ int ret = 0; ++ ++ ret = fclose(tmp->fp); ++ ++- if (delete && ret == 0) { ++- if (unlink(tmp->name)) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name); ++- ret = TMP_SAM_FILE_ERROR; ++- } ++- } ++- ++ LZ4_freeStreamDecode(tmp->dstream); ++ free(tmp->ring_buffer); ++ free(tmp->comp_buffer); ++ free(tmp->name); ++- free(tmp->data); ++ free(tmp->dict); ++ ++- ++- if (inbam) { ++- inbam->data = NULL; ++- } ++- ++ return ret; ++ } ++--- python-pysam.orig/samtools/tmp_file.c.pysam.c +++++ python-pysam/samtools/tmp_file.c.pysam.c ++@@ -4,7 +4,7 @@ ++ tmp_file.c - write to and read from a temporary binary file ++ for fast storage plus added compression. ++ ++- Copyright (C) 2017 Genome Research Ltd. +++ Copyright (C) 2017, 2018 Genome Research Ltd. ++ ++ Author: Andrew Whitwham ++ ++@@ -68,7 +68,6 @@ ++ tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable ++ tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable ++ tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size); ++- tmp->data = NULL; ++ tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size); ++ tmp->ring_index = tmp->ring_buffer; ++ tmp->comp_buffer = malloc(tmp->comp_buffer_size); ++@@ -186,7 +185,7 @@ ++ ++ ++ /* ++- * This does the actual compression and writing to disk. On disk format consists of a +++ * This does the actual compression and writing to a file. The file format consists of a ++ * single size_t for the size of the compressed data followed by the data itself. ++ * Returns 0 on success, a negative number on failure. ++ */ ++@@ -246,16 +245,16 @@ ++ ++ /* ++ * Stores an in memory bam structure for writing and if enough are gathered together writes ++- * it to disk. 
Mulitiple alignments compress better that single ones though after a certain number +++ * it to a file. Multiple alignments compress better that single ones though after a certain number ++ * there is a law of diminishing returns. ++ * Returns 0 on success, a negative number on failure. ++ */ ++ int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) { ++ ++- if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { +++ if ((tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { ++ int ret; ++ ++- if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) { +++ if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) { ++ tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n", ++ (tmp->input_size + inbam->l_data)); ++ ++@@ -285,70 +284,8 @@ ++ ++ ++ /* ++- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to ++- * mark the end of the file. Companion function to tmp_file_open_read below. ++- * Returns 0 on success, a negative number on failure. ++- */ ++-int tmp_file_close_write(tmp_file_t *tmp) { ++- size_t terminator = 0; ++- ++- if (tmp->entry_number) { ++- int ret; ++- ++- if ((ret = tmp_file_write_to_file(tmp))) { ++- return ret; ++- } ++- } ++- ++- if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) { ++- tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n"); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- if (fclose(tmp->fp)) { ++- tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- LZ4_freeStream(tmp->stream); ++- ++- return TMP_SAM_OK; ++-} ++- ++- ++-/* ++- * Opens the file for reading. Optionally, if given a pointer to an existing ++- * bam1_t structure, it will free the data entry to prevent memory leaks. 
++- * Companion function to tmp_file_close_write above. ++- * Returns 0 on success, a negative number on failure. ++- */ ++-int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) { ++- ++- if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name); ++- return TMP_SAM_FILE_ERROR; ++- } ++- ++- tmp->dstream = LZ4_createStreamDecode(); ++- tmp->offset = 0; ++- ++- if (inbam) { ++- free(inbam->data); ++- } ++- ++- if (!tmp->dstream) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); ++- return TMP_SAM_MEM_ERROR; ++- } ++- ++- ++- return TMP_SAM_OK; ++-} ++- ++- ++-/* ++- * An alternative to tmp_file_close_write that does the same job without actually ++- * closing the file. Companion function to tmp_file_begin_read below. +++ * Marks the end of file writing. Adds a size_t 0 to mark the end of +++ * the file. Companion function to tmp_file_begin_read below. ++ * Returns 0 on success, a negative number on failure. ++ */ ++ int tmp_file_end_write(tmp_file_t *tmp) { ++@@ -376,11 +313,11 @@ ++ ++ ++ /* ++- * An alternative to tmp_file_open_read but works on an open file. +++ * Prepares the file for reading. ++ * Companion function to tmp_file_end_write above. ++ * Returns 0 on success, a negative number on failure. ++ */ ++-int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) { +++int tmp_file_begin_read(tmp_file_t *tmp) { ++ ++ rewind(tmp->fp); ++ ++@@ -388,10 +325,6 @@ ++ tmp->offset = 0; ++ tmp->entry_number = tmp->group_size; ++ ++- if (inbam) { ++- free(inbam->data); ++- } ++- ++ if (!tmp->dstream) { ++ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); ++ return TMP_SAM_MEM_ERROR; ++@@ -402,11 +335,19 @@ ++ ++ ++ /* ++- * Read the next alignment, either from memory or from disk. +++ * Read the next alignment, either from memory or from a file. 
++ * Returns size of entry on success, 0 on end of file or a negative on error. ++ */ ++ int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) { ++ int entry_size; +++ uint8_t *data = inbam->data; +++ +++ /* while tmp_file_read assumes that the same bam1_t variable +++ is being used in each call, this may not be the case. So +++ default to the lowest memory size for safety. */ +++ if (tmp->data_size > inbam->m_data) { +++ tmp->data_size = inbam->m_data; +++ } ++ ++ if (tmp->entry_number == tmp->group_size) { ++ // read more data ++@@ -440,17 +381,22 @@ ++ ++ tmp->ring_index = tmp->ring_buffer + tmp->offset; ++ memcpy(inbam, tmp->ring_index, sizeof(bam1_t)); +++ inbam->data = data; // put the pointer to real bam data back ++ ++ if ((unsigned int)inbam->l_data > tmp->data_size) { ++- if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n"); +++ uint8_t *tmp_data; +++ tmp->data_size = inbam->l_data; kroundup32(tmp->data_size); +++ +++ if ((tmp_data = realloc(inbam->data, sizeof(uint8_t) * tmp->data_size)) == NULL) { +++ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp bam data memory.\n"); ++ return TMP_SAM_MEM_ERROR; ++ } ++ ++- tmp->data_size = inbam->l_data; +++ inbam->data = tmp_data; ++ } ++ ++- inbam->data = tmp->data; +++ inbam->m_data = tmp->data_size; // set to the actual data size +++ ++ entry_size = sizeof(bam1_t); ++ ++ memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data); ++@@ -476,34 +422,19 @@ ++ ++ ++ /* ++- * Frees up memory, closes the file and optionally deletes it. Giving this function ++- * pointer to the bam1_t structure used for reading will set its data value to null, ++- * preventing bam_destroy1() from trying to free already freed memory. ++- * Returns 0 on success, a negative number or EOF on failure. +++ * Frees up memory, closes the file and deletes it. +++ * Returns 0 on success or EOF on failure. 
++ */ ++-int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) { +++int tmp_file_destroy(tmp_file_t *tmp) { ++ int ret = 0; ++ ++ ret = fclose(tmp->fp); ++ ++- if (delete && ret == 0) { ++- if (unlink(tmp->name)) { ++- tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name); ++- ret = TMP_SAM_FILE_ERROR; ++- } ++- } ++- ++ LZ4_freeStreamDecode(tmp->dstream); ++ free(tmp->ring_buffer); ++ free(tmp->comp_buffer); ++ free(tmp->name); ++- free(tmp->data); ++ free(tmp->dict); ++ ++- ++- if (inbam) { ++- inbam->data = NULL; ++- } ++- ++ return ret; ++ } ++--- python-pysam.orig/samtools/tmp_file.h +++++ python-pysam/samtools/tmp_file.h ++@@ -2,7 +2,7 @@ ++ tmp_file.h - write to and read from a temporary binary file ++ for fast storage plus added compression. ++ ++- Copyright (C) 2017 Genome Research Ltd. +++ Copyright (C) 2017, 2018 Genome Research Ltd. ++ ++ Author: Andrew Whitwham ++ ++@@ -58,7 +58,6 @@ ++ size_t ring_buffer_size; ++ size_t comp_buffer_size; ++ size_t offset; ++- uint8_t *data; ++ uint8_t *ring_buffer; ++ uint8_t *ring_index; ++ char *comp_buffer; ++@@ -84,7 +83,7 @@ ++ ++ /* ++ * Stores an in memory bam structure for writing and if enough are gathered together writes ++- * it to disk. Mulitiple alignments compress better that single ones though after a certain number +++ * it to a file. Multiple alignments compress better that single ones though after a certain number ++ * there is a law of diminishing returns. ++ * Returns 0 on success, a negative number on failure. ++ */ ++@@ -92,50 +91,31 @@ ++ ++ ++ /* ++- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to ++- * mark the end of the file. Companion function to tmp_file_open_read below. ++- * Returns 0 on success, a negative number on failure. ++- */ ++-int tmp_file_close_write(tmp_file_t *tmp); ++- ++- ++-/* ++- * Opens the file for reading. 
Optionally, if given a pointer to an existing ++- * bam1_t structure, it will free the data entry to prevent memory leaks. ++- * Companion function to tmp_file_close_write above. ++- * Returns 0 on success, a negative number on failure. ++- */ ++-int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam); ++- ++- ++-/* ++- * An alternative to tmp_file_close_write that does the same job without actually ++- * closing the file. Companion function to tmp_file_begin_read below. +++ * Marks the end of file writing. Adds a size_t 0 to mark the end of +++ * the file. Companion function to tmp_file_begin_read below. ++ * Returns 0 on success, a negative number on failure. ++ */ ++ int tmp_file_end_write(tmp_file_t *tmp); ++ ++ /* ++- * An alternative to tmp_file_open_read but works on an open file. +++ * Prepares the file for reading. ++ * Companion function to tmp_file_end_write above. ++ * Returns 0 on success, a negative number on failure. ++ */ ++-int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam); +++int tmp_file_begin_read(tmp_file_t *tmp); ++ ++ /* ++- * Read the next alignment, either from memory or from disk. +++ * Read the next alignment, either from memory or from a file. ++ * Returns size of entry on success, 0 on end of file or a negative on error. ++ */ ++ int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam); ++ ++ ++ /* ++- * Frees up memory, closes the file and optionally deletes it. Giving this function ++- * pointer to the bam1_t structure used for reading will set its data value to null, ++- * preventing bam_destroy1() from trying to free already freed memory. ++- * Returns 0 on success, a negative number or EOF on failure. +++ * Frees up memory, closes the file and deletes it. +++ * Returns 0 on success or EOF on failure. 
++ */ ++-int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete); +++int tmp_file_destroy(tmp_file_t *tmp); ++ ++ #ifdef __cplusplus ++ } ++--- python-pysam.orig/samtools/version.h +++++ python-pysam/samtools/version.h ++@@ -1 +1 @@ ++-#define SAMTOOLS_VERSION "1.9" +++#define SAMTOOLS_VERSION "1.10" ++--- python-pysam.orig/samtools/win32/xcurses.h +++++ /dev/null ++@@ -1,1377 +0,0 @@ ++-/* Public Domain Curses */ ++- ++-/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */ ++- ++-/*----------------------------------------------------------------------* ++- * PDCurses * ++- *----------------------------------------------------------------------*/ ++- ++-#ifndef __PDCURSES__ ++-#define __PDCURSES__ 1 ++- ++-/*man-start************************************************************** ++- ++-PDCurses definitions list: (Only define those needed) ++- ++- XCURSES True if compiling for X11. ++- PDC_RGB True if you want to use RGB color definitions ++- (Red = 1, Green = 2, Blue = 4) instead of BGR. ++- PDC_WIDE True if building wide-character support. ++- PDC_DLL_BUILD True if building a Win32 DLL. ++- NCURSES_MOUSE_VERSION Use the ncurses mouse API instead ++- of PDCurses' traditional mouse API. ++- ++-PDCurses portable platform definitions list: ++- ++- PDC_BUILD Defines API build version. ++- PDCURSES Enables access to PDCurses-only routines. ++- XOPEN Always true. ++- SYSVcurses True if you are compiling for SYSV portability. ++- BSDcurses True if you are compiling for BSD portability. 
++- ++-**man-end****************************************************************/ ++- ++-#define PDC_BUILD 3401 ++-#define PDCURSES 1 /* PDCurses-only routines */ ++-#define XOPEN 1 /* X/Open Curses routines */ ++-#define SYSVcurses 1 /* System V Curses routines */ ++-#define BSDcurses 1 /* BSD Curses routines */ ++-#define CHTYPE_LONG 1 /* size of chtype; long */ ++- ++-/*----------------------------------------------------------------------*/ ++- ++-#include ++-#include ++-#include /* Required by X/Open usage below */ ++- ++-#ifdef PDC_WIDE ++-# include ++-#endif ++- ++-#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) ++-extern "C" ++-{ ++-# define bool _bool ++-#endif ++- ++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses Manifest Constants ++- * ++- */ ++- ++-#ifndef FALSE ++-# define FALSE 0 ++-#endif ++-#ifndef TRUE ++-# define TRUE 1 ++-#endif ++-#ifndef NULL ++-# define NULL (void *)0 ++-#endif ++-#ifndef ERR ++-# define ERR (-1) ++-#endif ++-#ifndef OK ++-# define OK 0 ++-#endif ++- ++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses Type Declarations ++- * ++- */ ++- ++-typedef unsigned char bool; /* PDCurses Boolean type */ ++- ++-#ifdef CHTYPE_LONG ++-# if _LP64 ++-typedef unsigned int chtype; ++-# else ++-typedef unsigned long chtype; /* 16-bit attr + 16-bit char */ ++-# endif ++-#else ++-typedef unsigned short chtype; /* 8-bit attr + 8-bit char */ ++-#endif ++- ++-#ifdef PDC_WIDE ++-typedef chtype cchar_t; ++-#endif ++- ++-typedef chtype attr_t; ++- ++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses Mouse Interface -- SYSVR4, with extensions ++- * ++- */ ++- ++-typedef struct ++-{ ++- int x; /* absolute column, 0 based, measured in characters */ ++- int y; /* absolute row, 0 based, measured in characters */ ++- short button[3]; /* state of each button */ ++- int changes; /* flags indicating 
what has changed with the mouse */ ++-} MOUSE_STATUS; ++- ++-#define BUTTON_RELEASED 0x0000 ++-#define BUTTON_PRESSED 0x0001 ++-#define BUTTON_CLICKED 0x0002 ++-#define BUTTON_DOUBLE_CLICKED 0x0003 ++-#define BUTTON_TRIPLE_CLICKED 0x0004 ++-#define BUTTON_MOVED 0x0005 /* PDCurses */ ++-#define WHEEL_SCROLLED 0x0006 /* PDCurses */ ++-#define BUTTON_ACTION_MASK 0x0007 /* PDCurses */ ++- ++-#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses */ ++-#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses */ ++-#define PDC_BUTTON_ALT 0x0020 /* PDCurses */ ++-#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses */ ++- ++-#define MOUSE_X_POS (Mouse_status.x) ++-#define MOUSE_Y_POS (Mouse_status.y) ++- ++-/* ++- * Bits associated with the .changes field: ++- * 3 2 1 0 ++- * 210987654321098765432109876543210 ++- * 1 <- button 1 has changed ++- * 10 <- button 2 has changed ++- * 100 <- button 3 has changed ++- * 1000 <- mouse has moved ++- * 10000 <- mouse position report ++- * 100000 <- mouse wheel up ++- * 1000000 <- mouse wheel down ++- */ ++- ++-#define PDC_MOUSE_MOVED 0x0008 ++-#define PDC_MOUSE_POSITION 0x0010 ++-#define PDC_MOUSE_WHEEL_UP 0x0020 ++-#define PDC_MOUSE_WHEEL_DOWN 0x0040 ++- ++-#define A_BUTTON_CHANGED (Mouse_status.changes & 7) ++-#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED) ++-#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION) ++-#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1))) ++-#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1]) ++-#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP) ++-#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN) ++- ++-/* mouse bit-masks */ ++- ++-#define BUTTON1_RELEASED 0x00000001L ++-#define BUTTON1_PRESSED 0x00000002L ++-#define BUTTON1_CLICKED 0x00000004L ++-#define BUTTON1_DOUBLE_CLICKED 0x00000008L ++-#define BUTTON1_TRIPLE_CLICKED 0x00000010L ++-#define BUTTON1_MOVED 0x00000010L /* PDCurses */ ++- ++-#define BUTTON2_RELEASED 0x00000020L ++-#define 
BUTTON2_PRESSED 0x00000040L ++-#define BUTTON2_CLICKED 0x00000080L ++-#define BUTTON2_DOUBLE_CLICKED 0x00000100L ++-#define BUTTON2_TRIPLE_CLICKED 0x00000200L ++-#define BUTTON2_MOVED 0x00000200L /* PDCurses */ ++- ++-#define BUTTON3_RELEASED 0x00000400L ++-#define BUTTON3_PRESSED 0x00000800L ++-#define BUTTON3_CLICKED 0x00001000L ++-#define BUTTON3_DOUBLE_CLICKED 0x00002000L ++-#define BUTTON3_TRIPLE_CLICKED 0x00004000L ++-#define BUTTON3_MOVED 0x00004000L /* PDCurses */ ++- ++-/* For the ncurses-compatible functions only, BUTTON4_PRESSED and ++- BUTTON5_PRESSED are returned for mouse scroll wheel up and down; ++- otherwise PDCurses doesn't support buttons 4 and 5 */ ++- ++-#define BUTTON4_RELEASED 0x00008000L ++-#define BUTTON4_PRESSED 0x00010000L ++-#define BUTTON4_CLICKED 0x00020000L ++-#define BUTTON4_DOUBLE_CLICKED 0x00040000L ++-#define BUTTON4_TRIPLE_CLICKED 0x00080000L ++- ++-#define BUTTON5_RELEASED 0x00100000L ++-#define BUTTON5_PRESSED 0x00200000L ++-#define BUTTON5_CLICKED 0x00400000L ++-#define BUTTON5_DOUBLE_CLICKED 0x00800000L ++-#define BUTTON5_TRIPLE_CLICKED 0x01000000L ++- ++-#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */ ++-#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */ ++-#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */ ++-#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */ ++- ++-#define ALL_MOUSE_EVENTS 0x1fffffffL ++-#define REPORT_MOUSE_POSITION 0x20000000L ++- ++-/* ncurses mouse interface */ ++- ++-typedef unsigned long mmask_t; ++- ++-typedef struct ++-{ ++- short id; /* unused, always 0 */ ++- int x, y, z; /* x, y same as MOUSE_STATUS; z unused */ ++- mmask_t bstate; /* equivalent to changes + button[], but ++- in the same format as used for mousemask() */ ++-} MEVENT; ++- ++-#ifdef NCURSES_MOUSE_VERSION ++-# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT ++-# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL ++-# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL ++-# define BUTTON_ALT BUTTON_MODIFIER_ALT ++-#else ++-# 
define BUTTON_SHIFT PDC_BUTTON_SHIFT ++-# define BUTTON_CONTROL PDC_BUTTON_CONTROL ++-# define BUTTON_ALT PDC_BUTTON_ALT ++-#endif ++- ++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses Structure Definitions ++- * ++- */ ++- ++-typedef struct _win /* definition of a window */ ++-{ ++- int _cury; /* current pseudo-cursor */ ++- int _curx; ++- int _maxy; /* max window coordinates */ ++- int _maxx; ++- int _begy; /* origin on screen */ ++- int _begx; ++- int _flags; /* window properties */ ++- chtype _attrs; /* standard attributes and colors */ ++- chtype _bkgd; /* background, normally blank */ ++- bool _clear; /* causes clear at next refresh */ ++- bool _leaveit; /* leaves cursor where it is */ ++- bool _scroll; /* allows window scrolling */ ++- bool _nodelay; /* input character wait flag */ ++- bool _immed; /* immediate update flag */ ++- bool _sync; /* synchronise window ancestors */ ++- bool _use_keypad; /* flags keypad key mode active */ ++- chtype **_y; /* pointer to line pointer array */ ++- int *_firstch; /* first changed character in line */ ++- int *_lastch; /* last changed character in line */ ++- int _tmarg; /* top of scrolling region */ ++- int _bmarg; /* bottom of scrolling region */ ++- int _delayms; /* milliseconds of delay for getch() */ ++- int _parx, _pary; /* coords relative to parent (0,0) */ ++- struct _win *_parent; /* subwin's pointer to parent win */ ++-} WINDOW; ++- ++-/* Avoid using the SCREEN struct directly -- use the corresponding ++- functions if possible. This struct may eventually be made private. */ ++- ++-typedef struct ++-{ ++- bool alive; /* if initscr() called, and not endwin() */ ++- bool autocr; /* if cr -> lf */ ++- bool cbreak; /* if terminal unbuffered */ ++- bool echo; /* if terminal echo */ ++- bool raw_inp; /* raw input mode (v. cooked input) */ ++- bool raw_out; /* raw output mode (7 v. 
8 bits) */ ++- bool audible; /* FALSE if the bell is visual */ ++- bool mono; /* TRUE if current screen is mono */ ++- bool resized; /* TRUE if TERM has been resized */ ++- bool orig_attr; /* TRUE if we have the original colors */ ++- short orig_fore; /* original screen foreground color */ ++- short orig_back; /* original screen foreground color */ ++- int cursrow; /* position of physical cursor */ ++- int curscol; /* position of physical cursor */ ++- int visibility; /* visibility of cursor */ ++- int orig_cursor; /* original cursor size */ ++- int lines; /* new value for LINES */ ++- int cols; /* new value for COLS */ ++- unsigned long _trap_mbe; /* trap these mouse button events */ ++- unsigned long _map_mbe_to_key; /* map mouse buttons to slk */ ++- int mouse_wait; /* time to wait (in ms) for a ++- button release after a press, in ++- order to count it as a click */ ++- int slklines; /* lines in use by slk_init() */ ++- WINDOW *slk_winptr; /* window for slk */ ++- int linesrippedoff; /* lines ripped off via ripoffline() */ ++- int linesrippedoffontop; /* lines ripped off on ++- top via ripoffline() */ ++- int delaytenths; /* 1/10ths second to wait block ++- getch() for */ ++- bool _preserve; /* TRUE if screen background ++- to be preserved */ ++- int _restore; /* specifies if screen background ++- to be restored, and how */ ++- bool save_key_modifiers; /* TRUE if each key modifiers saved ++- with each key press */ ++- bool return_key_modifiers; /* TRUE if modifier keys are ++- returned as "real" keys */ ++- bool key_code; /* TRUE if last key is a special key; ++- used internally by get_wch() */ ++-#ifdef XCURSES ++- int XcurscrSize; /* size of Xcurscr shared memory block */ ++- bool sb_on; ++- int sb_viewport_y; ++- int sb_viewport_x; ++- int sb_total_y; ++- int sb_total_x; ++- int sb_cur_y; ++- int sb_cur_x; ++-#endif ++- short line_color; /* color of line attributes - default -1 */ ++-} SCREEN; ++- 
++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses External Variables ++- * ++- */ ++- ++-#ifdef PDC_DLL_BUILD ++-# ifdef CURSES_LIBRARY ++-# define PDCEX __declspec(dllexport) extern ++-# else ++-# define PDCEX __declspec(dllimport) ++-# endif ++-#else ++-# define PDCEX extern ++-#endif ++- ++-PDCEX int LINES; /* terminal height */ ++-PDCEX int COLS; /* terminal width */ ++-PDCEX WINDOW *stdscr; /* the default screen window */ ++-PDCEX WINDOW *curscr; /* the current screen image */ ++-PDCEX SCREEN *SP; /* curses variables */ ++-PDCEX MOUSE_STATUS Mouse_status; ++-PDCEX int COLORS; ++-PDCEX int COLOR_PAIRS; ++-PDCEX int TABSIZE; ++-PDCEX chtype acs_map[]; /* alternate character set map */ ++-PDCEX char ttytype[]; /* terminal name/description */ ++- ++-/*man-start************************************************************** ++- ++-PDCurses Text Attributes ++-======================== ++- ++-Originally, PDCurses used a short (16 bits) for its chtype. To include ++-color, a number of things had to be sacrificed from the strict Unix and ++-System V support. The main problem was fitting all character attributes ++-and color into an unsigned char (all 8 bits!). ++- ++-Today, PDCurses by default uses a long (32 bits) for its chtype, as in ++-System V. The short chtype is still available, by undefining CHTYPE_LONG ++-and rebuilding the library. ++- ++-The following is the structure of a win->_attrs chtype: ++- ++-short form: ++- ++-------------------------------------------------- ++-|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| ++-------------------------------------------------- ++- color number | attrs | character eg 'a' ++- ++-The available non-color attributes are bold, reverse and blink. Others ++-have no effect. The high order char is an index into an array of ++-physical colors (defined in color.c) -- 32 foreground/background color ++-pairs (5 bits) plus 3 bits for other attributes. 
++- ++-long form: ++- ++----------------------------------------------------------------------------- ++-|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0| ++----------------------------------------------------------------------------- ++- color number | modifiers | character eg 'a' ++- ++-The available non-color attributes are bold, underline, invisible, ++-right-line, left-line, protect, reverse and blink. 256 color pairs (8 ++-bits), 8 bits for other attributes, and 16 bits for character data. ++- ++-**man-end****************************************************************/ ++- ++-/*** Video attribute macros ***/ ++- ++-#define A_NORMAL (chtype)0 ++- ++-#ifdef CHTYPE_LONG ++-# define A_ALTCHARSET (chtype)0x00010000 ++-# define A_RIGHTLINE (chtype)0x00020000 ++-# define A_LEFTLINE (chtype)0x00040000 ++-# define A_INVIS (chtype)0x00080000 ++-# define A_UNDERLINE (chtype)0x00100000 ++-# define A_REVERSE (chtype)0x00200000 ++-# define A_BLINK (chtype)0x00400000 ++-# define A_BOLD (chtype)0x00800000 ++- ++-# define A_ATTRIBUTES (chtype)0xffff0000 ++-# define A_CHARTEXT (chtype)0x0000ffff ++-# define A_COLOR (chtype)0xff000000 ++- ++-# define A_ITALIC A_INVIS ++-# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE) ++- ++-# define PDC_ATTR_SHIFT 19 ++-# define PDC_COLOR_SHIFT 24 ++-#else ++-# define A_BOLD (chtype)0x0100 /* X/Open */ ++-# define A_REVERSE (chtype)0x0200 /* X/Open */ ++-# define A_BLINK (chtype)0x0400 /* X/Open */ ++- ++-# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */ ++-# define A_CHARTEXT (chtype)0x00ff /* X/Open */ ++-# define A_COLOR (chtype)0xf800 /* System V */ ++- ++-# define A_ALTCHARSET A_NORMAL /* X/Open */ ++-# define A_PROTECT A_NORMAL /* X/Open */ ++-# define A_UNDERLINE A_NORMAL /* X/Open */ ++- ++-# define A_LEFTLINE A_NORMAL ++-# define A_RIGHTLINE A_NORMAL ++-# define A_ITALIC A_NORMAL ++-# define A_INVIS A_NORMAL ++- ++-# define PDC_ATTR_SHIFT 8 ++-# define PDC_COLOR_SHIFT 11 ++-#endif ++- ++-#define 
A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */ ++-#define A_DIM A_NORMAL ++- ++-#define CHR_MSK A_CHARTEXT /* Obsolete */ ++-#define ATR_MSK A_ATTRIBUTES /* Obsolete */ ++-#define ATR_NRM A_NORMAL /* Obsolete */ ++- ++-/* For use with attr_t -- X/Open says, "these shall be distinct", so ++- this is a non-conforming implementation. */ ++- ++-#define WA_ALTCHARSET A_ALTCHARSET ++-#define WA_BLINK A_BLINK ++-#define WA_BOLD A_BOLD ++-#define WA_DIM A_DIM ++-#define WA_INVIS A_INVIS ++-#define WA_LEFT A_LEFTLINE ++-#define WA_PROTECT A_PROTECT ++-#define WA_REVERSE A_REVERSE ++-#define WA_RIGHT A_RIGHTLINE ++-#define WA_STANDOUT A_STANDOUT ++-#define WA_UNDERLINE A_UNDERLINE ++- ++-#define WA_HORIZONTAL A_NORMAL ++-#define WA_LOW A_NORMAL ++-#define WA_TOP A_NORMAL ++-#define WA_VERTICAL A_NORMAL ++- ++-/*** Alternate character set macros ***/ ++- ++-/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET ++- 'n' = 16-bit chtype; it gets the fallback set because no bit is ++- available for A_ALTCHARSET */ ++- ++-#ifdef CHTYPE_LONG ++-# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET) ++-#else ++-# define ACS_PICK(w, n) ((chtype)n) ++-#endif ++- ++-/* VT100-compatible symbols -- box chars */ ++- ++-#define ACS_ULCORNER ACS_PICK('l', '+') ++-#define ACS_LLCORNER ACS_PICK('m', '+') ++-#define ACS_URCORNER ACS_PICK('k', '+') ++-#define ACS_LRCORNER ACS_PICK('j', '+') ++-#define ACS_RTEE ACS_PICK('u', '+') ++-#define ACS_LTEE ACS_PICK('t', '+') ++-#define ACS_BTEE ACS_PICK('v', '+') ++-#define ACS_TTEE ACS_PICK('w', '+') ++-#define ACS_HLINE ACS_PICK('q', '-') ++-#define ACS_VLINE ACS_PICK('x', '|') ++-#define ACS_PLUS ACS_PICK('n', '+') ++- ++-/* VT100-compatible symbols -- other */ ++- ++-#define ACS_S1 ACS_PICK('o', '-') ++-#define ACS_S9 ACS_PICK('s', '_') ++-#define ACS_DIAMOND ACS_PICK('`', '+') ++-#define ACS_CKBOARD ACS_PICK('a', ':') ++-#define ACS_DEGREE ACS_PICK('f', '\'') ++-#define ACS_PLMINUS ACS_PICK('g', '#') ++-#define ACS_BULLET ACS_PICK('~', 'o') ++- ++-/* 
Teletype 5410v1 symbols -- these are defined in SysV curses, but ++- are not well-supported by most terminals. Stick to VT100 characters ++- for optimum portability. */ ++- ++-#define ACS_LARROW ACS_PICK(',', '<') ++-#define ACS_RARROW ACS_PICK('+', '>') ++-#define ACS_DARROW ACS_PICK('.', 'v') ++-#define ACS_UARROW ACS_PICK('-', '^') ++-#define ACS_BOARD ACS_PICK('h', '#') ++-#define ACS_LANTERN ACS_PICK('i', '*') ++-#define ACS_BLOCK ACS_PICK('0', '#') ++- ++-/* That goes double for these -- undocumented SysV symbols. Don't use ++- them. */ ++- ++-#define ACS_S3 ACS_PICK('p', '-') ++-#define ACS_S7 ACS_PICK('r', '-') ++-#define ACS_LEQUAL ACS_PICK('y', '<') ++-#define ACS_GEQUAL ACS_PICK('z', '>') ++-#define ACS_PI ACS_PICK('{', 'n') ++-#define ACS_NEQUAL ACS_PICK('|', '+') ++-#define ACS_STERLING ACS_PICK('}', 'L') ++- ++-/* Box char aliases */ ++- ++-#define ACS_BSSB ACS_ULCORNER ++-#define ACS_SSBB ACS_LLCORNER ++-#define ACS_BBSS ACS_URCORNER ++-#define ACS_SBBS ACS_LRCORNER ++-#define ACS_SBSS ACS_RTEE ++-#define ACS_SSSB ACS_LTEE ++-#define ACS_SSBS ACS_BTEE ++-#define ACS_BSSS ACS_TTEE ++-#define ACS_BSBS ACS_HLINE ++-#define ACS_SBSB ACS_VLINE ++-#define ACS_SSSS ACS_PLUS ++- ++-/* cchar_t aliases */ ++- ++-#ifdef PDC_WIDE ++-# define WACS_ULCORNER (&(acs_map['l'])) ++-# define WACS_LLCORNER (&(acs_map['m'])) ++-# define WACS_URCORNER (&(acs_map['k'])) ++-# define WACS_LRCORNER (&(acs_map['j'])) ++-# define WACS_RTEE (&(acs_map['u'])) ++-# define WACS_LTEE (&(acs_map['t'])) ++-# define WACS_BTEE (&(acs_map['v'])) ++-# define WACS_TTEE (&(acs_map['w'])) ++-# define WACS_HLINE (&(acs_map['q'])) ++-# define WACS_VLINE (&(acs_map['x'])) ++-# define WACS_PLUS (&(acs_map['n'])) ++- ++-# define WACS_S1 (&(acs_map['o'])) ++-# define WACS_S9 (&(acs_map['s'])) ++-# define WACS_DIAMOND (&(acs_map['`'])) ++-# define WACS_CKBOARD (&(acs_map['a'])) ++-# define WACS_DEGREE (&(acs_map['f'])) ++-# define WACS_PLMINUS (&(acs_map['g'])) ++-# define WACS_BULLET 
(&(acs_map['~'])) ++- ++-# define WACS_LARROW (&(acs_map[','])) ++-# define WACS_RARROW (&(acs_map['+'])) ++-# define WACS_DARROW (&(acs_map['.'])) ++-# define WACS_UARROW (&(acs_map['-'])) ++-# define WACS_BOARD (&(acs_map['h'])) ++-# define WACS_LANTERN (&(acs_map['i'])) ++-# define WACS_BLOCK (&(acs_map['0'])) ++- ++-# define WACS_S3 (&(acs_map['p'])) ++-# define WACS_S7 (&(acs_map['r'])) ++-# define WACS_LEQUAL (&(acs_map['y'])) ++-# define WACS_GEQUAL (&(acs_map['z'])) ++-# define WACS_PI (&(acs_map['{'])) ++-# define WACS_NEQUAL (&(acs_map['|'])) ++-# define WACS_STERLING (&(acs_map['}'])) ++- ++-# define WACS_BSSB WACS_ULCORNER ++-# define WACS_SSBB WACS_LLCORNER ++-# define WACS_BBSS WACS_URCORNER ++-# define WACS_SBBS WACS_LRCORNER ++-# define WACS_SBSS WACS_RTEE ++-# define WACS_SSSB WACS_LTEE ++-# define WACS_SSBS WACS_BTEE ++-# define WACS_BSSS WACS_TTEE ++-# define WACS_BSBS WACS_HLINE ++-# define WACS_SBSB WACS_VLINE ++-# define WACS_SSSS WACS_PLUS ++-#endif ++- ++-/*** Color macros ***/ ++- ++-#define COLOR_BLACK 0 ++- ++-#ifdef PDC_RGB /* RGB */ ++-# define COLOR_RED 1 ++-# define COLOR_GREEN 2 ++-# define COLOR_BLUE 4 ++-#else /* BGR */ ++-# define COLOR_BLUE 1 ++-# define COLOR_GREEN 2 ++-# define COLOR_RED 4 ++-#endif ++- ++-#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN) ++-#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE) ++-#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN) ++- ++-#define COLOR_WHITE 7 ++- ++-/*---------------------------------------------------------------------- ++- * ++- * Function and Keypad Key Definitions. ++- * Many are just for compatibility. 
++- * ++- */ ++- ++-#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */ ++- ++-#define KEY_BREAK 0x101 /* Not on PC KBD */ ++-#define KEY_DOWN 0x102 /* Down arrow key */ ++-#define KEY_UP 0x103 /* Up arrow key */ ++-#define KEY_LEFT 0x104 /* Left arrow key */ ++-#define KEY_RIGHT 0x105 /* Right arrow key */ ++-#define KEY_HOME 0x106 /* home key */ ++-#define KEY_BACKSPACE 0x107 /* not on pc */ ++-#define KEY_F0 0x108 /* function keys; 64 reserved */ ++- ++-#define KEY_DL 0x148 /* delete line */ ++-#define KEY_IL 0x149 /* insert line */ ++-#define KEY_DC 0x14a /* delete character */ ++-#define KEY_IC 0x14b /* insert char or enter ins mode */ ++-#define KEY_EIC 0x14c /* exit insert char mode */ ++-#define KEY_CLEAR 0x14d /* clear screen */ ++-#define KEY_EOS 0x14e /* clear to end of screen */ ++-#define KEY_EOL 0x14f /* clear to end of line */ ++-#define KEY_SF 0x150 /* scroll 1 line forward */ ++-#define KEY_SR 0x151 /* scroll 1 line back (reverse) */ ++-#define KEY_NPAGE 0x152 /* next page */ ++-#define KEY_PPAGE 0x153 /* previous page */ ++-#define KEY_STAB 0x154 /* set tab */ ++-#define KEY_CTAB 0x155 /* clear tab */ ++-#define KEY_CATAB 0x156 /* clear all tabs */ ++-#define KEY_ENTER 0x157 /* enter or send (unreliable) */ ++-#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */ ++-#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */ ++-#define KEY_PRINT 0x15a /* print/copy */ ++-#define KEY_LL 0x15b /* home down/bottom (lower left) */ ++-#define KEY_ABORT 0x15c /* abort/terminate key (any) */ ++-#define KEY_SHELP 0x15d /* short help */ ++-#define KEY_LHELP 0x15e /* long help */ ++-#define KEY_BTAB 0x15f /* Back tab key */ ++-#define KEY_BEG 0x160 /* beg(inning) key */ ++-#define KEY_CANCEL 0x161 /* cancel key */ ++-#define KEY_CLOSE 0x162 /* close key */ ++-#define KEY_COMMAND 0x163 /* cmd (command) key */ ++-#define KEY_COPY 0x164 /* copy key */ ++-#define KEY_CREATE 0x165 /* create key */ ++-#define KEY_END 0x166 /* end key */ 
++-#define KEY_EXIT 0x167 /* exit key */ ++-#define KEY_FIND 0x168 /* find key */ ++-#define KEY_HELP 0x169 /* help key */ ++-#define KEY_MARK 0x16a /* mark key */ ++-#define KEY_MESSAGE 0x16b /* message key */ ++-#define KEY_MOVE 0x16c /* move key */ ++-#define KEY_NEXT 0x16d /* next object key */ ++-#define KEY_OPEN 0x16e /* open key */ ++-#define KEY_OPTIONS 0x16f /* options key */ ++-#define KEY_PREVIOUS 0x170 /* previous object key */ ++-#define KEY_REDO 0x171 /* redo key */ ++-#define KEY_REFERENCE 0x172 /* ref(erence) key */ ++-#define KEY_REFRESH 0x173 /* refresh key */ ++-#define KEY_REPLACE 0x174 /* replace key */ ++-#define KEY_RESTART 0x175 /* restart key */ ++-#define KEY_RESUME 0x176 /* resume key */ ++-#define KEY_SAVE 0x177 /* save key */ ++-#define KEY_SBEG 0x178 /* shifted beginning key */ ++-#define KEY_SCANCEL 0x179 /* shifted cancel key */ ++-#define KEY_SCOMMAND 0x17a /* shifted command key */ ++-#define KEY_SCOPY 0x17b /* shifted copy key */ ++-#define KEY_SCREATE 0x17c /* shifted create key */ ++-#define KEY_SDC 0x17d /* shifted delete char key */ ++-#define KEY_SDL 0x17e /* shifted delete line key */ ++-#define KEY_SELECT 0x17f /* select key */ ++-#define KEY_SEND 0x180 /* shifted end key */ ++-#define KEY_SEOL 0x181 /* shifted clear line key */ ++-#define KEY_SEXIT 0x182 /* shifted exit key */ ++-#define KEY_SFIND 0x183 /* shifted find key */ ++-#define KEY_SHOME 0x184 /* shifted home key */ ++-#define KEY_SIC 0x185 /* shifted input key */ ++- ++-#define KEY_SLEFT 0x187 /* shifted left arrow key */ ++-#define KEY_SMESSAGE 0x188 /* shifted message key */ ++-#define KEY_SMOVE 0x189 /* shifted move key */ ++-#define KEY_SNEXT 0x18a /* shifted next key */ ++-#define KEY_SOPTIONS 0x18b /* shifted options key */ ++-#define KEY_SPREVIOUS 0x18c /* shifted prev key */ ++-#define KEY_SPRINT 0x18d /* shifted print key */ ++-#define KEY_SREDO 0x18e /* shifted redo key */ ++-#define KEY_SREPLACE 0x18f /* shifted replace key */ ++-#define KEY_SRIGHT 
0x190 /* shifted right arrow */ ++-#define KEY_SRSUME 0x191 /* shifted resume key */ ++-#define KEY_SSAVE 0x192 /* shifted save key */ ++-#define KEY_SSUSPEND 0x193 /* shifted suspend key */ ++-#define KEY_SUNDO 0x194 /* shifted undo key */ ++-#define KEY_SUSPEND 0x195 /* suspend key */ ++-#define KEY_UNDO 0x196 /* undo key */ ++- ++-/* PDCurses-specific key definitions -- PC only */ ++- ++-#define ALT_0 0x197 ++-#define ALT_1 0x198 ++-#define ALT_2 0x199 ++-#define ALT_3 0x19a ++-#define ALT_4 0x19b ++-#define ALT_5 0x19c ++-#define ALT_6 0x19d ++-#define ALT_7 0x19e ++-#define ALT_8 0x19f ++-#define ALT_9 0x1a0 ++-#define ALT_A 0x1a1 ++-#define ALT_B 0x1a2 ++-#define ALT_C 0x1a3 ++-#define ALT_D 0x1a4 ++-#define ALT_E 0x1a5 ++-#define ALT_F 0x1a6 ++-#define ALT_G 0x1a7 ++-#define ALT_H 0x1a8 ++-#define ALT_I 0x1a9 ++-#define ALT_J 0x1aa ++-#define ALT_K 0x1ab ++-#define ALT_L 0x1ac ++-#define ALT_M 0x1ad ++-#define ALT_N 0x1ae ++-#define ALT_O 0x1af ++-#define ALT_P 0x1b0 ++-#define ALT_Q 0x1b1 ++-#define ALT_R 0x1b2 ++-#define ALT_S 0x1b3 ++-#define ALT_T 0x1b4 ++-#define ALT_U 0x1b5 ++-#define ALT_V 0x1b6 ++-#define ALT_W 0x1b7 ++-#define ALT_X 0x1b8 ++-#define ALT_Y 0x1b9 ++-#define ALT_Z 0x1ba ++- ++-#define CTL_LEFT 0x1bb /* Control-Left-Arrow */ ++-#define CTL_RIGHT 0x1bc ++-#define CTL_PGUP 0x1bd ++-#define CTL_PGDN 0x1be ++-#define CTL_HOME 0x1bf ++-#define CTL_END 0x1c0 ++- ++-#define KEY_A1 0x1c1 /* upper left on Virtual keypad */ ++-#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */ ++-#define KEY_A3 0x1c3 /* upper right on Vir. keypad */ ++-#define KEY_B1 0x1c4 /* middle left on Virt. keypad */ ++-#define KEY_B2 0x1c5 /* center on Virt. keypad */ ++-#define KEY_B3 0x1c6 /* middle right on Vir. keypad */ ++-#define KEY_C1 0x1c7 /* lower left on Virt. keypad */ ++-#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */ ++-#define KEY_C3 0x1c9 /* lower right on Vir. 
keypad */ ++- ++-#define PADSLASH 0x1ca /* slash on keypad */ ++-#define PADENTER 0x1cb /* enter on keypad */ ++-#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */ ++-#define ALT_PADENTER 0x1cd /* alt-enter on keypad */ ++-#define PADSTOP 0x1ce /* stop on keypad */ ++-#define PADSTAR 0x1cf /* star on keypad */ ++-#define PADMINUS 0x1d0 /* minus on keypad */ ++-#define PADPLUS 0x1d1 /* plus on keypad */ ++-#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */ ++-#define CTL_PADCENTER 0x1d3 /* ctl-enter on keypad */ ++-#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */ ++-#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */ ++-#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */ ++-#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */ ++-#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */ ++-#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */ ++-#define ALT_PADSLASH 0x1da /* alt-slash on keypad */ ++-#define ALT_PADSTAR 0x1db /* alt-star on keypad */ ++-#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */ ++-#define CTL_INS 0x1dd /* ctl-insert */ ++-#define ALT_DEL 0x1de /* alt-delete */ ++-#define ALT_INS 0x1df /* alt-insert */ ++-#define CTL_UP 0x1e0 /* ctl-up arrow */ ++-#define CTL_DOWN 0x1e1 /* ctl-down arrow */ ++-#define CTL_TAB 0x1e2 /* ctl-tab */ ++-#define ALT_TAB 0x1e3 ++-#define ALT_MINUS 0x1e4 ++-#define ALT_EQUAL 0x1e5 ++-#define ALT_HOME 0x1e6 ++-#define ALT_PGUP 0x1e7 ++-#define ALT_PGDN 0x1e8 ++-#define ALT_END 0x1e9 ++-#define ALT_UP 0x1ea /* alt-up arrow */ ++-#define ALT_DOWN 0x1eb /* alt-down arrow */ ++-#define ALT_RIGHT 0x1ec /* alt-right arrow */ ++-#define ALT_LEFT 0x1ed /* alt-left arrow */ ++-#define ALT_ENTER 0x1ee /* alt-enter */ ++-#define ALT_ESC 0x1ef /* alt-escape */ ++-#define ALT_BQUOTE 0x1f0 /* alt-back quote */ ++-#define ALT_LBRACKET 0x1f1 /* alt-left bracket */ ++-#define ALT_RBRACKET 0x1f2 /* alt-right bracket */ ++-#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */ ++-#define ALT_FQUOTE 0x1f4 /* alt-forward quote */ ++-#define 
ALT_COMMA 0x1f5 /* alt-comma */ ++-#define ALT_STOP 0x1f6 /* alt-stop */ ++-#define ALT_FSLASH 0x1f7 /* alt-forward slash */ ++-#define ALT_BKSP 0x1f8 /* alt-backspace */ ++-#define CTL_BKSP 0x1f9 /* ctl-backspace */ ++-#define PAD0 0x1fa /* keypad 0 */ ++- ++-#define CTL_PAD0 0x1fb /* ctl-keypad 0 */ ++-#define CTL_PAD1 0x1fc ++-#define CTL_PAD2 0x1fd ++-#define CTL_PAD3 0x1fe ++-#define CTL_PAD4 0x1ff ++-#define CTL_PAD5 0x200 ++-#define CTL_PAD6 0x201 ++-#define CTL_PAD7 0x202 ++-#define CTL_PAD8 0x203 ++-#define CTL_PAD9 0x204 ++- ++-#define ALT_PAD0 0x205 /* alt-keypad 0 */ ++-#define ALT_PAD1 0x206 ++-#define ALT_PAD2 0x207 ++-#define ALT_PAD3 0x208 ++-#define ALT_PAD4 0x209 ++-#define ALT_PAD5 0x20a ++-#define ALT_PAD6 0x20b ++-#define ALT_PAD7 0x20c ++-#define ALT_PAD8 0x20d ++-#define ALT_PAD9 0x20e ++- ++-#define CTL_DEL 0x20f /* clt-delete */ ++-#define ALT_BSLASH 0x210 /* alt-back slash */ ++-#define CTL_ENTER 0x211 /* ctl-enter */ ++- ++-#define SHF_PADENTER 0x212 /* shift-enter on keypad */ ++-#define SHF_PADSLASH 0x213 /* shift-slash on keypad */ ++-#define SHF_PADSTAR 0x214 /* shift-star on keypad */ ++-#define SHF_PADPLUS 0x215 /* shift-plus on keypad */ ++-#define SHF_PADMINUS 0x216 /* shift-minus on keypad */ ++-#define SHF_UP 0x217 /* shift-up on keypad */ ++-#define SHF_DOWN 0x218 /* shift-down on keypad */ ++-#define SHF_IC 0x219 /* shift-insert on keypad */ ++-#define SHF_DC 0x21a /* shift-delete on keypad */ ++- ++-#define KEY_MOUSE 0x21b /* "mouse" key */ ++-#define KEY_SHIFT_L 0x21c /* Left-shift */ ++-#define KEY_SHIFT_R 0x21d /* Right-shift */ ++-#define KEY_CONTROL_L 0x21e /* Left-control */ ++-#define KEY_CONTROL_R 0x21f /* Right-control */ ++-#define KEY_ALT_L 0x220 /* Left-alt */ ++-#define KEY_ALT_R 0x221 /* Right-alt */ ++-#define KEY_RESIZE 0x222 /* Window resize */ ++-#define KEY_SUP 0x223 /* Shifted up arrow */ ++-#define KEY_SDOWN 0x224 /* Shifted down arrow */ ++- ++-#define KEY_MIN KEY_BREAK /* Minimum curses key value */ 
++-#define KEY_MAX KEY_SDOWN /* Maximum curses key */ ++- ++-#define KEY_F(n) (KEY_F0 + (n)) ++- ++-/*---------------------------------------------------------------------- ++- * ++- * PDCurses Function Declarations ++- * ++- */ ++- ++-/* Standard */ ++- ++-int addch(const chtype); ++-int addchnstr(const chtype *, int); ++-int addchstr(const chtype *); ++-int addnstr(const char *, int); ++-int addstr(const char *); ++-int attroff(chtype); ++-int attron(chtype); ++-int attrset(chtype); ++-int attr_get(attr_t *, short *, void *); ++-int attr_off(attr_t, void *); ++-int attr_on(attr_t, void *); ++-int attr_set(attr_t, short, void *); ++-int baudrate(void); ++-int beep(void); ++-int bkgd(chtype); ++-void bkgdset(chtype); ++-int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype); ++-int box(WINDOW *, chtype, chtype); ++-bool can_change_color(void); ++-int cbreak(void); ++-int chgat(int, attr_t, short, const void *); ++-int clearok(WINDOW *, bool); ++-int clear(void); ++-int clrtobot(void); ++-int clrtoeol(void); ++-int color_content(short, short *, short *, short *); ++-int color_set(short, void *); ++-int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int); ++-int curs_set(int); ++-int def_prog_mode(void); ++-int def_shell_mode(void); ++-int delay_output(int); ++-int delch(void); ++-int deleteln(void); ++-void delscreen(SCREEN *); ++-int delwin(WINDOW *); ++-WINDOW *derwin(WINDOW *, int, int, int, int); ++-int doupdate(void); ++-WINDOW *dupwin(WINDOW *); ++-int echochar(const chtype); ++-int echo(void); ++-int endwin(void); ++-char erasechar(void); ++-int erase(void); ++-void filter(void); ++-int flash(void); ++-int flushinp(void); ++-chtype getbkgd(WINDOW *); ++-int getnstr(char *, int); ++-int getstr(char *); ++-WINDOW *getwin(FILE *); ++-int halfdelay(int); ++-bool has_colors(void); ++-bool has_ic(void); ++-bool has_il(void); ++-int hline(chtype, int); ++-void idcok(WINDOW *, bool); ++-int idlok(WINDOW *, bool); ++-void 
immedok(WINDOW *, bool); ++-int inchnstr(chtype *, int); ++-int inchstr(chtype *); ++-chtype inch(void); ++-int init_color(short, short, short, short); ++-int init_pair(short, short, short); ++-WINDOW *initscr(void); ++-int innstr(char *, int); ++-int insch(chtype); ++-int insdelln(int); ++-int insertln(void); ++-int insnstr(const char *, int); ++-int insstr(const char *); ++-int instr(char *); ++-int intrflush(WINDOW *, bool); ++-bool isendwin(void); ++-bool is_linetouched(WINDOW *, int); ++-bool is_wintouched(WINDOW *); ++-char *keyname(int); ++-int keypad(WINDOW *, bool); ++-char killchar(void); ++-int leaveok(WINDOW *, bool); ++-char *longname(void); ++-int meta(WINDOW *, bool); ++-int move(int, int); ++-int mvaddch(int, int, const chtype); ++-int mvaddchnstr(int, int, const chtype *, int); ++-int mvaddchstr(int, int, const chtype *); ++-int mvaddnstr(int, int, const char *, int); ++-int mvaddstr(int, int, const char *); ++-int mvchgat(int, int, int, attr_t, short, const void *); ++-int mvcur(int, int, int, int); ++-int mvdelch(int, int); ++-int mvderwin(WINDOW *, int, int); ++-int mvgetch(int, int); ++-int mvgetnstr(int, int, char *, int); ++-int mvgetstr(int, int, char *); ++-int mvhline(int, int, chtype, int); ++-chtype mvinch(int, int); ++-int mvinchnstr(int, int, chtype *, int); ++-int mvinchstr(int, int, chtype *); ++-int mvinnstr(int, int, char *, int); ++-int mvinsch(int, int, chtype); ++-int mvinsnstr(int, int, const char *, int); ++-int mvinsstr(int, int, const char *); ++-int mvinstr(int, int, char *); ++-int mvprintw(int, int, const char *, ...); ++-int mvscanw(int, int, const char *, ...); ++-int mvvline(int, int, chtype, int); ++-int mvwaddchnstr(WINDOW *, int, int, const chtype *, int); ++-int mvwaddchstr(WINDOW *, int, int, const chtype *); ++-int mvwaddch(WINDOW *, int, int, const chtype); ++-int mvwaddnstr(WINDOW *, int, int, const char *, int); ++-int mvwaddstr(WINDOW *, int, int, const char *); ++-int mvwchgat(WINDOW *, int, int, int, 
attr_t, short, const void *); ++-int mvwdelch(WINDOW *, int, int); ++-int mvwgetch(WINDOW *, int, int); ++-int mvwgetnstr(WINDOW *, int, int, char *, int); ++-int mvwgetstr(WINDOW *, int, int, char *); ++-int mvwhline(WINDOW *, int, int, chtype, int); ++-int mvwinchnstr(WINDOW *, int, int, chtype *, int); ++-int mvwinchstr(WINDOW *, int, int, chtype *); ++-chtype mvwinch(WINDOW *, int, int); ++-int mvwinnstr(WINDOW *, int, int, char *, int); ++-int mvwinsch(WINDOW *, int, int, chtype); ++-int mvwinsnstr(WINDOW *, int, int, const char *, int); ++-int mvwinsstr(WINDOW *, int, int, const char *); ++-int mvwinstr(WINDOW *, int, int, char *); ++-int mvwin(WINDOW *, int, int); ++-int mvwprintw(WINDOW *, int, int, const char *, ...); ++-int mvwscanw(WINDOW *, int, int, const char *, ...); ++-int mvwvline(WINDOW *, int, int, chtype, int); ++-int napms(int); ++-WINDOW *newpad(int, int); ++-SCREEN *newterm(const char *, FILE *, FILE *); ++-WINDOW *newwin(int, int, int, int); ++-int nl(void); ++-int nocbreak(void); ++-int nodelay(WINDOW *, bool); ++-int noecho(void); ++-int nonl(void); ++-void noqiflush(void); ++-int noraw(void); ++-int notimeout(WINDOW *, bool); ++-int overlay(const WINDOW *, WINDOW *); ++-int overwrite(const WINDOW *, WINDOW *); ++-int pair_content(short, short *, short *); ++-int pechochar(WINDOW *, chtype); ++-int pnoutrefresh(WINDOW *, int, int, int, int, int, int); ++-int prefresh(WINDOW *, int, int, int, int, int, int); ++-int printw(const char *, ...); ++-int putwin(WINDOW *, FILE *); ++-void qiflush(void); ++-int raw(void); ++-int redrawwin(WINDOW *); ++-int refresh(void); ++-int reset_prog_mode(void); ++-int reset_shell_mode(void); ++-int resetty(void); ++-int ripoffline(int, int (*)(WINDOW *, int)); ++-int savetty(void); ++-int scanw(const char *, ...); ++-int scr_dump(const char *); ++-int scr_init(const char *); ++-int scr_restore(const char *); ++-int scr_set(const char *); ++-int scrl(int); ++-int scroll(WINDOW *); ++-int scrollok(WINDOW *, 
bool); ++-SCREEN *set_term(SCREEN *); ++-int setscrreg(int, int); ++-int slk_attroff(const chtype); ++-int slk_attr_off(const attr_t, void *); ++-int slk_attron(const chtype); ++-int slk_attr_on(const attr_t, void *); ++-int slk_attrset(const chtype); ++-int slk_attr_set(const attr_t, short, void *); ++-int slk_clear(void); ++-int slk_color(short); ++-int slk_init(int); ++-char *slk_label(int); ++-int slk_noutrefresh(void); ++-int slk_refresh(void); ++-int slk_restore(void); ++-int slk_set(int, const char *, int); ++-int slk_touch(void); ++-int standend(void); ++-int standout(void); ++-int start_color(void); ++-WINDOW *subpad(WINDOW *, int, int, int, int); ++-WINDOW *subwin(WINDOW *, int, int, int, int); ++-int syncok(WINDOW *, bool); ++-chtype termattrs(void); ++-attr_t term_attrs(void); ++-char *termname(void); ++-void timeout(int); ++-int touchline(WINDOW *, int, int); ++-int touchwin(WINDOW *); ++-int typeahead(int); ++-int untouchwin(WINDOW *); ++-void use_env(bool); ++-int vidattr(chtype); ++-int vid_attr(attr_t, short, void *); ++-int vidputs(chtype, int (*)(int)); ++-int vid_puts(attr_t, short, void *, int (*)(int)); ++-int vline(chtype, int); ++-int vw_printw(WINDOW *, const char *, va_list); ++-int vwprintw(WINDOW *, const char *, va_list); ++-int vw_scanw(WINDOW *, const char *, va_list); ++-int vwscanw(WINDOW *, const char *, va_list); ++-int waddchnstr(WINDOW *, const chtype *, int); ++-int waddchstr(WINDOW *, const chtype *); ++-int waddch(WINDOW *, const chtype); ++-int waddnstr(WINDOW *, const char *, int); ++-int waddstr(WINDOW *, const char *); ++-int wattroff(WINDOW *, chtype); ++-int wattron(WINDOW *, chtype); ++-int wattrset(WINDOW *, chtype); ++-int wattr_get(WINDOW *, attr_t *, short *, void *); ++-int wattr_off(WINDOW *, attr_t, void *); ++-int wattr_on(WINDOW *, attr_t, void *); ++-int wattr_set(WINDOW *, attr_t, short, void *); ++-void wbkgdset(WINDOW *, chtype); ++-int wbkgd(WINDOW *, chtype); ++-int wborder(WINDOW *, chtype, chtype, 
chtype, chtype, ++- chtype, chtype, chtype, chtype); ++-int wchgat(WINDOW *, int, attr_t, short, const void *); ++-int wclear(WINDOW *); ++-int wclrtobot(WINDOW *); ++-int wclrtoeol(WINDOW *); ++-int wcolor_set(WINDOW *, short, void *); ++-void wcursyncup(WINDOW *); ++-int wdelch(WINDOW *); ++-int wdeleteln(WINDOW *); ++-int wechochar(WINDOW *, const chtype); ++-int werase(WINDOW *); ++-int wgetch(WINDOW *); ++-int wgetnstr(WINDOW *, char *, int); ++-int wgetstr(WINDOW *, char *); ++-int whline(WINDOW *, chtype, int); ++-int winchnstr(WINDOW *, chtype *, int); ++-int winchstr(WINDOW *, chtype *); ++-chtype winch(WINDOW *); ++-int winnstr(WINDOW *, char *, int); ++-int winsch(WINDOW *, chtype); ++-int winsdelln(WINDOW *, int); ++-int winsertln(WINDOW *); ++-int winsnstr(WINDOW *, const char *, int); ++-int winsstr(WINDOW *, const char *); ++-int winstr(WINDOW *, char *); ++-int wmove(WINDOW *, int, int); ++-int wnoutrefresh(WINDOW *); ++-int wprintw(WINDOW *, const char *, ...); ++-int wredrawln(WINDOW *, int, int); ++-int wrefresh(WINDOW *); ++-int wscanw(WINDOW *, const char *, ...); ++-int wscrl(WINDOW *, int); ++-int wsetscrreg(WINDOW *, int, int); ++-int wstandend(WINDOW *); ++-int wstandout(WINDOW *); ++-void wsyncdown(WINDOW *); ++-void wsyncup(WINDOW *); ++-void wtimeout(WINDOW *, int); ++-int wtouchln(WINDOW *, int, int, int); ++-int wvline(WINDOW *, chtype, int); ++- ++-/* Wide-character functions */ ++- ++-#ifdef PDC_WIDE ++-int addnwstr(const wchar_t *, int); ++-int addwstr(const wchar_t *); ++-int add_wch(const cchar_t *); ++-int add_wchnstr(const cchar_t *, int); ++-int add_wchstr(const cchar_t *); ++-int border_set(const cchar_t *, const cchar_t *, const cchar_t *, ++- const cchar_t *, const cchar_t *, const cchar_t *, ++- const cchar_t *, const cchar_t *); ++-int box_set(WINDOW *, const cchar_t *, const cchar_t *); ++-int echo_wchar(const cchar_t *); ++-int erasewchar(wchar_t *); ++-int getbkgrnd(cchar_t *); ++-int getcchar(const cchar_t *, wchar_t 
*, attr_t *, short *, void *); ++-int getn_wstr(wint_t *, int); ++-int get_wch(wint_t *); ++-int get_wstr(wint_t *); ++-int hline_set(const cchar_t *, int); ++-int innwstr(wchar_t *, int); ++-int ins_nwstr(const wchar_t *, int); ++-int ins_wch(const cchar_t *); ++-int ins_wstr(const wchar_t *); ++-int inwstr(wchar_t *); ++-int in_wch(cchar_t *); ++-int in_wchnstr(cchar_t *, int); ++-int in_wchstr(cchar_t *); ++-char *key_name(wchar_t); ++-int killwchar(wchar_t *); ++-int mvaddnwstr(int, int, const wchar_t *, int); ++-int mvaddwstr(int, int, const wchar_t *); ++-int mvadd_wch(int, int, const cchar_t *); ++-int mvadd_wchnstr(int, int, const cchar_t *, int); ++-int mvadd_wchstr(int, int, const cchar_t *); ++-int mvgetn_wstr(int, int, wint_t *, int); ++-int mvget_wch(int, int, wint_t *); ++-int mvget_wstr(int, int, wint_t *); ++-int mvhline_set(int, int, const cchar_t *, int); ++-int mvinnwstr(int, int, wchar_t *, int); ++-int mvins_nwstr(int, int, const wchar_t *, int); ++-int mvins_wch(int, int, const cchar_t *); ++-int mvins_wstr(int, int, const wchar_t *); ++-int mvinwstr(int, int, wchar_t *); ++-int mvin_wch(int, int, cchar_t *); ++-int mvin_wchnstr(int, int, cchar_t *, int); ++-int mvin_wchstr(int, int, cchar_t *); ++-int mvvline_set(int, int, const cchar_t *, int); ++-int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int); ++-int mvwaddwstr(WINDOW *, int, int, const wchar_t *); ++-int mvwadd_wch(WINDOW *, int, int, const cchar_t *); ++-int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int); ++-int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *); ++-int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int); ++-int mvwget_wch(WINDOW *, int, int, wint_t *); ++-int mvwget_wstr(WINDOW *, int, int, wint_t *); ++-int mvwhline_set(WINDOW *, int, int, const cchar_t *, int); ++-int mvwinnwstr(WINDOW *, int, int, wchar_t *, int); ++-int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int); ++-int mvwins_wch(WINDOW *, int, int, const cchar_t *); ++-int 
mvwins_wstr(WINDOW *, int, int, const wchar_t *); ++-int mvwin_wch(WINDOW *, int, int, cchar_t *); ++-int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int); ++-int mvwin_wchstr(WINDOW *, int, int, cchar_t *); ++-int mvwinwstr(WINDOW *, int, int, wchar_t *); ++-int mvwvline_set(WINDOW *, int, int, const cchar_t *, int); ++-int pecho_wchar(WINDOW *, const cchar_t*); ++-int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*); ++-int slk_wset(int, const wchar_t *, int); ++-int unget_wch(const wchar_t); ++-int vline_set(const cchar_t *, int); ++-int waddnwstr(WINDOW *, const wchar_t *, int); ++-int waddwstr(WINDOW *, const wchar_t *); ++-int wadd_wch(WINDOW *, const cchar_t *); ++-int wadd_wchnstr(WINDOW *, const cchar_t *, int); ++-int wadd_wchstr(WINDOW *, const cchar_t *); ++-int wbkgrnd(WINDOW *, const cchar_t *); ++-void wbkgrndset(WINDOW *, const cchar_t *); ++-int wborder_set(WINDOW *, const cchar_t *, const cchar_t *, ++- const cchar_t *, const cchar_t *, const cchar_t *, ++- const cchar_t *, const cchar_t *, const cchar_t *); ++-int wecho_wchar(WINDOW *, const cchar_t *); ++-int wgetbkgrnd(WINDOW *, cchar_t *); ++-int wgetn_wstr(WINDOW *, wint_t *, int); ++-int wget_wch(WINDOW *, wint_t *); ++-int wget_wstr(WINDOW *, wint_t *); ++-int whline_set(WINDOW *, const cchar_t *, int); ++-int winnwstr(WINDOW *, wchar_t *, int); ++-int wins_nwstr(WINDOW *, const wchar_t *, int); ++-int wins_wch(WINDOW *, const cchar_t *); ++-int wins_wstr(WINDOW *, const wchar_t *); ++-int winwstr(WINDOW *, wchar_t *); ++-int win_wch(WINDOW *, cchar_t *); ++-int win_wchnstr(WINDOW *, cchar_t *, int); ++-int win_wchstr(WINDOW *, cchar_t *); ++-wchar_t *wunctrl(cchar_t *); ++-int wvline_set(WINDOW *, const cchar_t *, int); ++-#endif ++- ++-/* Quasi-standard */ ++- ++-chtype getattrs(WINDOW *); ++-int getbegx(WINDOW *); ++-int getbegy(WINDOW *); ++-int getmaxx(WINDOW *); ++-int getmaxy(WINDOW *); ++-int getparx(WINDOW *); ++-int getpary(WINDOW *); ++-int getcurx(WINDOW *); 
++-int getcury(WINDOW *); ++-void traceoff(void); ++-void traceon(void); ++-char *unctrl(chtype); ++- ++-int crmode(void); ++-int nocrmode(void); ++-int draino(int); ++-int resetterm(void); ++-int fixterm(void); ++-int saveterm(void); ++-int setsyx(int, int); ++- ++-int mouse_set(unsigned long); ++-int mouse_on(unsigned long); ++-int mouse_off(unsigned long); ++-int request_mouse_pos(void); ++-int map_button(unsigned long); ++-void wmouse_position(WINDOW *, int *, int *); ++-unsigned long getmouse(void); ++-unsigned long getbmap(void); ++- ++-/* ncurses */ ++- ++-int assume_default_colors(int, int); ++-const char *curses_version(void); ++-bool has_key(int); ++-int use_default_colors(void); ++-int wresize(WINDOW *, int, int); ++- ++-int mouseinterval(int); ++-mmask_t mousemask(mmask_t, mmask_t *); ++-bool mouse_trafo(int *, int *, bool); ++-int nc_getmouse(MEVENT *); ++-int ungetmouse(MEVENT *); ++-bool wenclose(const WINDOW *, int, int); ++-bool wmouse_trafo(const WINDOW *, int *, int *, bool); ++- ++-/* PDCurses */ ++- ++-int addrawch(chtype); ++-int insrawch(chtype); ++-bool is_termresized(void); ++-int mvaddrawch(int, int, chtype); ++-int mvdeleteln(int, int); ++-int mvinsertln(int, int); ++-int mvinsrawch(int, int, chtype); ++-int mvwaddrawch(WINDOW *, int, int, chtype); ++-int mvwdeleteln(WINDOW *, int, int); ++-int mvwinsertln(WINDOW *, int, int); ++-int mvwinsrawch(WINDOW *, int, int, chtype); ++-int raw_output(bool); ++-int resize_term(int, int); ++-WINDOW *resize_window(WINDOW *, int, int); ++-int waddrawch(WINDOW *, chtype); ++-int winsrawch(WINDOW *, chtype); ++-char wordchar(void); ++- ++-#ifdef PDC_WIDE ++-wchar_t *slk_wlabel(int); ++-#endif ++- ++-void PDC_debug(const char *, ...); ++-int PDC_ungetch(int); ++-int PDC_set_blink(bool); ++-int PDC_set_line_color(short); ++-void PDC_set_title(const char *); ++- ++-int PDC_clearclipboard(void); ++-int PDC_freeclipboard(char *); ++-int PDC_getclipboard(char **, long *); ++-int PDC_setclipboard(const char *, 
long); ++- ++-unsigned long PDC_get_input_fd(void); ++-unsigned long PDC_get_key_modifiers(void); ++-int PDC_return_key_modifiers(bool); ++-int PDC_save_key_modifiers(bool); ++- ++-#ifdef XCURSES ++-WINDOW *Xinitscr(int, char **); ++-void XCursesExit(void); ++-int sb_init(void); ++-int sb_set_horz(int, int, int); ++-int sb_set_vert(int, int, int); ++-int sb_get_horz(int *, int *, int *); ++-int sb_get_vert(int *, int *, int *); ++-int sb_refresh(void); ++-#endif ++- ++-/*** Functions defined as macros ***/ ++- ++-/* getch() and ungetch() conflict with some DOS libraries */ ++- ++-#define getch() wgetch(stdscr) ++-#define ungetch(ch) PDC_ungetch(ch) ++- ++-#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR) ++-#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT) ++- ++-/* These will _only_ work as macros */ ++- ++-#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w)) ++-#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w)) ++-#define getparyx(w, y, x) (y = getpary(w), x = getparx(w)) ++-#define getyx(w, y, x) (y = getcury(w), x = getcurx(w)) ++- ++-#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \ ++- else getyx(curscr,(y),(x)); } ++- ++-#ifdef NCURSES_MOUSE_VERSION ++-# define getmouse(x) nc_getmouse(x) ++-#endif ++- ++-/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */ ++- ++-#define PDC_CLIP_SUCCESS 0 ++-#define PDC_CLIP_ACCESS_ERROR 1 ++-#define PDC_CLIP_EMPTY 2 ++-#define PDC_CLIP_MEMORY_ERROR 3 ++- ++-/* PDCurses key modifier masks */ ++- ++-#define PDC_KEY_MODIFIER_SHIFT 1 ++-#define PDC_KEY_MODIFIER_CONTROL 2 ++-#define PDC_KEY_MODIFIER_ALT 4 ++-#define PDC_KEY_MODIFIER_NUMLOCK 8 ++- ++-#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) ++-# undef bool ++-} ++-#endif ++- ++-#endif /* __PDCURSES__ */ ++--- python-pysam.orig/samtools/win32/zconf.h +++++ /dev/null ++@@ -1,332 +0,0 @@ ++-/* zconf.h -- configuration of the zlib compression library ++- * Copyright (C) 1995-2005 
Jean-loup Gailly. ++- * For conditions of distribution and use, see copyright notice in zlib.h ++- */ ++- ++-/* @(#) $Id$ */ ++- ++-#ifndef ZCONF_H ++-#define ZCONF_H ++- ++-/* ++- * If you *really* need a unique prefix for all types and library functions, ++- * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. ++- */ ++-#ifdef Z_PREFIX ++-# define deflateInit_ z_deflateInit_ ++-# define deflate z_deflate ++-# define deflateEnd z_deflateEnd ++-# define inflateInit_ z_inflateInit_ ++-# define inflate z_inflate ++-# define inflateEnd z_inflateEnd ++-# define deflateInit2_ z_deflateInit2_ ++-# define deflateSetDictionary z_deflateSetDictionary ++-# define deflateCopy z_deflateCopy ++-# define deflateReset z_deflateReset ++-# define deflateParams z_deflateParams ++-# define deflateBound z_deflateBound ++-# define deflatePrime z_deflatePrime ++-# define inflateInit2_ z_inflateInit2_ ++-# define inflateSetDictionary z_inflateSetDictionary ++-# define inflateSync z_inflateSync ++-# define inflateSyncPoint z_inflateSyncPoint ++-# define inflateCopy z_inflateCopy ++-# define inflateReset z_inflateReset ++-# define inflateBack z_inflateBack ++-# define inflateBackEnd z_inflateBackEnd ++-# define compress z_compress ++-# define compress2 z_compress2 ++-# define compressBound z_compressBound ++-# define uncompress z_uncompress ++-# define adler32 z_adler32 ++-# define crc32 z_crc32 ++-# define get_crc_table z_get_crc_table ++-# define zError z_zError ++- ++-# define alloc_func z_alloc_func ++-# define free_func z_free_func ++-# define in_func z_in_func ++-# define out_func z_out_func ++-# define Byte z_Byte ++-# define uInt z_uInt ++-# define uLong z_uLong ++-# define Bytef z_Bytef ++-# define charf z_charf ++-# define intf z_intf ++-# define uIntf z_uIntf ++-# define uLongf z_uLongf ++-# define voidpf z_voidpf ++-# define voidp z_voidp ++-#endif ++- ++-#if defined(__MSDOS__) && !defined(MSDOS) ++-# define MSDOS ++-#endif ++-#if (defined(OS_2) || 
defined(__OS2__)) && !defined(OS2) ++-# define OS2 ++-#endif ++-#if defined(_WINDOWS) && !defined(WINDOWS) ++-# define WINDOWS ++-#endif ++-#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) ++-# ifndef WIN32 ++-# define WIN32 ++-# endif ++-#endif ++-#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) ++-# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) ++-# ifndef SYS16BIT ++-# define SYS16BIT ++-# endif ++-# endif ++-#endif ++- ++-/* ++- * Compile with -DMAXSEG_64K if the alloc function cannot allocate more ++- * than 64k bytes at a time (needed on systems with 16-bit int). ++- */ ++-#ifdef SYS16BIT ++-# define MAXSEG_64K ++-#endif ++-#ifdef MSDOS ++-# define UNALIGNED_OK ++-#endif ++- ++-#ifdef __STDC_VERSION__ ++-# ifndef STDC ++-# define STDC ++-# endif ++-# if __STDC_VERSION__ >= 199901L ++-# ifndef STDC99 ++-# define STDC99 ++-# endif ++-# endif ++-#endif ++-#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) ++-# define STDC ++-#endif ++-#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) ++-# define STDC ++-#endif ++-#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) ++-# define STDC ++-#endif ++-#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) ++-# define STDC ++-#endif ++- ++-#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). 
*/ ++-# define STDC ++-#endif ++- ++-#ifndef STDC ++-# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ ++-# define const /* note: need a more gentle solution here */ ++-# endif ++-#endif ++- ++-/* Some Mac compilers merge all .h files incorrectly: */ ++-#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) ++-# define NO_DUMMY_DECL ++-#endif ++- ++-/* Maximum value for memLevel in deflateInit2 */ ++-#ifndef MAX_MEM_LEVEL ++-# ifdef MAXSEG_64K ++-# define MAX_MEM_LEVEL 8 ++-# else ++-# define MAX_MEM_LEVEL 9 ++-# endif ++-#endif ++- ++-/* Maximum value for windowBits in deflateInit2 and inflateInit2. ++- * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files ++- * created by gzip. (Files created by minigzip can still be extracted by ++- * gzip.) ++- */ ++-#ifndef MAX_WBITS ++-# define MAX_WBITS 15 /* 32K LZ77 window */ ++-#endif ++- ++-/* The memory requirements for deflate are (in bytes): ++- (1 << (windowBits+2)) + (1 << (memLevel+9)) ++- that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) ++- plus a few kilobytes for small objects. For example, if you want to reduce ++- the default memory requirements from 256K to 128K, compile with ++- make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" ++- Of course this will generally degrade compression (there's no free lunch). ++- ++- The memory requirements for inflate are (in bytes) 1 << windowBits ++- that is, 32K for windowBits=15 (default value) plus a few kilobytes ++- for small objects. ++-*/ ++- ++- /* Type declarations */ ++- ++-#ifndef OF /* function prototypes */ ++-# ifdef STDC ++-# define OF(args) args ++-# else ++-# define OF(args) () ++-# endif ++-#endif ++- ++-/* The following definitions for FAR are needed only for MSDOS mixed ++- * model programming (small or medium model with some far allocations). ++- * This was tested only with MSC; for other MSDOS compilers you may have ++- * to define NO_MEMCPY in zutil.h. 
If you don't need the mixed model, ++- * just define FAR to be empty. ++- */ ++-#ifdef SYS16BIT ++-# if defined(M_I86SM) || defined(M_I86MM) ++- /* MSC small or medium model */ ++-# define SMALL_MEDIUM ++-# ifdef _MSC_VER ++-# define FAR _far ++-# else ++-# define FAR far ++-# endif ++-# endif ++-# if (defined(__SMALL__) || defined(__MEDIUM__)) ++- /* Turbo C small or medium model */ ++-# define SMALL_MEDIUM ++-# ifdef __BORLANDC__ ++-# define FAR _far ++-# else ++-# define FAR far ++-# endif ++-# endif ++-#endif ++- ++-#if defined(WINDOWS) || defined(WIN32) ++- /* If building or using zlib as a DLL, define ZLIB_DLL. ++- * This is not mandatory, but it offers a little performance increase. ++- */ ++-# ifdef ZLIB_DLL ++-# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) ++-# ifdef ZLIB_INTERNAL ++-# define ZEXTERN extern __declspec(dllexport) ++-# else ++-# define ZEXTERN extern __declspec(dllimport) ++-# endif ++-# endif ++-# endif /* ZLIB_DLL */ ++- /* If building or using zlib with the WINAPI/WINAPIV calling convention, ++- * define ZLIB_WINAPI. ++- * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. ++- */ ++-# ifdef ZLIB_WINAPI ++-# ifdef FAR ++-# undef FAR ++-# endif ++-# include ++- /* No need for _export, use ZLIB.DEF instead. */ ++- /* For complete Windows compatibility, use WINAPI, not __stdcall. 
*/ ++-# define ZEXPORT WINAPI ++-# ifdef WIN32 ++-# define ZEXPORTVA WINAPIV ++-# else ++-# define ZEXPORTVA FAR CDECL ++-# endif ++-# endif ++-#endif ++- ++-#if defined (__BEOS__) ++-# ifdef ZLIB_DLL ++-# ifdef ZLIB_INTERNAL ++-# define ZEXPORT __declspec(dllexport) ++-# define ZEXPORTVA __declspec(dllexport) ++-# else ++-# define ZEXPORT __declspec(dllimport) ++-# define ZEXPORTVA __declspec(dllimport) ++-# endif ++-# endif ++-#endif ++- ++-#ifndef ZEXTERN ++-# define ZEXTERN extern ++-#endif ++-#ifndef ZEXPORT ++-# define ZEXPORT ++-#endif ++-#ifndef ZEXPORTVA ++-# define ZEXPORTVA ++-#endif ++- ++-#ifndef FAR ++-# define FAR ++-#endif ++- ++-#if !defined(__MACTYPES__) ++-typedef unsigned char Byte; /* 8 bits */ ++-#endif ++-typedef unsigned int uInt; /* 16 bits or more */ ++-typedef unsigned long uLong; /* 32 bits or more */ ++- ++-#ifdef SMALL_MEDIUM ++- /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ ++-# define Bytef Byte FAR ++-#else ++- typedef Byte FAR Bytef; ++-#endif ++-typedef char FAR charf; ++-typedef int FAR intf; ++-typedef uInt FAR uIntf; ++-typedef uLong FAR uLongf; ++- ++-#ifdef STDC ++- typedef void const *voidpc; ++- typedef void FAR *voidpf; ++- typedef void *voidp; ++-#else ++- typedef Byte const *voidpc; ++- typedef Byte FAR *voidpf; ++- typedef Byte *voidp; ++-#endif ++- ++-#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */ ++-# include /* for off_t */ ++-# include /* for SEEK_* and off_t */ ++-# ifdef VMS ++-# include /* for off_t */ ++-# endif ++-# define z_off_t off_t ++-#endif ++-#ifndef SEEK_SET ++-# define SEEK_SET 0 /* Seek from beginning of file. */ ++-# define SEEK_CUR 1 /* Seek from current position. 
*/ ++-# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ ++-#endif ++-#ifndef z_off_t ++-# define z_off_t long ++-#endif ++- ++-#if defined(__OS400__) ++-# define NO_vsnprintf ++-#endif ++- ++-#if defined(__MVS__) ++-# define NO_vsnprintf ++-# ifdef FAR ++-# undef FAR ++-# endif ++-#endif ++- ++-/* MVS linker does not support external names larger than 8 bytes */ ++-#if defined(__MVS__) ++-# pragma map(deflateInit_,"DEIN") ++-# pragma map(deflateInit2_,"DEIN2") ++-# pragma map(deflateEnd,"DEEND") ++-# pragma map(deflateBound,"DEBND") ++-# pragma map(inflateInit_,"ININ") ++-# pragma map(inflateInit2_,"ININ2") ++-# pragma map(inflateEnd,"INEND") ++-# pragma map(inflateSync,"INSY") ++-# pragma map(inflateSetDictionary,"INSEDI") ++-# pragma map(compressBound,"CMBND") ++-# pragma map(inflate_table,"INTABL") ++-# pragma map(inflate_fast,"INFA") ++-# pragma map(inflate_copyright,"INCOPY") ++-#endif ++- ++-#endif /* ZCONF_H */ ++--- python-pysam.orig/samtools/win32/zlib.h +++++ /dev/null ++@@ -1,1357 +0,0 @@ ++-/* zlib.h -- interface of the 'zlib' general purpose compression library ++- version 1.2.3, July 18th, 2005 ++- ++- Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler ++- ++- This software is provided 'as-is', without any express or implied ++- warranty. In no event will the authors be held liable for any damages ++- arising from the use of this software. ++- ++- Permission is granted to anyone to use this software for any purpose, ++- including commercial applications, and to alter it and redistribute it ++- freely, subject to the following restrictions: ++- ++- 1. The origin of this software must not be misrepresented; you must not ++- claim that you wrote the original software. If you use this software ++- in a product, an acknowledgment in the product documentation would be ++- appreciated but is not required. ++- 2. Altered source versions must be plainly marked as such, and must not be ++- misrepresented as being the original software. ++- 3. 
This notice may not be removed or altered from any source distribution. ++- ++- Jean-loup Gailly Mark Adler ++- jloup@gzip.org madler@alumni.caltech.edu ++- ++- ++- The data format used by the zlib library is described by RFCs (Request for ++- Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt ++- (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). ++-*/ ++- ++-#ifndef ZLIB_H ++-#define ZLIB_H ++- ++-#include "zconf.h" ++- ++-#ifdef __cplusplus ++-extern "C" { ++-#endif ++- ++-#define ZLIB_VERSION "1.2.3" ++-#define ZLIB_VERNUM 0x1230 ++- ++-/* ++- The 'zlib' compression library provides in-memory compression and ++- decompression functions, including integrity checks of the uncompressed ++- data. This version of the library supports only one compression method ++- (deflation) but other algorithms will be added later and will have the same ++- stream interface. ++- ++- Compression can be done in a single step if the buffers are large ++- enough (for example if an input file is mmap'ed), or can be done by ++- repeated calls of the compression function. In the latter case, the ++- application must provide more input and/or consume the output ++- (providing more output space) before each call. ++- ++- The compressed data format used by default by the in-memory functions is ++- the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped ++- around a deflate stream, which is itself documented in RFC 1951. ++- ++- The library also supports reading and writing files in gzip (.gz) format ++- with an interface similar to that of stdio using the functions that start ++- with "gz". The gzip format is different from the zlib format. gzip is a ++- gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. ++- ++- This library can optionally read and write gzip streams in memory as well. ++- ++- The zlib format was designed to be compact and fast for use in memory ++- and on communications channels. 
The gzip format was designed for single- ++- file compression on file systems, has a larger header than zlib to maintain ++- directory information, and uses a different, slower check method than zlib. ++- ++- The library does not install any signal handler. The decoder checks ++- the consistency of the compressed data, so the library should never ++- crash even in case of corrupted input. ++-*/ ++- ++-typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); ++-typedef void (*free_func) OF((voidpf opaque, voidpf address)); ++- ++-struct internal_state; ++- ++-typedef struct z_stream_s { ++- Bytef *next_in; /* next input byte */ ++- uInt avail_in; /* number of bytes available at next_in */ ++- uLong total_in; /* total nb of input bytes read so far */ ++- ++- Bytef *next_out; /* next output byte should be put there */ ++- uInt avail_out; /* remaining free space at next_out */ ++- uLong total_out; /* total nb of bytes output so far */ ++- ++- char *msg; /* last error message, NULL if no error */ ++- struct internal_state FAR *state; /* not visible by applications */ ++- ++- alloc_func zalloc; /* used to allocate the internal state */ ++- free_func zfree; /* used to free the internal state */ ++- voidpf opaque; /* private data object passed to zalloc and zfree */ ++- ++- int data_type; /* best guess about the data type: binary or text */ ++- uLong adler; /* adler32 value of the uncompressed data */ ++- uLong reserved; /* reserved for future use */ ++-} z_stream; ++- ++-typedef z_stream FAR *z_streamp; ++- ++-/* ++- gzip header information passed to and from zlib routines. See RFC 1952 ++- for more details on the meanings of these fields. 
++-*/ ++-typedef struct gz_header_s { ++- int text; /* true if compressed data believed to be text */ ++- uLong time; /* modification time */ ++- int xflags; /* extra flags (not used when writing a gzip file) */ ++- int os; /* operating system */ ++- Bytef *extra; /* pointer to extra field or Z_NULL if none */ ++- uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ ++- uInt extra_max; /* space at extra (only when reading header) */ ++- Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ ++- uInt name_max; /* space at name (only when reading header) */ ++- Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ ++- uInt comm_max; /* space at comment (only when reading header) */ ++- int hcrc; /* true if there was or will be a header crc */ ++- int done; /* true when done reading gzip header (not used ++- when writing a gzip file) */ ++-} gz_header; ++- ++-typedef gz_header FAR *gz_headerp; ++- ++-/* ++- The application must update next_in and avail_in when avail_in has ++- dropped to zero. It must update next_out and avail_out when avail_out ++- has dropped to zero. The application must initialize zalloc, zfree and ++- opaque before calling the init function. All other fields are set by the ++- compression library and must not be updated by the application. ++- ++- The opaque value provided by the application will be passed as the first ++- parameter for calls of zalloc and zfree. This can be useful for custom ++- memory management. The compression library attaches no meaning to the ++- opaque value. ++- ++- zalloc must return Z_NULL if there is not enough memory for the object. ++- If zlib is used in a multi-threaded application, zalloc and zfree must be ++- thread safe. ++- ++- On 16-bit systems, the functions zalloc and zfree must be able to allocate ++- exactly 65536 bytes, but will not be required to allocate more than this ++- if the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, ++- pointers returned by zalloc for objects of exactly 65536 bytes *must* ++- have their offset normalized to zero. The default allocation function ++- provided by this library ensures this (see zutil.c). To reduce memory ++- requirements and avoid any allocation of 64K objects, at the expense of ++- compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). ++- ++- The fields total_in and total_out can be used for statistics or ++- progress reports. After compression, total_in holds the total size of ++- the uncompressed data and may be saved for use in the decompressor ++- (particularly if the decompressor wants to decompress everything in ++- a single step). ++-*/ ++- ++- /* constants */ ++- ++-#define Z_NO_FLUSH 0 ++-#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ ++-#define Z_SYNC_FLUSH 2 ++-#define Z_FULL_FLUSH 3 ++-#define Z_FINISH 4 ++-#define Z_BLOCK 5 ++-/* Allowed flush values; see deflate() and inflate() below for details */ ++- ++-#define Z_OK 0 ++-#define Z_STREAM_END 1 ++-#define Z_NEED_DICT 2 ++-#define Z_ERRNO (-1) ++-#define Z_STREAM_ERROR (-2) ++-#define Z_DATA_ERROR (-3) ++-#define Z_MEM_ERROR (-4) ++-#define Z_BUF_ERROR (-5) ++-#define Z_VERSION_ERROR (-6) ++-/* Return codes for the compression/decompression functions. Negative ++- * values are errors, positive values are used for special but normal events. 
++- */ ++- ++-#define Z_NO_COMPRESSION 0 ++-#define Z_BEST_SPEED 1 ++-#define Z_BEST_COMPRESSION 9 ++-#define Z_DEFAULT_COMPRESSION (-1) ++-/* compression levels */ ++- ++-#define Z_FILTERED 1 ++-#define Z_HUFFMAN_ONLY 2 ++-#define Z_RLE 3 ++-#define Z_FIXED 4 ++-#define Z_DEFAULT_STRATEGY 0 ++-/* compression strategy; see deflateInit2() below for details */ ++- ++-#define Z_BINARY 0 ++-#define Z_TEXT 1 ++-#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ ++-#define Z_UNKNOWN 2 ++-/* Possible values of the data_type field (though see inflate()) */ ++- ++-#define Z_DEFLATED 8 ++-/* The deflate compression method (the only one supported in this version) */ ++- ++-#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ ++- ++-#define zlib_version zlibVersion() ++-/* for compatibility with versions < 1.0.2 */ ++- ++- /* basic functions */ ++- ++-ZEXTERN const char * ZEXPORT zlibVersion OF((void)); ++-/* The application can compare zlibVersion and ZLIB_VERSION for consistency. ++- If the first character differs, the library code actually used is ++- not compatible with the zlib.h header file used by the application. ++- This check is automatically made by deflateInit and inflateInit. ++- */ ++- ++-/* ++-ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); ++- ++- Initializes the internal stream state for compression. The fields ++- zalloc, zfree and opaque must be initialized before by the caller. ++- If zalloc and zfree are set to Z_NULL, deflateInit updates them to ++- use default allocation functions. ++- ++- The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: ++- 1 gives best speed, 9 gives best compression, 0 gives no compression at ++- all (the input data is simply copied a block at a time). ++- Z_DEFAULT_COMPRESSION requests a default compromise between speed and ++- compression (currently equivalent to level 6). 
++- ++- deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not ++- enough memory, Z_STREAM_ERROR if level is not a valid compression level, ++- Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible ++- with the version assumed by the caller (ZLIB_VERSION). ++- msg is set to null if there is no error message. deflateInit does not ++- perform any compression: this will be done by deflate(). ++-*/ ++- ++- ++-ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); ++-/* ++- deflate compresses as much data as possible, and stops when the input ++- buffer becomes empty or the output buffer becomes full. It may introduce some ++- output latency (reading input without producing any output) except when ++- forced to flush. ++- ++- The detailed semantics are as follows. deflate performs one or both of the ++- following actions: ++- ++- - Compress more input starting at next_in and update next_in and avail_in ++- accordingly. If not all input can be processed (because there is not ++- enough room in the output buffer), next_in and avail_in are updated and ++- processing will resume at this point for the next call of deflate(). ++- ++- - Provide more output starting at next_out and update next_out and avail_out ++- accordingly. This action is forced if the parameter flush is non zero. ++- Forcing flush frequently degrades the compression ratio, so this parameter ++- should be set only when necessary (in interactive applications). ++- Some output may be provided even if flush is not set. ++- ++- Before the call of deflate(), the application should ensure that at least ++- one of the actions is possible, by providing more input and/or consuming ++- more output, and updating avail_in or avail_out accordingly; avail_out ++- should never be zero before the call. The application can consume the ++- compressed output when it wants, for example when the output buffer is full ++- (avail_out == 0), or after each call of deflate(). 
If deflate returns Z_OK ++- and with zero avail_out, it must be called again after making room in the ++- output buffer because there might be more output pending. ++- ++- Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to ++- decide how much data to accumualte before producing output, in order to ++- maximize compression. ++- ++- If the parameter flush is set to Z_SYNC_FLUSH, all pending output is ++- flushed to the output buffer and the output is aligned on a byte boundary, so ++- that the decompressor can get all input data available so far. (In particular ++- avail_in is zero after the call if enough output space has been provided ++- before the call.) Flushing may degrade compression for some compression ++- algorithms and so it should be used only when necessary. ++- ++- If flush is set to Z_FULL_FLUSH, all output is flushed as with ++- Z_SYNC_FLUSH, and the compression state is reset so that decompression can ++- restart from this point if previous compressed data has been damaged or if ++- random access is desired. Using Z_FULL_FLUSH too often can seriously degrade ++- compression. ++- ++- If deflate returns with avail_out == 0, this function must be called again ++- with the same value of the flush parameter and more output space (updated ++- avail_out), until the flush is complete (deflate returns with non-zero ++- avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that ++- avail_out is greater than six to avoid repeated flush markers due to ++- avail_out == 0 on return. ++- ++- If the parameter flush is set to Z_FINISH, pending input is processed, ++- pending output is flushed and deflate returns with Z_STREAM_END if there ++- was enough output space; if deflate returns with Z_OK, this function must be ++- called again with Z_FINISH and more output space (updated avail_out) but no ++- more input data, until it returns with Z_STREAM_END or an error. 
After ++- deflate has returned Z_STREAM_END, the only possible operations on the ++- stream are deflateReset or deflateEnd. ++- ++- Z_FINISH can be used immediately after deflateInit if all the compression ++- is to be done in a single step. In this case, avail_out must be at least ++- the value returned by deflateBound (see below). If deflate does not return ++- Z_STREAM_END, then it must be called again as described above. ++- ++- deflate() sets strm->adler to the adler32 checksum of all input read ++- so far (that is, total_in bytes). ++- ++- deflate() may update strm->data_type if it can make a good guess about ++- the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered ++- binary. This field is only for information purposes and does not affect ++- the compression algorithm in any manner. ++- ++- deflate() returns Z_OK if some progress has been made (more input ++- processed or more output produced), Z_STREAM_END if all input has been ++- consumed and all output has been produced (only when flush is set to ++- Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example ++- if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible ++- (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not ++- fatal, and deflate() can be called again with more input and more output ++- space to continue compressing. ++-*/ ++- ++- ++-ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); ++-/* ++- All dynamically allocated data structures for this stream are freed. ++- This function discards any unprocessed input and does not flush any ++- pending output. ++- ++- deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the ++- stream state was inconsistent, Z_DATA_ERROR if the stream was freed ++- prematurely (some input or output was discarded). In the error case, ++- msg may be set but then points to a static string (which must not be ++- deallocated). 
++-*/ ++- ++- ++-/* ++-ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); ++- ++- Initializes the internal stream state for decompression. The fields ++- next_in, avail_in, zalloc, zfree and opaque must be initialized before by ++- the caller. If next_in is not Z_NULL and avail_in is large enough (the exact ++- value depends on the compression method), inflateInit determines the ++- compression method from the zlib header and allocates all data structures ++- accordingly; otherwise the allocation will be deferred to the first call of ++- inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to ++- use default allocation functions. ++- ++- inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough ++- memory, Z_VERSION_ERROR if the zlib library version is incompatible with the ++- version assumed by the caller. msg is set to null if there is no error ++- message. inflateInit does not perform any decompression apart from reading ++- the zlib header if present: this will be done by inflate(). (So next_in and ++- avail_in may be modified, but next_out and avail_out are unchanged.) ++-*/ ++- ++- ++-ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); ++-/* ++- inflate decompresses as much data as possible, and stops when the input ++- buffer becomes empty or the output buffer becomes full. It may introduce ++- some output latency (reading input without producing any output) except when ++- forced to flush. ++- ++- The detailed semantics are as follows. inflate performs one or both of the ++- following actions: ++- ++- - Decompress more input starting at next_in and update next_in and avail_in ++- accordingly. If not all input can be processed (because there is not ++- enough room in the output buffer), next_in is updated and processing ++- will resume at this point for the next call of inflate(). ++- ++- - Provide more output starting at next_out and update next_out and avail_out ++- accordingly. 
inflate() provides as much output as possible, until there ++- is no more input data or no more space in the output buffer (see below ++- about the flush parameter). ++- ++- Before the call of inflate(), the application should ensure that at least ++- one of the actions is possible, by providing more input and/or consuming ++- more output, and updating the next_* and avail_* values accordingly. ++- The application can consume the uncompressed output when it wants, for ++- example when the output buffer is full (avail_out == 0), or after each ++- call of inflate(). If inflate returns Z_OK and with zero avail_out, it ++- must be called again after making room in the output buffer because there ++- might be more output pending. ++- ++- The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, ++- Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much ++- output as possible to the output buffer. Z_BLOCK requests that inflate() stop ++- if and when it gets to the next deflate block boundary. When decoding the ++- zlib or gzip format, this will cause inflate() to return immediately after ++- the header and before the first block. When doing a raw inflate, inflate() ++- will go ahead and process the first block, and will return when it gets to ++- the end of that block, or when it runs out of data. ++- ++- The Z_BLOCK option assists in appending to or combining deflate streams. ++- Also to assist in this, on return inflate() will set strm->data_type to the ++- number of unused bits in the last byte taken from strm->next_in, plus 64 ++- if inflate() is currently decoding the last block in the deflate stream, ++- plus 128 if inflate() returned immediately after decoding an end-of-block ++- code or decoding the complete header up to just before the first byte of the ++- deflate stream. The end-of-block will not be indicated until all of the ++- uncompressed data from that block has been written to strm->next_out. 
The ++- number of unused bits may in general be greater than seven, except when ++- bit 7 of data_type is set, in which case the number of unused bits will be ++- less than eight. ++- ++- inflate() should normally be called until it returns Z_STREAM_END or an ++- error. However if all decompression is to be performed in a single step ++- (a single call of inflate), the parameter flush should be set to ++- Z_FINISH. In this case all pending input is processed and all pending ++- output is flushed; avail_out must be large enough to hold all the ++- uncompressed data. (The size of the uncompressed data may have been saved ++- by the compressor for this purpose.) The next operation on this stream must ++- be inflateEnd to deallocate the decompression state. The use of Z_FINISH ++- is never required, but can be used to inform inflate that a faster approach ++- may be used for the single inflate() call. ++- ++- In this implementation, inflate() always flushes as much output as ++- possible to the output buffer, and always uses the faster approach on the ++- first call. So the only effect of the flush parameter in this implementation ++- is on the return value of inflate(), as noted below, or when it returns early ++- because Z_BLOCK is used. ++- ++- If a preset dictionary is needed after this call (see inflateSetDictionary ++- below), inflate sets strm->adler to the adler32 checksum of the dictionary ++- chosen by the compressor and returns Z_NEED_DICT; otherwise it sets ++- strm->adler to the adler32 checksum of all output produced so far (that is, ++- total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described ++- below. At the end of the stream, inflate() checks that its computed adler32 ++- checksum is equal to that saved by the compressor and returns Z_STREAM_END ++- only if the checksum is correct. ++- ++- inflate() will decompress and check either zlib-wrapped or gzip-wrapped ++- deflate data. The header type is detected automatically. 
Any information ++- contained in the gzip header is not retained, so applications that need that ++- information should instead use raw inflate, see inflateInit2() below, or ++- inflateBack() and perform their own processing of the gzip header and ++- trailer. ++- ++- inflate() returns Z_OK if some progress has been made (more input processed ++- or more output produced), Z_STREAM_END if the end of the compressed data has ++- been reached and all uncompressed output has been produced, Z_NEED_DICT if a ++- preset dictionary is needed at this point, Z_DATA_ERROR if the input data was ++- corrupted (input stream not conforming to the zlib format or incorrect check ++- value), Z_STREAM_ERROR if the stream structure was inconsistent (for example ++- if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, ++- Z_BUF_ERROR if no progress is possible or if there was not enough room in the ++- output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and ++- inflate() can be called again with more input and more output space to ++- continue decompressing. If Z_DATA_ERROR is returned, the application may then ++- call inflateSync() to look for a good compression block if a partial recovery ++- of the data is desired. ++-*/ ++- ++- ++-ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); ++-/* ++- All dynamically allocated data structures for this stream are freed. ++- This function discards any unprocessed input and does not flush any ++- pending output. ++- ++- inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state ++- was inconsistent. In the error case, msg may be set but then points to a ++- static string (which must not be deallocated). ++-*/ ++- ++- /* Advanced functions */ ++- ++-/* ++- The following functions are needed only in some special applications. 
++-*/ ++- ++-/* ++-ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, ++- int level, ++- int method, ++- int windowBits, ++- int memLevel, ++- int strategy)); ++- ++- This is another version of deflateInit with more compression options. The ++- fields next_in, zalloc, zfree and opaque must be initialized before by ++- the caller. ++- ++- The method parameter is the compression method. It must be Z_DEFLATED in ++- this version of the library. ++- ++- The windowBits parameter is the base two logarithm of the window size ++- (the size of the history buffer). It should be in the range 8..15 for this ++- version of the library. Larger values of this parameter result in better ++- compression at the expense of memory usage. The default value is 15 if ++- deflateInit is used instead. ++- ++- windowBits can also be -8..-15 for raw deflate. In this case, -windowBits ++- determines the window size. deflate() will then generate raw deflate data ++- with no zlib header or trailer, and will not compute an adler32 check value. ++- ++- windowBits can also be greater than 15 for optional gzip encoding. Add ++- 16 to windowBits to write a simple gzip header and trailer around the ++- compressed data instead of a zlib wrapper. The gzip header will have no ++- file name, no extra data, no comment, no modification time (set to zero), ++- no header crc, and the operating system will be set to 255 (unknown). If a ++- gzip stream is being written, strm->adler is a crc32 instead of an adler32. ++- ++- The memLevel parameter specifies how much memory should be allocated ++- for the internal compression state. memLevel=1 uses minimum memory but ++- is slow and reduces compression ratio; memLevel=9 uses maximum memory ++- for optimal speed. The default value is 8. See zconf.h for total memory ++- usage as a function of windowBits and memLevel. ++- ++- The strategy parameter is used to tune the compression algorithm. 
Use the ++- value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a ++- filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no ++- string match), or Z_RLE to limit match distances to one (run-length ++- encoding). Filtered data consists mostly of small values with a somewhat ++- random distribution. In this case, the compression algorithm is tuned to ++- compress them better. The effect of Z_FILTERED is to force more Huffman ++- coding and less string matching; it is somewhat intermediate between ++- Z_DEFAULT and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as ++- Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy ++- parameter only affects the compression ratio but not the correctness of the ++- compressed output even if it is not set appropriately. Z_FIXED prevents the ++- use of dynamic Huffman codes, allowing for a simpler decoder for special ++- applications. ++- ++- deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough ++- memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid ++- method). msg is set to null if there is no error message. deflateInit2 does ++- not perform any compression: this will be done by deflate(). ++-*/ ++- ++-ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, ++- const Bytef *dictionary, ++- uInt dictLength)); ++-/* ++- Initializes the compression dictionary from the given byte sequence ++- without producing any compressed output. This function must be called ++- immediately after deflateInit, deflateInit2 or deflateReset, before any ++- call of deflate. The compressor and decompressor must use exactly the same ++- dictionary (see inflateSetDictionary). ++- ++- The dictionary should consist of strings (byte sequences) that are likely ++- to be encountered later in the data to be compressed, with the most commonly ++- used strings preferably put towards the end of the dictionary. 
Using a ++- dictionary is most useful when the data to be compressed is short and can be ++- predicted with good accuracy; the data can then be compressed better than ++- with the default empty dictionary. ++- ++- Depending on the size of the compression data structures selected by ++- deflateInit or deflateInit2, a part of the dictionary may in effect be ++- discarded, for example if the dictionary is larger than the window size in ++- deflate or deflate2. Thus the strings most likely to be useful should be ++- put at the end of the dictionary, not at the front. In addition, the ++- current implementation of deflate will use at most the window size minus ++- 262 bytes of the provided dictionary. ++- ++- Upon return of this function, strm->adler is set to the adler32 value ++- of the dictionary; the decompressor may later use this value to determine ++- which dictionary has been used by the compressor. (The adler32 value ++- applies to the whole dictionary even if only a subset of the dictionary is ++- actually used by the compressor.) If a raw deflate was requested, then the ++- adler32 value is not computed and strm->adler is not set. ++- ++- deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a ++- parameter is invalid (such as NULL dictionary) or the stream state is ++- inconsistent (for example if deflate has already been called for this stream ++- or if the compression method is bsort). deflateSetDictionary does not ++- perform any compression: this will be done by deflate(). ++-*/ ++- ++-ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, ++- z_streamp source)); ++-/* ++- Sets the destination stream as a complete copy of the source stream. ++- ++- This function can be useful when several compression strategies will be ++- tried, for example when there are several ways of pre-processing the input ++- data with a filter. The streams that will be discarded should then be freed ++- by calling deflateEnd. 
Note that deflateCopy duplicates the internal ++- compression state which can be quite large, so this strategy is slow and ++- can consume lots of memory. ++- ++- deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not ++- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent ++- (such as zalloc being NULL). msg is left unchanged in both source and ++- destination. ++-*/ ++- ++-ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); ++-/* ++- This function is equivalent to deflateEnd followed by deflateInit, ++- but does not free and reallocate all the internal compression state. ++- The stream will keep the same compression level and any other attributes ++- that may have been set by deflateInit2. ++- ++- deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent (such as zalloc or state being NULL). ++-*/ ++- ++-ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, ++- int level, ++- int strategy)); ++-/* ++- Dynamically update the compression level and compression strategy. The ++- interpretation of level and strategy is as in deflateInit2. This can be ++- used to switch between compression and straight copy of the input data, or ++- to switch to a different kind of input data requiring a different ++- strategy. If the compression level is changed, the input available so far ++- is compressed with the old level (and may be flushed); the new level will ++- take effect only at the next call of deflate(). ++- ++- Before the call of deflateParams, the stream state must be set as for ++- a call of deflate(), since the currently available input may have to ++- be compressed and flushed. In particular, strm->avail_out must be non-zero. ++- ++- deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source ++- stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR ++- if strm->avail_out was zero. 
++-*/ ++- ++-ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, ++- int good_length, ++- int max_lazy, ++- int nice_length, ++- int max_chain)); ++-/* ++- Fine tune deflate's internal compression parameters. This should only be ++- used by someone who understands the algorithm used by zlib's deflate for ++- searching for the best matching string, and even then only by the most ++- fanatic optimizer trying to squeeze out the last compressed bit for their ++- specific input data. Read the deflate.c source code for the meaning of the ++- max_lazy, good_length, nice_length, and max_chain parameters. ++- ++- deflateTune() can be called after deflateInit() or deflateInit2(), and ++- returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. ++- */ ++- ++-ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, ++- uLong sourceLen)); ++-/* ++- deflateBound() returns an upper bound on the compressed size after ++- deflation of sourceLen bytes. It must be called after deflateInit() ++- or deflateInit2(). This would be used to allocate an output buffer ++- for deflation in a single pass, and so would be called before deflate(). ++-*/ ++- ++-ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, ++- int bits, ++- int value)); ++-/* ++- deflatePrime() inserts bits in the deflate output stream. The intent ++- is that this function is used to start off the deflate output with the ++- bits leftover from a previous deflate stream when appending to it. As such, ++- this function can only be used for raw deflate, and must be used before the ++- first deflate() call after a deflateInit2() or deflateReset(). bits must be ++- less than or equal to 16, and that many of the least significant bits of ++- value will be inserted in the output. ++- ++- deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent. 
++-*/ ++- ++-ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, ++- gz_headerp head)); ++-/* ++- deflateSetHeader() provides gzip header information for when a gzip ++- stream is requested by deflateInit2(). deflateSetHeader() may be called ++- after deflateInit2() or deflateReset() and before the first call of ++- deflate(). The text, time, os, extra field, name, and comment information ++- in the provided gz_header structure are written to the gzip header (xflag is ++- ignored -- the extra flags are set according to the compression level). The ++- caller must assure that, if not Z_NULL, name and comment are terminated with ++- a zero byte, and that if extra is not Z_NULL, that extra_len bytes are ++- available there. If hcrc is true, a gzip header crc is included. Note that ++- the current versions of the command-line version of gzip (up through version ++- 1.3.x) do not support header crc's, and will report that it is a "multi-part ++- gzip file" and give up. ++- ++- If deflateSetHeader is not used, the default gzip header has text false, ++- the time set to zero, and os set to 255, with no extra, name, or comment ++- fields. The gzip header is returned to the default state by deflateReset(). ++- ++- deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent. ++-*/ ++- ++-/* ++-ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, ++- int windowBits)); ++- ++- This is another version of inflateInit with an extra parameter. The ++- fields next_in, avail_in, zalloc, zfree and opaque must be initialized ++- before by the caller. ++- ++- The windowBits parameter is the base two logarithm of the maximum window ++- size (the size of the history buffer). It should be in the range 8..15 for ++- this version of the library. The default value is 15 if inflateInit is used ++- instead. 
windowBits must be greater than or equal to the windowBits value ++- provided to deflateInit2() while compressing, or it must be equal to 15 if ++- deflateInit2() was not used. If a compressed stream with a larger window ++- size is given as input, inflate() will return with the error code ++- Z_DATA_ERROR instead of trying to allocate a larger window. ++- ++- windowBits can also be -8..-15 for raw inflate. In this case, -windowBits ++- determines the window size. inflate() will then process raw deflate data, ++- not looking for a zlib or gzip header, not generating a check value, and not ++- looking for any check values for comparison at the end of the stream. This ++- is for use with other formats that use the deflate compressed data format ++- such as zip. Those formats provide their own check values. If a custom ++- format is developed using the raw deflate format for compressed data, it is ++- recommended that a check value such as an adler32 or a crc32 be applied to ++- the uncompressed data as is done in the zlib, gzip, and zip formats. For ++- most applications, the zlib format should be used as is. Note that the comments ++- above on the use in deflateInit2() apply to the magnitude of windowBits. ++- ++- windowBits can also be greater than 15 for optional gzip decoding. Add ++- 32 to windowBits to enable zlib and gzip decoding with automatic header ++- detection, or add 16 to decode only the gzip format (the zlib format will ++- return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is ++- a crc32 instead of an adler32. ++- ++- inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough ++- memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg ++- is set to null if there is no error message. inflateInit2 does not perform ++- any decompression apart from reading the zlib header if present: this will ++- be done by inflate().
(So next_in and avail_in may be modified, but next_out ++- and avail_out are unchanged.) ++-*/ ++- ++-ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, ++- const Bytef *dictionary, ++- uInt dictLength)); ++-/* ++- Initializes the decompression dictionary from the given uncompressed byte ++- sequence. This function must be called immediately after a call of inflate, ++- if that call returned Z_NEED_DICT. The dictionary chosen by the compressor ++- can be determined from the adler32 value returned by that call of inflate. ++- The compressor and decompressor must use exactly the same dictionary (see ++- deflateSetDictionary). For raw inflate, this function can be called ++- immediately after inflateInit2() or inflateReset() and before any call of ++- inflate() to set the dictionary. The application must insure that the ++- dictionary that was used for compression is provided. ++- ++- inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a ++- parameter is invalid (such as NULL dictionary) or the stream state is ++- inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the ++- expected one (incorrect adler32 value). inflateSetDictionary does not ++- perform any decompression: this will be done by subsequent calls of ++- inflate(). ++-*/ ++- ++-ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); ++-/* ++- Skips invalid compressed data until a full flush point (see above the ++- description of deflate with Z_FULL_FLUSH) can be found, or until all ++- available input is skipped. No output is provided. ++- ++- inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR ++- if no more input was provided, Z_DATA_ERROR if no flush point has been found, ++- or Z_STREAM_ERROR if the stream structure was inconsistent. In the success ++- case, the application may save the current value of total_in which ++- indicates where valid compressed data was found.
In the error case, the ++- application may repeatedly call inflateSync, providing more input each time, ++- until success or end of the input data. ++-*/ ++- ++-ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, ++- z_streamp source)); ++-/* ++- Sets the destination stream as a complete copy of the source stream. ++- ++- This function can be useful when randomly accessing a large stream. The ++- first pass through the stream can periodically record the inflate state, ++- allowing restarting inflate at those points when randomly accessing the ++- stream. ++- ++- inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not ++- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent ++- (such as zalloc being NULL). msg is left unchanged in both source and ++- destination. ++-*/ ++- ++-ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); ++-/* ++- This function is equivalent to inflateEnd followed by inflateInit, ++- but does not free and reallocate all the internal decompression state. ++- The stream will keep attributes that may have been set by inflateInit2. ++- ++- inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent (such as zalloc or state being NULL). ++-*/ ++- ++-ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, ++- int bits, ++- int value)); ++-/* ++- This function inserts bits in the inflate input stream. The intent is ++- that this function is used to start inflating at a bit position in the ++- middle of a byte. The provided bits will be used before any bytes are used ++- from next_in. This function should only be used with raw inflate, and ++- should be used before the first inflate() call after inflateInit2() or ++- inflateReset(). bits must be less than or equal to 16, and that many of the ++- least significant bits of value will be inserted in the input. ++- ++- inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent. 
++-*/ ++- ++-ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, ++- gz_headerp head)); ++-/* ++- inflateGetHeader() requests that gzip header information be stored in the ++- provided gz_header structure. inflateGetHeader() may be called after ++- inflateInit2() or inflateReset(), and before the first call of inflate(). ++- As inflate() processes the gzip stream, head->done is zero until the header ++- is completed, at which time head->done is set to one. If a zlib stream is ++- being decoded, then head->done is set to -1 to indicate that there will be ++- no gzip header information forthcoming. Note that Z_BLOCK can be used to ++- force inflate() to return immediately after header processing is complete ++- and before any actual data is decompressed. ++- ++- The text, time, xflags, and os fields are filled in with the gzip header ++- contents. hcrc is set to true if there is a header CRC. (The header CRC ++- was valid if done is set to one.) If extra is not Z_NULL, then extra_max ++- contains the maximum number of bytes to write to extra. Once done is true, ++- extra_len contains the actual extra field length, and extra contains the ++- extra field, or that field truncated if extra_max is less than extra_len. ++- If name is not Z_NULL, then up to name_max characters are written there, ++- terminated with a zero unless the length is greater than name_max. If ++- comment is not Z_NULL, then up to comm_max characters are written there, ++- terminated with a zero unless the length is greater than comm_max. When ++- any of extra, name, or comment are not Z_NULL and the respective field is ++- not present in the header, then that field is set to Z_NULL to signal its ++- absence. This allows the use of deflateSetHeader() with the returned ++- structure to duplicate the header. However if those fields are set to ++- allocated memory, then the application will need to save those pointers ++- elsewhere so that they can be eventually freed. 
++- ++- If inflateGetHeader is not used, then the header information is simply ++- discarded. The header is always checked for validity, including the header ++- CRC if present. inflateReset() will reset the process to discard the header ++- information. The application would need to call inflateGetHeader() again to ++- retrieve the header from the next gzip stream. ++- ++- inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source ++- stream state was inconsistent. ++-*/ ++- ++-/* ++-ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, ++- unsigned char FAR *window)); ++- ++- Initialize the internal stream state for decompression using inflateBack() ++- calls. The fields zalloc, zfree and opaque in strm must be initialized ++- before the call. If zalloc and zfree are Z_NULL, then the default library- ++- derived memory allocation routines are used. windowBits is the base two ++- logarithm of the window size, in the range 8..15. window is a caller ++- supplied buffer of that size. Except for special applications where it is ++- assured that deflate was used with small window sizes, windowBits must be 15 ++- and a 32K byte window must be supplied to be able to decompress general ++- deflate streams. ++- ++- See inflateBack() for the usage of these routines. ++- ++- inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of ++- the parameters are invalid, Z_MEM_ERROR if the internal state could not ++- be allocated, or Z_VERSION_ERROR if the version of the library does not ++- match the version of the header file. ++-*/ ++- ++-typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); ++-typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); ++- ++-ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, ++- in_func in, void FAR *in_desc, ++- out_func out, void FAR *out_desc)); ++-/* ++- inflateBack() does a raw inflate with a single call using a call-back ++- interface for input and output.
This is more efficient than inflate() for ++- file i/o applications in that it avoids copying between the output and the ++- sliding window by simply making the window itself the output buffer. This ++- function trusts the application to not change the output buffer passed by ++- the output function, at least until inflateBack() returns. ++- ++- inflateBackInit() must be called first to allocate the internal state ++- and to initialize the state with the user-provided window buffer. ++- inflateBack() may then be used multiple times to inflate a complete, raw ++- deflate stream with each call. inflateBackEnd() is then called to free ++- the allocated state. ++- ++- A raw deflate stream is one with no zlib or gzip header or trailer. ++- This routine would normally be used in a utility that reads zip or gzip ++- files and writes out uncompressed files. The utility would decode the ++- header and process the trailer on its own, hence this routine expects ++- only the raw deflate stream to decompress. This is different from the ++- normal behavior of inflate(), which expects either a zlib or gzip header and ++- trailer around the deflate stream. ++- ++- inflateBack() uses two subroutines supplied by the caller that are then ++- called by inflateBack() for input and output. inflateBack() calls those ++- routines until it reads a complete deflate stream and writes out all of the ++- uncompressed data, or until it encounters an error. The function's ++- parameters and return types are defined above in the in_func and out_func ++- typedefs. inflateBack() will call in(in_desc, &buf) which should return the ++- number of bytes of provided input, and a pointer to that input in buf. If ++- there is no input available, in() must return zero--buf is ignored in that ++- case--and inflateBack() will return a buffer error. inflateBack() will call ++- out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() ++- should return zero on success, or non-zero on failure. 
If out() returns ++- non-zero, inflateBack() will return with an error. Neither in() nor out() ++- are permitted to change the contents of the window provided to ++- inflateBackInit(), which is also the buffer that out() uses to write from. ++- The length written by out() will be at most the window size. Any non-zero ++- amount of input may be provided by in(). ++- ++- For convenience, inflateBack() can be provided input on the first call by ++- setting strm->next_in and strm->avail_in. If that input is exhausted, then ++- in() will be called. Therefore strm->next_in must be initialized before ++- calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called ++- immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in ++- must also be initialized, and then if strm->avail_in is not zero, input will ++- initially be taken from strm->next_in[0 .. strm->avail_in - 1]. ++- ++- The in_desc and out_desc parameters of inflateBack() are passed as the ++- first parameter of in() and out() respectively when they are called. These ++- descriptors can be optionally used to pass any information that the caller- ++- supplied in() and out() functions need to do their job. ++- ++- On return, inflateBack() will set strm->next_in and strm->avail_in to ++- pass back any unused input that was provided by the last in() call. The ++- return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR ++- if in() or out() returned an error, Z_DATA_ERROR if there was a format ++- error in the deflate stream (in which case strm->msg is set to indicate the ++- nature of the error), or Z_STREAM_ERROR if the stream was not properly ++- initialized. In the case of Z_BUF_ERROR, an input or output error can be ++- distinguished using strm->next_in which will be Z_NULL only if in() returned ++- an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to ++- out() returning non-zero.
(in() will always be called before out(), so ++- strm->next_in is assured to be defined if out() returns non-zero.) Note ++- that inflateBack() cannot return Z_OK. ++-*/ ++- ++-ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); ++-/* ++- All memory allocated by inflateBackInit() is freed. ++- ++- inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream ++- state was inconsistent. ++-*/ ++- ++-ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); ++-/* Return flags indicating compile-time options. ++- ++- Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: ++- 1.0: size of uInt ++- 3.2: size of uLong ++- 5.4: size of voidpf (pointer) ++- 7.6: size of z_off_t ++- ++- Compiler, assembler, and debug options: ++- 8: DEBUG ++- 9: ASMV or ASMINF -- use ASM code ++- 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention ++- 11: 0 (reserved) ++- ++- One-time table building (smaller code, but not thread-safe if true): ++- 12: BUILDFIXED -- build static block decoding tables when needed ++- 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed ++- 14,15: 0 (reserved) ++- ++- Library content (indicates missing functionality): ++- 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking ++- deflate code when not needed) ++- 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect ++- and decode gzip streams (to avoid linking crc code) ++- 18-19: 0 (reserved) ++- ++- Operation variations (changes in library functionality): ++- 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate ++- 21: FASTEST -- deflate algorithm with only one, lowest compression level ++- 22,23: 0 (reserved) ++- ++- The sprintf variant used by gzprintf (zero is best): ++- 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format ++- 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! 
++- 26: 0 = returns value, 1 = void -- 1 means inferred string length returned ++- ++- Remainder: ++- 27-31: 0 (reserved) ++- */ ++- ++- ++- /* utility functions */ ++- ++-/* ++- The following utility functions are implemented on top of the ++- basic stream-oriented functions. To simplify the interface, some ++- default options are assumed (compression level and memory usage, ++- standard memory allocation functions). The source code of these ++- utility functions can easily be modified if you need special options. ++-*/ ++- ++-ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, ++- const Bytef *source, uLong sourceLen)); ++-/* ++- Compresses the source buffer into the destination buffer. sourceLen is ++- the byte length of the source buffer. Upon entry, destLen is the total ++- size of the destination buffer, which must be at least the value returned ++- by compressBound(sourceLen). Upon exit, destLen is the actual size of the ++- compressed buffer. ++- This function can be used to compress a whole file at once if the ++- input file is mmap'ed. ++- compress returns Z_OK if success, Z_MEM_ERROR if there was not ++- enough memory, Z_BUF_ERROR if there was not enough room in the output ++- buffer. ++-*/ ++- ++-ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, ++- const Bytef *source, uLong sourceLen, ++- int level)); ++-/* ++- Compresses the source buffer into the destination buffer. The level ++- parameter has the same meaning as in deflateInit. sourceLen is the byte ++- length of the source buffer. Upon entry, destLen is the total size of the ++- destination buffer, which must be at least the value returned by ++- compressBound(sourceLen). Upon exit, destLen is the actual size of the ++- compressed buffer. ++- ++- compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough ++- memory, Z_BUF_ERROR if there was not enough room in the output buffer, ++- Z_STREAM_ERROR if the level parameter is invalid. 
++-*/ ++- ++-ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); ++-/* ++- compressBound() returns an upper bound on the compressed size after ++- compress() or compress2() on sourceLen bytes. It would be used before ++- a compress() or compress2() call to allocate the destination buffer. ++-*/ ++- ++-ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, ++- const Bytef *source, uLong sourceLen)); ++-/* ++- Decompresses the source buffer into the destination buffer. sourceLen is ++- the byte length of the source buffer. Upon entry, destLen is the total ++- size of the destination buffer, which must be large enough to hold the ++- entire uncompressed data. (The size of the uncompressed data must have ++- been saved previously by the compressor and transmitted to the decompressor ++- by some mechanism outside the scope of this compression library.) ++- Upon exit, destLen is the actual size of the uncompressed buffer. ++- This function can be used to decompress a whole file at once if the ++- input file is mmap'ed. ++- ++- uncompress returns Z_OK if success, Z_MEM_ERROR if there was not ++- enough memory, Z_BUF_ERROR if there was not enough room in the output ++- buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. ++-*/ ++- ++- ++-typedef voidp gzFile; ++- ++-ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); ++-/* ++- Opens a gzip (.gz) file for reading or writing. The mode parameter ++- is as in fopen ("rb" or "wb") but can also include a compression level ++- ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for ++- Huffman only compression as in "wb1h", or 'R' for run-length encoding ++- as in "wb1R". (See the description of deflateInit2 for more information ++- about the strategy parameter.) ++- ++- gzopen can be used to read a file which is not in gzip format; in this ++- case gzread will directly read from the file without decompression.
++- ++- gzopen returns NULL if the file could not be opened or if there was ++- insufficient memory to allocate the (de)compression state; errno ++- can be checked to distinguish the two cases (if errno is zero, the ++- zlib error is Z_MEM_ERROR). */ ++- ++-ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); ++-/* ++- gzdopen() associates a gzFile with the file descriptor fd. File ++- descriptors are obtained from calls like open, dup, creat, pipe or ++- fileno (if the file has been previously opened with fopen). ++- The mode parameter is as in gzopen. ++- The next call of gzclose on the returned gzFile will also close the ++- file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file ++- descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). ++- gzdopen returns NULL if there was insufficient memory to allocate ++- the (de)compression state. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); ++-/* ++- Dynamically update the compression level or strategy. See the description ++- of deflateInit2 for the meaning of these parameters. ++- gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not ++- opened for writing. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); ++-/* ++- Reads the given number of uncompressed bytes from the compressed file. ++- If the input file was not in gzip format, gzread copies the given number ++- of bytes into the buffer. ++- gzread returns the number of uncompressed bytes actually read (0 for ++- end of file, -1 for error). */ ++- ++-ZEXTERN int ZEXPORT gzwrite OF((gzFile file, ++- voidpc buf, unsigned len)); ++-/* ++- Writes the given number of uncompressed bytes into the compressed file. ++- gzwrite returns the number of uncompressed bytes actually written ++- (0 in case of error).
++-*/ ++- ++-ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); ++-/* ++- Converts, formats, and writes the args to the compressed file under ++- control of the format string, as in fprintf. gzprintf returns the number of ++- uncompressed bytes actually written (0 in case of error). The number of ++- uncompressed bytes written is limited to 4095. The caller should assure that ++- this limit is not exceeded. If it is exceeded, then gzprintf() will ++- return an error (0) with nothing written. In this case, there may also be a ++- buffer overflow with unpredictable consequences, which is possible only if ++- zlib was compiled with the insecure functions sprintf() or vsprintf() ++- because the secure snprintf() or vsnprintf() functions were not available. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); ++-/* ++- Writes the given null-terminated string to the compressed file, excluding ++- the terminating null character. ++- gzputs returns the number of characters written, or -1 in case of error. ++-*/ ++- ++-ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); ++-/* ++- Reads bytes from the compressed file until len-1 characters are read, or ++- a newline character is read and transferred to buf, or an end-of-file ++- condition is encountered. The string is then terminated with a null ++- character. ++- gzgets returns buf, or Z_NULL in case of error. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); ++-/* ++- Writes c, converted to an unsigned char, into the compressed file. ++- gzputc returns the value that was written, or -1 in case of error. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); ++-/* ++- Reads one byte from the compressed file. gzgetc returns this byte ++- or -1 in case of end of file or error. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); ++-/* ++- Push one character back onto the stream to be read again later.
++- Only one character of push-back is allowed. gzungetc() returns the ++- character pushed, or -1 on failure. gzungetc() will fail if a ++- character has been pushed but not read yet, or if c is -1. The pushed ++- character will be discarded if the stream is repositioned with gzseek() ++- or gzrewind(). ++-*/ ++- ++-ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); ++-/* ++- Flushes all pending output into the compressed file. The parameter ++- flush is as in the deflate() function. The return value is the zlib ++- error number (see function gzerror below). gzflush returns Z_OK if ++- the flush parameter is Z_FINISH and all output could be flushed. ++- gzflush should be called only when strictly necessary because it can ++- degrade compression. ++-*/ ++- ++-ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, ++- z_off_t offset, int whence)); ++-/* ++- Sets the starting position for the next gzread or gzwrite on the ++- given compressed file. The offset represents a number of bytes in the ++- uncompressed data stream. The whence parameter is defined as in lseek(2); ++- the value SEEK_END is not supported. ++- If the file is opened for reading, this function is emulated but can be ++- extremely slow. If the file is opened for writing, only forward seeks are ++- supported; gzseek then compresses a sequence of zeroes up to the new ++- starting position. ++- ++- gzseek returns the resulting offset location as measured in bytes from ++- the beginning of the uncompressed stream, or -1 in case of error, in ++- particular if the file is opened for writing and the new starting position ++- would be before the current position. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); ++-/* ++- Rewinds the given file. This function is supported only for reading. 
++- ++- gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) ++-*/ ++- ++-ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); ++-/* ++- Returns the starting position for the next gzread or gzwrite on the ++- given compressed file. This position represents a number of bytes in the ++- uncompressed data stream. ++- ++- gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) ++-*/ ++- ++-ZEXTERN int ZEXPORT gzeof OF((gzFile file)); ++-/* ++- Returns 1 when EOF has previously been detected reading the given ++- input stream, otherwise zero. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); ++-/* ++- Returns 1 if file is being read directly without decompression, otherwise ++- zero. ++-*/ ++- ++-ZEXTERN int ZEXPORT gzclose OF((gzFile file)); ++-/* ++- Flushes all pending output if necessary, closes the compressed file ++- and deallocates all the (de)compression state. The return value is the zlib ++- error number (see function gzerror below). ++-*/ ++- ++-ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); ++-/* ++- Returns the error message for the last error which occurred on the ++- given compressed file. errnum is set to zlib error number. If an ++- error occurred in the file system and not in the compression library, ++- errnum is set to Z_ERRNO and the application may consult errno ++- to get the exact error code. ++-*/ ++- ++-ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); ++-/* ++- Clears the error and end-of-file flags for file. This is analogous to the ++- clearerr() function in stdio. This is useful for continuing to read a gzip ++- file that is being written concurrently. ++-*/ ++- ++- /* checksum functions */ ++- ++-/* ++- These functions are not related to compression but are exported ++- anyway because they might be useful in applications using the ++- compression library. 
++-*/ ++- ++-ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); ++-/* ++- Update a running Adler-32 checksum with the bytes buf[0..len-1] and ++- return the updated checksum. If buf is NULL, this function returns ++- the required initial value for the checksum. ++- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed ++- much faster. Usage example: ++- ++- uLong adler = adler32(0L, Z_NULL, 0); ++- ++- while (read_buffer(buffer, length) != EOF) { ++- adler = adler32(adler, buffer, length); ++- } ++- if (adler != original_adler) error(); ++-*/ ++- ++-ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, ++- z_off_t len2)); ++-/* ++- Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 ++- and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for ++- each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of ++- seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. ++-*/ ++- ++-ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); ++-/* ++- Update a running CRC-32 with the bytes buf[0..len-1] and return the ++- updated CRC-32. If buf is NULL, this function returns the required initial ++- value for the for the crc. Pre- and post-conditioning (one's complement) is ++- performed within this function so it shouldn't be done by the application. ++- Usage example: ++- ++- uLong crc = crc32(0L, Z_NULL, 0); ++- ++- while (read_buffer(buffer, length) != EOF) { ++- crc = crc32(crc, buffer, length); ++- } ++- if (crc != original_crc) error(); ++-*/ ++- ++-ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); ++- ++-/* ++- Combine two CRC-32 check values into one. For two sequences of bytes, ++- seq1 and seq2 with lengths len1 and len2, CRC-32 check values were ++- calculated for each, crc1 and crc2. 
crc32_combine() returns the CRC-32 ++- check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and ++- len2. ++-*/ ++- ++- ++- /* various hacks, don't look :) */ ++- ++-/* deflateInit and inflateInit are macros to allow checking the zlib version ++- * and the compiler's view of z_stream: ++- */ ++-ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, ++- const char *version, int stream_size)); ++-ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, ++- const char *version, int stream_size)); ++-ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, ++- int windowBits, int memLevel, ++- int strategy, const char *version, ++- int stream_size)); ++-ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, ++- const char *version, int stream_size)); ++-ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, ++- unsigned char FAR *window, ++- const char *version, ++- int stream_size)); ++-#define deflateInit(strm, level) \ ++- deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) ++-#define inflateInit(strm) \ ++- inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) ++-#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ ++- deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ ++- (strategy), ZLIB_VERSION, sizeof(z_stream)) ++-#define inflateInit2(strm, windowBits) \ ++- inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) ++-#define inflateBackInit(strm, windowBits, window) \ ++- inflateBackInit_((strm), (windowBits), (window), \ ++- ZLIB_VERSION, sizeof(z_stream)) ++- ++- ++-#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) ++- struct internal_state {int dummy;}; /* hack for buggy compilers */ ++-#endif ++- ++-ZEXTERN const char * ZEXPORT zError OF((int)); ++-ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); ++-ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); ++- ++-#ifdef __cplusplus ++-} ++-#endif ++- ++-#endif /* ZLIB_H */ 
++--- python-pysam.orig/tests/pysam_data/example_empty_with_header.sam +++++ /dev/null ++@@ -1 +0,0 @@ ++-@HD VN:1.3 SO:coordinate ++--- python-pysam.orig/tests/pysam_data/rg_with_tab.sam +++++ /dev/null ++@@ -1,3273 +0,0 @@ ++-@SQ SN:chr1 LN:1575 ++-@SQ SN:chr2 LN:1584 ++-@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq ++-EAS56_57:6:190:289:82 69 chr1 100 0 * = 100 0 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; MF:i:192 ++-EAS56_57:6:190:289:82 137 chr1 100 73 35M = 100 0 AGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCAC <<<<<<;<<<<<<<<<<;<<;<<<<;8<6;9;;2; MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:141:80:875 99 chr1 110 99 35M = 265 190 AGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAA <<<<<<<<<<<<<<<<<<<<<<:<<8;<<8+7;-7 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:40:1128:1940 163 chr1 112 99 35M = 291 214 CCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACC <<<<<<<<<<<<<<<<<<<;<<5;;<<<9;;;;7: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:290:319:736 69 chr1 113 0 * = 113 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<:7:<.<<<<7<<.<.<<.9*<4<:<4%74 MF:i:192 ++-EAS51_62:5:290:319:736 137 chr1 113 73 35M = 113 0 CGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCT ==;=======7====6=;==:;;====66=::27: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_597:2:132:493:921 69 chr1 119 0 * = 119 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<<<<<<<<<<<<<<;<<<<77;0<;;6777 MF:i:192 ++-B7_597:2:132:493:921 137 chr1 119 75 35M = 119 0 ACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTG <<<<<<<<<<<<<<<<;<<<<<<<<;<<<<;;88: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:7:283:799:560 163 chr1 121 66 35M = 283 197 GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGAC <<<<+<<<<8<<<+<<<<<;<<:07;8;7402447 MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS192_3:1:225:195:543 99 chr1 123 99 35M = 299 211 GGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTG <<<<<<<<<<<<<<<<<<<<<<<9<<;::388998 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:6:114:714:317 99 chr1 126 99 35M = 311 220 TGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<5;<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:1:70:147:84 163 chr1 128 73 35M = 285 192 CCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGA <<<<<<<<<<<<<<<<<<<<;<<<<<<<<<;(5<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:187:227:818 163 chr1 129 99 35M = 290 196 CAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<;<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:4:77:29:126 99 chr1 131 99 35M = 315 219 GCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCT <<<<<<<<<<3<<<<<<<;;;7<;<<449<-:977 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:4:327:795:103 99 chr1 133 99 35M = 302 204 ACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:3:139:117:262 69 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<7<<<<<<<<<<<<<<<<<<<;<;<<<<<37;3 MF:i:192 ++-EAS114_30:3:139:117:262 137 chr1 135 76 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<;<<<<<<<<<<<<<:<<<<<:<<8<<<<:<: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:5:29:817:854 73 chr1 135 77 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:5:29:817:854 133 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTTTATGTGAAC <<<<<<<<<<<<<<<1..;:;;;;1%407)07&7. MF:i:192 ++-EAS192_3:6:170:169:57 163 chr1 138 99 35M = 296 193 GGCTTGACCTCTGGTGACTGCCAGAGCTGCTGGCC <<<<<;<<<<<<;<<<<<<<<<<<<:<<<<<;;+% MF:i:18 Aq:i:45 NM:i:2 UQ:i:30 H0:i:0 H1:i:1 ++-B7_595:4:84:802:737 99 chr1 140 68 35M = 284 179 CTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAG <<<<<<<<<<;9<9<<<;<<;73;<<<<<37;1+. 
MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_4:7:78:583:670 163 chr1 142 99 35M = 316 209 TAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCT <<<<<<<<<<;;;<;;<<<:7;5;<5;;<2--8-; MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:90:435:691 99 chr1 147 99 35M = 318 206 TCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGT <<<<<<<<<<;<<<;<<<<:<<<;<81;<<1;784 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:3:13:122:187 163 chr1 153 99 35M = 343 225 GACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCCTT <<<<<<<;<;<<<;<<<<:;6<<<<;;;;:<<%%< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS221_1:6:69:735:1915 99 chr1 154 99 35M = 321 202 ACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTT <<<<<<<<;<<<<<<<<<;<<<<;<8<<<<;1:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:5:66:959:1311 163 chr1 159 95 35M = 336 212 CAGAGCTGCTGGCAAGCTAGAGGCCCATCTGGAGC ;;4;;;+;;;-01;;&-;;4;;&;;73)(&**274 MF:i:18 Aq:i:31 NM:i:2 UQ:i:12 H0:i:0 H1:i:1 ++-EAS56_57:6:325:759:288 99 chr1 163 99 35M = 341 213 GCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCT 8<;<<<<81<<<<<;<<;<<<;9<<<<1>><<<< MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:240:264:231 121 chr1 213 66 35M = 213 0 TGTAATGAAAACTATATTTATGCTATTCAGTTCTA 9;,;;62<9<)29<<<;96<<<;<<7<<<<<<;<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:240:264:231 181 chr1 213 0 * = 213 0 CAACAGATCAAGAAGGAGGGGCAATGGACGAGTTA %15+5022))0&<<)0)+7:4+&<0<<:0<<<7<< MF:i:192 ++-EAS1_93:7:14:426:613 99 chr1 214 99 35M = 379 200 GTAATGAAAACTATATTTATGCTATTCAGTTCTAA ======;=;==========;;==3=;==-=<;<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:2:173:995:93 163 chr1 215 99 35M = 382 202 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<<<<<<<<<<<<<7:<<<<;:<:<<<<:7 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:6:195:348:703 163 chr1 215 99 35M = 353 173 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<;<<<<<;:<<<<<<<<<<<<:<1:<:7< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:62:879:264 163 chr1 216 99 35M = 396 215 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:4:83:452:970 99 chr1 216 99 35M = 379 198 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT ==========================;======== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:2:64:1318:1711 99 chr1 218 99 35M = 389 206 TGAAAACTATATTTATGCTATTCAGTTCTAAATAT <<<<<<<<<<<<<<<<7<<<<<<<:<<<<<2<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:8:113:968:19 83 chr1 219 99 35M = 50 -204 GAAAACTATATTTATGCTATTCAGTTCTAAATATA 8;<;8;9<<<<<<<9<:<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:4:160:896:275 163 chr1 220 99 35M = 387 202 AAAACTATATTTATGCTATTCAGTTCTAAATATAG ============<====<==<====<==<==;=:6 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:6:181:191:418 163 chr1 221 99 36M = 387 202 AAACTATATTTATGCTATTCAGTTCTAAATATAGAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<988 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:242:354:637 99 chr1 222 99 36M = 417 231 AACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<6<;; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:1:122:77:789 163 chr1 223 99 35M = 396 208 ACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<:<4<<9<:7<<<:<<<7<<<<<<<<<<9<9<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:5:42:540:501 147 chr1 224 99 36M = 60 -200 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:6:155:12:674 83 chr1 224 99 36M = 52 -208 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<;<<<<;<<<<8<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:4:106:316:452 147 chr1 224 99 36M = 49 -211 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT :<<<<<;<<<<:<<:<<<<<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:5:89:525:113 163 chr1 227 78 40M = 397 210 TATTTATGCTATTCAGTTATAAATATAGAAATTGAAACAG <1<7<6;+0;7;7'<70;-<7<:<:<<5<<:9<5:7:%:7 MF:i:18 Aq:i:39 NM:i:1 UQ:i:12 H0:i:0 H1:i:1 ++-EAS54_65:3:321:311:983 
147 chr1 228 99 35M = 51 -212 ATTTATGCTATTCAGTTCTAAATATAGAAATTGAA ;;4;;<7<<<<<<77<<<<<<<<<<17<<<<<<<< MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_65:8:76:493:708 147 chr1 229 44 35M = 73 -191 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA 5/)63.&1517(544(055(0454&7706566679 MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:2:125:628:79 163 chr1 229 99 35M = 400 205 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA ==================<6<====<<:<==7;:: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:5:78:583:499 83 chr1 229 74 35M = 37 -227 TTTACGCTATTCAGTACTAAATATAGAAATTGAAA &6&9774&<;67<44&-4<;<9<7<<<<<;<<<<< MF:i:18 Aq:i:37 NM:i:2 UQ:i:27 H0:i:0 H1:i:1 ++-EAS54_67:3:175:730:949 83 chr1 230 99 35M = 70 -195 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC <<<<;+<<<<7<;<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:84:275:1572 163 chr1 230 99 35M = 394 199 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC /6;;;4;;;;;;;;7;;4;.4;;;;;6;;;77077 MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:4:248:753:731 99 chr1 231 99 35M = 402 206 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA <<<<<<<<<<<8<<<<<<<<<<<<:<<<<&<:<.: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:9:1289:215 99 chr1 231 99 35M = 394 198 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA ;;;;;;9;;;67;;;;;99;9;;;;;;;;977747 MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:188:802:71 163 chr1 232 99 35M = 415 218 ATGCTATTCAGTTCTAAATATAGAAATTGAAACAG <<<<<<<<<;<<<<<9<<<:<<<:<<<<<<:<<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:7:252:171:323 83 chr1 234 99 35M = 43 -226 GCTATTCAGTTCTAAATATAGAAATTGAAACAGCT ;8<;<=3=6==:====;;======;========== MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:5:223:142:410 147 chr1 235 99 35M = 60 -210 CTATTCAGTTCTAAATATAGAAATTGAAACAGCTG 8;<<<;<<<<;<<<<<<;<;<<<<<<<<<<<<;<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:1:243:10:911 83 chr1 236 99 35M = 63 -208 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;<;;;<4;9:<<<;<<;<<<<<;;<<<<<<<<<<< MF:i:18 Aq:i:64 
NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:6:5:730:1436 163 chr1 236 99 35M = 403 202 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;;;;;;;;;;;;;;;;;;8;;;;;8;;;;;67777 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:2:57:1672:1890 121 chr1 236 75 40M = 236 0 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTA :;;;9<8;;*<<<<<<:<<<<<<<<1:<<<<<<<<<<<7< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:2:57:1672:1890 181 chr1 236 0 * = 236 0 CCCCCCCCCCCCCCCCCAGCCACTGCGGCCCCCCCAGCCA -+)%)'-'+,,<066,))090+:&486083:5&&:<<5<0 MF:i:192 ++-EAS1_105:2:299:360:220 99 chr1 237 99 35M = 403 201 ATTCAGTTCTAAATATAGAAATTGAAACAGCTGTG <<<<<<<9<9<<<<<<<<<<<<<<<<<5<;<0<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:2:24:1037:84 163 chr1 238 99 35M = 415 212 TTCAGTTCTAAATATAGAAATTGAAACAGCTGTGT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<;:<57< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:3:86:823:683 163 chr1 240 99 35M = 408 203 CAGTTCTAAATATAGAAATTGAAACAGCTGTGTTT <<<<<<<<;<<<<<<<<<<<<<<<<<<<<;;9<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:130:568:978 99 chr1 246 88 35M = 434 223 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGAC 7<<;<<;<7<:;<7<<<<<<<<);4;+<7+3+%;< MF:i:18 Aq:i:24 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS114_45:4:73:1208:495 163 chr1 246 99 35M = 431 220 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGCC ;;;;;;;;;;;;;;;;;;;;;;;;5;;;;;37377 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:7:264:642:506 99 chr1 247 99 35M = 420 208 AAATATAGAAATTGAAACAGCTGTGTTTATTGTAT <<;<<<<<<;<<<;:;;:;;<<;<<<<;*+;*&.4 MF:i:18 Aq:i:56 NM:i:3 UQ:i:28 H0:i:1 H1:i:0 ++-EAS114_28:5:104:350:749 163 chr1 247 99 36M = 415 204 AAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <<8<<<<<<<<<<<;<<<<<<<<0;<<<9;<85;;; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_61:6:227:259:597 147 chr1 248 99 35M = 61 -222 AATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <8<;2;9;<;;-92<;;;<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:61 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_32:7:113:809:364 99 chr1 250 99 35M = 413 198 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG 
<<<<<<<<<<<<<<<<<<<<<<<<<<7<;<;<<<4 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:218:877:489 83 chr1 250 86 35M = 80 -205 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG 9<<<8<<<;<9<<<<<<<<<<;<<<<<<<<<<<<< MF:i:18 Aq:i:10 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:7:20:979:96 83 chr1 254 99 35M = 79 -210 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA '9996;(:;-<;1<<<<=<<<<=<<<<<<<<<<<< MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:259:219:114 99 chr1 254 99 35M = 411 192 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA <<<<<<<<<<<<<<<;<<<<<<7<7<<<<<0<<9< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:6:13:1034:1144 99 chr1 256 99 35M = 429 208 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<<<<<<<<<<<<<<<<<<<<<<<3<<<;<<;<++ MF:i:18 Aq:i:69 NM:i:2 UQ:i:48 H0:i:1 H1:i:0 ++-EAS221_1:2:29:1486:672 147 chr1 256 99 35M = 79 -212 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<:<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<++ MF:i:18 Aq:i:29 NM:i:2 UQ:i:54 H0:i:0 H1:i:0 ++-EAS139_11:7:46:695:738 163 chr1 259 74 35M = 428 204 TGAAACAGCTGAGTTTAGCGCCTGTGTTCACATAG <;<<<<;<<),&4<3<<7&7<0;)).3;79;7<;0 MF:i:130 Aq:i:74 NM:i:3 UQ:i:18 H0:i:0 H1:i:0 ++-EAS139_11:8:26:1221:222 163 chr1 261 99 35M = 446 220 AAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:190:727:308 147 chr1 263 99 35M = 103 -195 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ;;<;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:3:284:261:124 83 chr1 263 99 35M = 79 -219 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ===27===.====&===========;;======== MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:141:80:875 147 chr1 265 99 35M = 110 -190 AGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCA 6/<;84<;<;<<<<<<5<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:3:24:1135:563 163 chr1 266 99 40M = 446 220 GCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACC <<<<:<<<<:1:<<<<<<.<<<<<<<<;<;;;43+:30:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS221_1:2:90:986:1224 83 chr1 267 99 35M = 67 -235 CTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAAC <7*37;;;;;;;9<<;<7<<<<<<<<<<<;;<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:287:492:169 99 chr1 269 99 36M = 449 216 GTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAAC <<<7<<<<<<<<<<<<<<<<<<<<<<<8;;<;6<<; MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:1:48:9:409 99 chr1 271 75 18M5I12M = 464 228 GTTTAGTGCCTTTGTTCACATAGACCCCCTTGCAA <<<<<<<<<<<<<:<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:75 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS139_19:1:87:1222:878 163 chr1 272 10 40M = 435 203 TATAGGGCCTTTGTTCAAACCCCTTGCAACAACCTTGAGA &+6<6&<:<<9<1112<<;)9227>>>>>>>>>>>>>;<>>>>><<>>>;<+<>=>>+==>>==<==<=8=><:;8/;7<<<<<<<<;<:<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:240:603:890 83 chr1 740 99 36M = 590 -186 GCTCCCAAGAGGGAAAGCTTTCAACGCTTCTAGCCA ;+&+//&<<<<<<<<<<9<<<8<<<<9<<<<<<<<< MF:i:18 Aq:i:66 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-B7_591:7:129:956:115 163 chr1 740 99 36M = 927 223 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;877- MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:168:528:288 83 chr1 740 99 35M = 570 -205 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCC 8<%<31;<<;<;<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:8:275:851:240 147 chr1 743 99 35M = 561 -217 CCCCAGAGGGAAAGCTTTCAACGTTTCTAGCCATT 66614/&3616630666&66666&66666868666 MF:i:18 Aq:i:31 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS188_7:6:205:873:464 147 chr1 743 99 35M = 552 -226 CCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATT <-((+:+;289<--;<;-;<:;;<<<;;<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:6:37:610:260 163 chr1 745 99 35M = 913 203 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC <<<;<;<<7<<<<<<<<<<<<<<;6<963;;;3;1 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:7:93:945:176 147 chr1 745 99 35M = 582 -198 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC 6;;;8<<3<<8.<;6)<<<<<9<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:6:61:628:681 83 chr1 746 99 
36M = 586 -196 CAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTT 95<<<<<<<<;<<<<;<<<:<<;;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:7:288:552:440 83 chr1 747 87 35M = 560 -222 AGAGGGAACGCTTTCAACTCTTCTAGCCATTTCTT 9<<%'%<<.2<<<<<<<<5:<<<<<<<<<<<<<<< MF:i:18 Aq:i:26 NM:i:2 UQ:i:33 H0:i:0 H1:i:0 ++-EAS56_53:2:170:265:818 163 chr1 748 10 35M = 920 207 GAGGGGAAGCTTTCAACGCTTCTAGCACTTTCTTT <<<<<(5/959<8.<9<8<<<2<&59&&:22:8+( MF:i:18 Aq:i:10 NM:i:3 UQ:i:17 H0:i:0 H1:i:0 ++-B7_595:2:251:121:479 83 chr1 750 99 35M = 572 -213 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<6'..663;&<<;<<9<<<9<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:67:1797:1931 147 chr1 750 99 35M = 562 -223 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:226:302:758 83 chr1 751 99 35M = 556 -230 GGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGG ;<<<<9;<<<<<<<<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:2:163:618:570 83 chr1 751 99 35M = 571 -215 GGAAAGCTGTCAACGCTTCTAGCCATTTCTTTTGG <9774<88&:8<:8<8:8<8<<<<<;88<88<<<< MF:i:18 Aq:i:41 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS1_97:3:73:292:429 99 chr1 752 99 35M = 920 203 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTTGC <<<<<<<<<<7<<;<<<<<<<2<<<5<<<<<:%)< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS1_108:3:82:356:253 99 chr1 752 99 35M = 927 210 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC ===================<========;===39= MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:6:62:386:959 147 chr1 752 99 35M = 594 -193 AAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC %;71131((<<6<92(+<1<<;<-3<8<<;<;;<< MF:i:18 Aq:i:57 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS51_62:3:263:74:407 83 chr1 754 99 35M = 574 -215 AAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCAT ;;88<::+;<)<5<<:<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_597:3:67:620:344 99 chr1 755 99 35M = 905 185 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT <<<<2<:2<<<<<<7<<<<:<<*<<<<<<***3<< MF:i:18 Aq:i:33 NM:i:0 
UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:6:148:776:486 83 chr1 755 99 35M = 578 -212 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT ;:<<<;<<;<<<<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:3:150:933:810 83 chr1 755 99 35M = 593 -197 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT :89===:=:=;;==;==================== MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:4:102:467:897 99 chr1 756 97 35M = 940 219 GCTTTCAACGCTTCTAGCCATTTCTTTTGTCTTTT <<<<9<<<<9<2<<<&,/=====>=>=>>>=>>==>=>>>>>> MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:3:297:637:86 83 chr1 869 99 35M = 704 -200 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG <:75<;<;;<<<<<<;;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS54_65:3:290:558:349 147 chr1 869 99 35M = 719 -185 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG 2;2;;'5&;<<5<<;5/<<<<<7<<;+;<<+1<8< MF:i:18 Aq:i:59 NM:i:1 UQ:i:6 H0:i:1 H1:i:0 ++-EAS1_95:3:308:956:873 99 chr1 870 99 35M = 1068 233 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC <<<<<<<<<<<<<;<;<;1<<<<<.<9<;<<<<+; MF:i:18 Aq:i:31 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_78:7:147:64:416 147 chr1 870 99 35M = 701 -204 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC /;49;:6<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:4:30:812:345 163 chr1 871 99 36M = 1036 201 TCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTC <<<<<<<7<;<<7<;77;3<&0-;<5<;6<1'13<: MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:7:134:243:630 163 chr1 871 99 35M = 1052 216 TCATCTAGGGGAACAGGGAGGCGCACTAATGAGCT <<<:<<<<::1:818;;&::<>.; MF:i:18 Aq:i:35 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS54_81:2:31:98:804 147 chr1 982 99 35M = 805 -212 CTTTACTGTCATAACTATGAAGAGACTATTGCCAG ====;========7===================== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:235:805:373 163 chr1 983 99 35M = 1146 198 TTTACTGTCATAACTATGAAGAGACTATTTCCAGA <<<<<<<<<<<<<<<<<<<<;<;<<<<<<;;<99; MF:i:18 Aq:i:74 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 ++-EAS114_28:5:11:868:62 99 chr1 983 99 36M = 1154 207 TTTACTGTCATAACTATGAAGAGACTATTGCCAGAT 
<<<<<<<<<<<<<<<<<<<<:<<<;<<<<(7:7039 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:2:280:512:316 163 chr1 984 99 35M = 1159 210 TTACTGTCATAACTATGAAGAGACTATTGCCAGCT ==<========6==4==6;;==:===;=2/:+8%6 MF:i:18 Aq:i:68 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS1_93:5:292:122:666 99 chr1 985 99 35M = 1159 209 TACTGTCATAACTATGAAGAGACTATTGTCAGATG <<<<<<6<<<<<<<<8;<<<<<<<<<<3&9+;;(; MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS56_53:1:23:403:981 99 chr1 985 99 35M = 1151 201 TACTGTCATAACTATGAAGAGACTATTGCCAGATG <8<<<;<<<<<<;<<<<<<8;<<<9<9,3;,6(91 MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:7:33:1566:588 99 chr1 985 76 35M = 1166 216 TACTGTCATAACTATGAAGAGCCTATTGCCAGATG <;.;;;;6;;;;6;;29;;;<+9;;;.3;;73797 MF:i:18 Aq:i:37 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 ++-EAS139_11:7:92:367:1495 83 chr1 987 99 35M = 820 -202 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA <8<88<<<<7<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS220_1:8:38:1576:1923 83 chr1 987 99 35M = 822 -200 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA 8;<98<<<<<<<;<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:190:481:295 163 chr1 990 99 35M = 1161 206 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<<<<<<<<<<<<<<<<<<<<<9<<<<<9<7<2:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_32:7:168:117:441 99 chr1 990 99 35M = 1151 196 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<3<<<<<<<<<<<<<<<<<<<+<<17;<;:<995 MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:239:796:221 163 chr1 992 99 35M = 1160 203 ATAACTATGAAGAGACTATTGCCAGCTGACCCCCC <<<7<<7<<7<<7<;<<<<<,;;,+'<+/+99%:' MF:i:18 Aq:i:37 NM:i:4 UQ:i:26 H0:i:0 H1:i:1 ++-EAS220_1:4:69:88:1154 147 chr1 992 99 35M = 805 -222 ATAACTATGAAGAGACTATTGCCAGATGAACCACA <<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:8:34:956:1309 99 chr1 994 99 35M = 1168 209 AACTATGAAGAGACTATTGCCAGATGAACCACACA <<<<<<7<<<<<<<<<<<6<<<<<<<<<<<:<8<8 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS1_108:5:229:717:121 99 chr1 995 99 35M = 1150 190 ACTATGAAGAGACTATTGCCAGATGAACCACACAC =================<)=<4<0=.<<<71;41& MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS219_1:1:67:191:668 99 chr1 995 99 35M = 1134 174 ACTATGAAGAGACTATTGCCAGATGAACCACACCT <<<<<<<<<<<<<<<<<6<<;<;<;<<<<<<6;%2 MF:i:18 Aq:i:74 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS51_64:3:309:303:278 163 chr1 996 99 35M = 1178 217 CTATGAAGAGACTATTGCCAGATGAACCACACATT <<<<<<<<<<<<<<<<+<<+<<7<<<<<5<<<;;; MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:60:1020:1259 99 chr1 996 99 35M = 1157 196 CTATGAAGAGACTATTGCCAGATGAACCACACATT <;<<<<;<<<<<<<<<;<<<<<<<8<<<<<:<:<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:89:1487:520 83 chr1 997 99 35M = 805 -227 TATGAAGAGACTATTGCCAGATGAACCACACATTA 4;;/<<<<<:<;<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:4:15:805:420 163 chr1 998 35 35M = 1164 201 ATGAAGAGACTATTCACATGTGAACCACACATTTA ;73;;;;67.;1<<+*.;*&<4947<&474&*9*( MF:i:130 Aq:i:35 NM:i:4 UQ:i:33 H0:i:0 H1:i:0 ++-EAS56_57:3:119:761:239 147 chr1 999 99 35M = 813 -221 TGAAGAGACTATTGCCAGATGAACCACACATTAAT ;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:142:457:584 99 chr1 999 99 35M = 1160 196 TGAAGAGACTATTTCCAGATGAACCACACATTAAT <<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:7:190:95:706 147 chr1 1078 99 35M = 920 -193 TTGTGTCCATGTACACACGCTGTCCTATGTACTTA 9;97437;<;;<<;<;<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:1:101:825:28 83 chr1 1079 99 35M = 879 -235 TGTGTCCATGTACACACGCTGTCCTATGTACTTAT 0;0'0;<<<<<<8<;<<<<;;3<<;;<<<8<<<<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:4:188:460:1000 99 chr1 1080 99 35M = 1251 206 GTGTCCATGTACACACGCTGTCCTATGTACTTATC <<<<<<<<<<<<<<<<7<<;:4;44<;;:8;;9;; MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:3:268:523:511 99 chr1 1081 99 35M = 1241 195 TGTCCATGTACACACGCTGTCCTATGTACTTATCA 
<<<<<<<<<<<<<<<<<<<<;<<<<6<:9<<3<44 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:6:54:263:585 99 chr1 1081 99 36M = 1254 209 TGTCCATGTACACACGCTGTCCTATGTACTTATCAT <<<<<<<<<<<<<<<<<<<:;<<;<:;::<<;;:;4 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:7:174:987:334 83 chr1 1082 99 35M = 908 -209 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ,;<;;<<<&<<<1<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:6:224:932:942 99 chr1 1082 99 34M = 1250 203 GTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<<<;<<<<7<<(;3, MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:1:12:1296:358 99 chr1 1082 96 35M = 1252 205 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ;;;6;7;7;;;;;7;9;;-*1;9;699/99/7477 MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:2:306:119:56 147 chr1 1083 99 35M = 919 -199 TCCATGTACACACGCTGTCCTATGTACTTATCATG ;;;;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:4:66:179:118 163 chr1 1084 99 35M = 1262 213 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<<<:<<;<<6<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:2:110:584:649 99 chr1 1084 99 35M = 1266 217 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<;<<<<<::<38 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:4:28:315:310 163 chr1 1085 99 35M = 1242 192 CATGTACACACGCTGTCCTATGTACTTATCATGAC <<<<<<<<<<<<<<<<<<<<<:<+.<<.<+7<*17 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:242:4:593 147 chr1 1086 99 35M = 905 -216 ATATACACACGCTGTCCTATGTACTTATCATGACT 1.%55877+8+88808887+7;7;18:8;;;.&;8 MF:i:18 Aq:i:53 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 ++-EAS1_93:1:131:946:353 163 chr1 1087 99 35M = 1249 197 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<;<<<<;;<<<<<<<;<:52;<2 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:4:4:1732:88 99 chr1 1087 99 35M = 1265 213 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<<<<<2<8;8<;<8;<2;2:<:< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-B7_595:4:58:703:72 83 chr1 1088 99 35M = 905 -218 GTACACACGCTGTCCTATGTACTTATCATGACTCT 5&<<7;+95;7'6<<<<<.<<<<<;<<9<7<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:5:113:694:725 163 chr1 1088 99 35M = 1266 213 GTACACACGCTGTCCTATGTACTTATCATGACTCT <<<<<<<<<<<<9<<<<<:<<<<<<<<<<:;;<;; MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:5:278:848:765 147 chr1 1088 99 35M = 920 -203 GTACACACGCTGTCCTATGTACTTATCATGACTCT 7;;<;5<55<<;;<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:234:787:12 163 chr1 1092 97 35M = 1257 200 ACACGCTGGCCTATGTACTTATAATGACTCTATCC <;<<<9<<&+9;3;<993;<9<+94;9&41;08%9 MF:i:18 Aq:i:24 NM:i:2 UQ:i:15 H0:i:0 H1:i:0 ++-EAS218_1:4:15:856:340 147 chr1 1093 99 35M = 936 -192 CACGCTGTCCTATGTACTTATCATGACTCTATCCC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:2:258:266:101 163 chr1 1094 99 35M = 1285 226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA <<<<<<<<<<<<<<<<<<5<;,<-2<<<<;68<<6 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:177:552:234 147 chr1 1094 99 35M = 903 -226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA ::;:=;=99=====;;====;==========<=== MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:1:134:379:893 147 chr1 1095 99 35M = 927 -203 CGCTGTCCTATGTACTTATCATGACTCTATCCCAA 7137::;<<<<<<<;<<<<<<<<<<;<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:8:256:404:584 147 chr1 1096 99 35M = 928 -203 ACTGTCCTATGTACTTATCATGACTCTATCCCAAA &&326+23<3<<<+:<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:3:57:735:151 99 chr1 1121 94 35M = 1314 228 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<<<<8<<8<:<<*<:<<<4<<<;,<<<<:<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:8:142:858:903 147 chr1 1121 99 35M = 943 -213 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<;<<<<9<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:7:247:522:670 83 chr1 1121 99 35M = 960 -196 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC 
;;;9;:<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:3:75:732:442 99 chr1 1121 99 40M = 1293 212 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<;<<<<<9<<<;<<;<<<5<<;8<<<<<<<<;:9%% MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:7:183:645:699 99 chr1 1122 86 35M = 1281 194 TATCCCAAATTCCCAATTACGTCCTATCTTCTTCT <<9<9<<<<<<<<<;<<;<<*175;173<;;;<-/ MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:6:175:437:950 163 chr1 1126 99 35M = 1298 207 CCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:59 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:3:93:1002:845 83 chr1 1129 99 35M = 954 -210 AATTCCCAATTACGTCCTATCTTCTTCTTAGGGAA <<::;;;<<<<<<<<<<<<<<<<;<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:6:50:542:881 163 chr1 1132 99 35M = 1324 227 TCCCAATTACGTCCTATCTTCTTCTTAGGTAAGAA <<<<<4<09<<9<<2<<<<<<<<<<<2/.&2<%<7 MF:i:18 Aq:i:63 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_99:3:118:851:285 83 chr1 1133 99 35M = 953 -215 CCCAATTACGTCCTATCTTCTTCTTAGGGAAGAAC 3+7<<<;<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:67:191:668 147 chr1 1134 99 35M = 995 -174 CCAATTACGTCCTATCTTCTTCTTAGGGAAGAACA <<<<<7<<7<<<<<<<;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:7:166:203:416 83 chr1 1136 99 35M = 963 -208 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC <<<<<<<<::<<<<<<<<<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:2:15:1497:1530 99 chr1 1136 99 35M = 1314 213 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC 0<;;;9;;86<;;;<<&<<.<<;)3;7;654-471 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:8:206:563:262 83 chr1 1137 99 35M = 971 -201 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<7 MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:4:40:352:151 99 chr1 1137 99 35M = 1327 225 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<<<<<<<<<<<<;<<9<<<<:<<<<;<99<3< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-B7_593:7:67:302:762 99 chr1 1138 99 36M = 1313 211 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<;;65;<-<;<:8<<<3 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:5:84:927:843 147 chr1 1138 99 35M = 938 -235 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTT 588;<:<<<<<<<6<<<<;<<<:/<<3<:;<*<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:5:147:479:41 163 chr1 1139 99 35M = 1322 218 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<::6<<;<<<;;9;;6 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:3:329:177:267 83 chr1 1139 99 35M = 962 -212 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:7:72:916:763 163 chr1 1142 99 35M = 1340 233 GTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGT ==7>==9>=7=>=>>=>> MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_65:4:91:267:655 147 chr1 1365 99 35M = 1204 -196 TGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGT ;,:;5:<6:<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:2:91:856:504 99 chr1 1366 99 35M = 1520 189 GTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTT <<<<<<<<<<<<<<7<<<<<<<7<<<&;<<<&&<& MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_108:2:170:326:433 99 chr1 1367 99 35M = 1535 203 TTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGG =====<=9===:=<:==2=======2:===9==/5 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:6:132:717:233 99 chr1 1368 99 35M = 1529 196 TGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGA <<<<<<<<<<<<;<<<<<<<<<<<7<<<<&-<4<1 MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:14:420:712 99 chr1 1368 99 40M = 1525 197 TGTTGGTTTTCTGTTTCTTTGTTTGATTTTTTTGAAGACA <<<<<<<<<<<<;<<<<<<<;<<<-;<<<&,<&*8111:6 MF:i:18 Aq:i:66 NM:i:3 UQ:i:21 H0:i:1 H1:i:0 ++-EAS114_39:4:43:1047:1626 163 chr1 1369 99 35M = 1523 189 GTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAA <<<<<<<<<<<<<<<<<<<:<<<:<<<<:+;-4:( MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:2:20:413:1334 147 chr1 1370 99 35M = 1215 -190 TTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAG 
88878777;:;:1:;9;;;6;;;6;9;;;;;296; MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:154:669:853 83 chr1 1371 99 35M = 1193 -213 TGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGA <::<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:117:857:942 99 chr1 1372 99 35M = 1527 190 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC <<<<<<<<<<<<<<<<<<<<<<<<<:6<;;7;9<; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:145:144:796 147 chr1 1372 99 35M = 1181 -226 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC ;<<<;<<<<<<<<<;<<<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:6:88:162:587 147 chr1 1372 99 35M = 1189 -218 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC 386;;388-<8;<;68<<;;<;<6<<<8<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:8:73:108:1621 99 chr1 1373 99 35M = 1532 194 GTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACA <<<<<<<<71<<<<<<<<<+<<<<70:0<9<<61< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:127:153:861 147 chr1 1374 99 35M = 1202 -207 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT :;:6;9<<1;<<95<<<9<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:152:765:744 163 chr1 1374 99 35M = 1534 195 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT <<<<<<<<<<<<<<<<<<:<<<<<<<<&<7293<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:313:827:992 147 chr1 1379 99 35M = 1197 -217 TGTTTCTTTGTTTGATTTGGTGGAAGACATAATCC '187:1'<75<.*<<:5<..<<*<<917<<7<<17 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_64:3:7:268:263 121 chr1 1381 22 35M = 1381 0 TTGCGTTATTTGAGTTGGTGGAAGACATAATCCCA ',)*&2<$7+<<<'<-<7<<<<<<<7<<7><>;>+>>/;>>=>=>=:>><>=<<==;)<=8; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:85:923:726 147 chr2 199 99 35M = 43 -191 GTAAAGTAACTGAACCTATGAGTCACAGGTATTCC <:<<<%3<<1<<86<<-<<<<<<<<<<<<6<<1<< MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:5:285:241:560 83 chr2 200 99 35M = 37 -198 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT :<<<<;<<,<<<<5<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 ++-EAS114_30:6:41:461:436 163 chr2 200 74 35M = 389 224 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT <<<<<<<<<<<<<<<<<<<:<<<<<<<<;<;;;:; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_61:6:25:949:33 99 chr2 201 99 35M = 383 217 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG =;===/8========*==&;6=&=&:=6&:=::67 MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:2:60:590:1760 99 chr2 201 99 35M = 376 210 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG <:<<<<<2<<<<:<::<<<::<<<<<6<<<<<<<6 MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:6:86:693:234 163 chr2 202 82 35M = 388 221 AAGTAACTGAACCTATGAGTCACAGGTATTCCTGA ;;;;;;;;;;;;;;;;9;;;;;;;;99;;&70777 MF:i:18 Aq:i:18 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:4:223:776 163 chr2 203 93 35M = 387 219 AGTAACTGAACCTATGAGTCACAGGTATTCCTGAG <<<<<<<<<<<<<<<<<<<<<<<<<9<<<:;<;2< MF:i:18 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:3:88:84:1558 99 chr2 203 95 35M = 394 226 AGTAACTGAACCTATGAGTCACAGGTATTCCTGTG <<;<<<<<<<<<<<<;;<<<<<<<::<<<<<<7&< MF:i:18 Aq:i:22 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-B7_597:2:168:829:88 163 chr2 205 99 35M = 369 199 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGA <<<<<<<<<<<<<<<<<<<<<<<6<<<<<<9;4;2 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:1:168:389:889 147 chr2 205 99 36M = 37 -204 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA ;<<;;56;==================8========8 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:5:81:685:141 99 chr2 207 85 34M = 382 210 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAA <<<<<<<<<<<<<<<<<<<<<;;<<;<<<',7,7 MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:4:26:1312:1400 99 chr2 207 99 40M = 385 218 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<;<<<:<<:<;<:<<<;:;<<<<<<:<8<1;;:::88 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:4:127:725:381 83 chr2 209 99 34M = 39 -204 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAA +<<.<<;<;<<<3;<;<<<<<<6<8;<<<<<<<1 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:2:19:736:559 99 chr2 209 99 35M = 370 196 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAAA 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:3:117:284:589 83 chr2 210 99 35M = 43 -202 GAACCTATGAGTCACAGGTATTCCTGAGGAAAAAG ==8==;==================;========== MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:5:120:596:847 163 chr2 211 83 35M = 410 234 AACCTATGAGTCACAGGTATTCCTGAGGAAAAAGA <<<<<<<<<<<<<;<<<9<<<<<<<;:<62;58;2 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-B7_610:5:51:904:391 163 chr2 212 97 35M = 401 224 ACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<<<<<<<<<<<<<<;<<<<<;:;<2<6;;;;; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:8:96:1314:1448 163 chr2 213 93 35M = 388 210 CCTATGAGTCACAGGTATTCCTGAGGAAAAATAAA <<<<<<<<<<<<<<<<<<<<<<<<5<4<<<<-<<< MF:i:18 Aq:i:18 NM:i:1 UQ:i:12 H0:i:1 H1:i:0 ++-EAS139_19:3:73:1158:535 163 chr2 213 99 40M = 377 204 CCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTG <<<<<<<<<<<<<<<<<<<<<<8<;;<<<<<9<<9::8:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:223:583:968 147 chr2 215 88 36M = 47 -204 TATGAGGCACAGGTATTCCTGAGGAAAAAGAAAAAG 1<';<<&%-:<<<<<:66%<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_97:3:160:173:889 163 chr2 215 99 35M = 379 199 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA <<<<<<<<<<<<<<<<<<<<;0<7<<;<<<;7<09 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:1:28:350:895 83 chr2 215 95 35M = 48 -202 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA :<;<<<:;<-<<<<<4;77<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:4:45:707:147 163 chr2 216 99 35M = 424 243 ATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<&<<<<:<<9<<<9<<<<75;;;< MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS220_1:8:18:1757:95 99 chr2 216 45 35M = 374 193 ATGAGTCGCAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<<<<<<<<1<:<<<<<<:<<<;:< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS51_66:6:310:747:415 163 chr2 217 99 35M = 387 205 TGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGT <<<<<<<<<<<<<<<<<<:<<<<<;<<<<<;<;<; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
++-EAS114_28:2:114:938:216 147 chr2 218 99 36M = 63 -191 GAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGA <<<<7<6<<<<<<<6<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:1:179:629:513 163 chr2 220 99 35M = 409 224 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG <<<<<<<<<<<<<<<;<;<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_45:4:88:55:1187 99 chr2 220 66 35M = 391 206 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG ;;<;;;<<99<<;;<;;;;;:;49;:;;;;87898 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:5:119:38:945 99 chr2 221 99 35M = 428 242 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA <<<<<<<<<<<<<<<8<<<<<8<<<8<;<<7<:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:6:67:800:450 147 chr2 221 99 35M = 41 -215 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA 9-<9<;<<<<9;5<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:5:102:915:87 147 chr2 222 99 35M = 65 -192 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA ;<8<;;<<<<7;<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:1:113:367:659 163 chr2 222 72 35M = 390 203 CACAGGTATTCCTGAGGAAAAAGAAAAAGCGAGAA =9====8==========:=:=====9=:=&====5 MF:i:18 Aq:i:0 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS218_1:2:26:211:481 147 chr2 222 99 35M = 43 -214 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<:<<<<<<9:5<<<<<<<<<<<<<<:<:<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:90:1906:1528 83 chr2 222 99 35M = 41 -216 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<<<<<<<<<3:<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:13:100:876 163 chr2 223 73 36M = 397 210 ACAGGGATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <8<<<*<2<7<<<6<<<<<<6<<8<<<<5<<<<4<9 MF:i:18 Aq:i:30 NM:i:1 UQ:i:9 H0:i:0 H1:i:1 ++-EAS56_63:5:117:570:971 163 chr2 223 99 35M = 413 225 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <<<<<<<<<<<<<;;;<<<<6<7;9;<:;<;<;;< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_3:8:50:1203:1094 83 chr2 223 99 35M = 46 -212 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG 
<7<<<<<5:+63<<<<<<<<<<<<<<<<2<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:6:107:395:312 83 chr2 224 99 35M = 44 -215 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT ;<;;<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:29:833:612 83 chr2 224 99 35M = 58 -201 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <<;<<<;<::<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:158:943:467 83 chr2 225 99 35M = 57 -203 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT <:<<;;<:5<<<<<<<<<<<<<<<<<<;<<<;<<< MF:i:18 Aq:i:46 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:201:768:529 163 chr2 225 99 35M = 396 206 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT ==========================1=======; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:11:994:584 99 chr2 226 97 35M = 417 226 GGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTT <<<<<<<<<<<7<<<<<<<<<<<<<6<<<<<<3<6 MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:206:873:186 83 chr2 227 99 35M = 66 -196 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;<<;--7<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:4:38:28:122 83 chr2 227 99 35M = 46 -216 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;9;9;-1<<<<<<<<<<<<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:5:66:372:343 83 chr2 228 99 35M = 40 -223 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG ;<1;89<<<<<;<9<<<<9<<<;8<9<;<<<<<;8 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:3:277:144:848 83 chr2 228 99 35M = 64 -199 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG <<<)63<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:6:21:1601:1666 83 chr2 228 99 40M = 56 -212 TATTACTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAA -;;3&1<<<<<<<<<<<<1<<<<<<<<<<<7<<<<<<<<=<<<<<<<<<<<<<< MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:23:268:529 153 chr2 329 71 35M * 0 0 TGAAAGAGGCTCAAAGAATGCCAGGAAGATACATT 7;<<<<<<57;-<<<<<<:<77<<<<<<<;<;<<< MF:i:32 Aq:i:28 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:2:315:219:7 
153 chr2 330 69 35M * 0 0 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG 7==::<2=8<<<=====>888<=2=>==>,>,>>8 MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:4:63:5:870 83 chr2 330 75 35M = 148 -217 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG :<;<;<<<4:;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:5:243:557:560 163 chr2 331 75 36M = 499 204 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<9<<<<<<<<<<<<<<<<<<;<<89<<9<; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:2:270:430:269 163 chr2 331 99 36M = 519 224 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;7;: MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:6:284:442:747 89 chr2 331 75 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <;<<<<<:<;<<<<<<<<;<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_4:7:71:31:1973 89 chr2 331 76 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <<<<<7<:<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:2:30:466:652 147 chr2 332 98 35M = 163 -204 AAGAGGCTAAAAGAATGCCAGGAAGATACATTGCA <<<<<;3;&<<<<<<<============= MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:4 H1:i:13 ++-EAS114_39:3:88:84:1558 147 chr2 394 95 35M = 203 -226 ATCAGACTATCTAAAGTCAACATGAAGGAAAAAAA ;;<<;<<;<<5<<<<<<;<<:<<<;<<<<<<;<<< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:2 H1:i:3 ++-EAS56_59:2:201:768:529 83 chr2 396 99 35M = 225 -206 CAGACTATCTAAAGTCAACATGAAGGAAAAAAATT 3<:<9<<;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_591:2:13:100:876 83 chr2 397 73 36M = 223 -210 AGAATATATAAAGTCAACATGAAGGAAAAAAATTCT ;9<$<<<$<<<<<<<<75<<<<<<<9<9<<<<<<<< MF:i:18 Aq:i:30 NM:i:2 UQ:i:6 H0:i:1 H1:i:1 ++-EAS139_11:4:26:137:1382 99 chr2 397 99 35M = 579 217 AGACTATCTAAAGTCAACATGAAGGAAAAAAATTC <<<<<<7<<<77<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:1:93:490:901 83 chr2 445 99 35M = 280 -200 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<;<<<;<<<;<<;<<;<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 ++-EAS112_34:7:96:489:453 99 chr2 445 99 35M = 625 215 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;;: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:6:46:13:880 147 chr2 445 99 35M = 290 -190 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA =&====8==========0================= MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:167:905:852 163 chr2 445 99 36M = 647 238 AGAAAAGCATACAGTCATCTATAAAGAAAATCCCAT <<<7<<<<<<<<<<<<<<:<:<<:::&.<:<66:3< MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS219_FC30151:3:13:674:1717 163 chr2 445 99 35M = 623 213 AGAAAAGCATGCAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<%<<<<<<<<<<<<<<<<<<<;:;;; MF:i:18 Aq:i:45 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 ++-EAS51_62:7:196:511:896 83 chr2 446 99 35M = 283 -198 GAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 8<<<<<;<<<:<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:52 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:1:154:118:488 163 chr2 447 99 35M = 624 212 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<<<<<<<<<<<<<<:7<<<<7<:;;:: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:44:153:969 83 chr2 447 95 35M = 245 -237 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<5<:7<72<51<7<*79<<<<<5<<<<<<<<<2< MF:i:18 Aq:i:36 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:4:215:246:640 99 chr2 447 99 36M = 624 213 AAAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<9<;<<<<<<<<<<9;<<<<<<3;<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:7:94:1655:1921 147 chr2 447 85 35M = 258 -224 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<;:===<==;<==<;================; MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:6:60:1037:1146 147 chr2 447 99 35M = 250 -232 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_65:1:23:536:229 99 chr2 448 99 35M = 614 201 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<:<8<:<<;<<<<<<7<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:130:865:838 163 chr2 448 99 35M = 649 236 
AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;:<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:239:1001:406 99 chr2 450 99 35M = 634 219 AGCATACAGTCATCTATAAAGGAAATCCCATCAGA <<<<<<7<<<<<<<<8<;<<<7<<<<36<<3<:33 MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:3:147:423:584 147 chr2 451 99 35M = 277 -209 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA 27<;<3<<<+<<;<<<;;-4<<<<<;<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:1:187:715:521 83 chr2 451 99 35M = 291 -195 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <7<:<9<<<<<<<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:3:172:196:746 99 chr2 451 99 35M = 620 204 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <<<<<<<<9<<<<9<<<<<<<<<;<<<<6<<<<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:3:267:821:860 83 chr2 451 99 34M = 296 -189 GCATACAGTCATCTATAAAGGAAATCCCATCAGA $&<<<.<:;6<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 ++-EAS56_61:7:7:682:201 83 chr2 452 99 35M = 288 -199 CATACAGTCATCTATAAAGGAAATCCCATCAGAAT 0:8;5<8<1:78<<<<<<<<<<<<:8<<2<<<<:< MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_589:3:82:13:897 163 chr2 453 99 35M = 606 188 ATACAGTCATCTATAAAGGAAATCCCAGCAGAATA <<<<;<<<<<<;<;<;5<51;<1<<<<%<<<<,58 MF:i:18 Aq:i:41 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 ++-EAS56_53:6:180:695:621 99 chr2 453 99 35M = 637 219 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;::<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:158:909:321 83 chr2 453 99 35M = 271 -217 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_26:2:237:497:165 99 chr2 454 99 35M = 619 200 TACAGTCATCTATAAAGGAAATCCCATCAGAATAA 8===<8===========37=<===7=;7=8===== MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:2:152:355:962 83 chr2 456 99 35M = 269 -222 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<.9.<;+;<;<<<<<<<<<<::<<:<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 
++-EAS192_3:4:255:549:422 83 chr2 456 99 35M = 295 -196 AAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<;;+<;4;<<<<<<<<<<<;<;<<;<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:2 ++-EAS220_1:4:100:20:1199 163 chr2 456 99 35M = 614 193 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA 7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<4<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:71:408:741 163 chr2 457 99 35M = 637 215 AGTCATCTATAAAGGAAATCCCATCAGAATAACAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:5:285:395:450 147 chr2 458 99 35M = 269 -224 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT 8)3<8+;<)<<<<<<<<97:7<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:4:854:140 99 chr2 458 72 35M = 638 215 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<6<<<:<6<<<:36:<<<<3<<8:.6<38::4< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:6:227:657:95 147 chr2 458 99 35M = 280 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT ;3;<);<<<<<<<<<<<<18<<<<<<<<<<<<<<< MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:7:57:324:546 83 chr2 458 99 36M = 281 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAATG ;;5<;,<<<;;<<<<<<<97<<<<<<<<<<9<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:4:26:274:1078 83 chr2 458 99 40M = 282 -216 GTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCT 9:*:64<<;<<<<<<<<<;8;<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:107:636:642 163 chr2 458 99 35M = 630 207 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_81:7:226:869:36 147 chr2 461 99 35M = 273 -223 ATATATAAAGGAAATCCCATCAGAATAACAATGGG <0/)&<=,==4>4=>>= MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:8:82:1540:77 163 chr2 619 99 35M = 786 202 GAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGC <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:3:172:196:746 147 chr2 620 99 35M = 451 -204 AAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCT 
<<<;><<+<<<<:<<<<2<;<<<;<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:7:97:892:419 163 chr2 621 99 35M = 800 214 AATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:4:312:915:751 147 chr2 621 99 35M = 461 -195 AATAAAGTCAAGTCTTTCCTGACAAGCAAAAGCTA <:-<<<99:::);:7<4;8<<<<<<<;<2<+8<;< MF:i:18 Aq:i:41 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 ++-EAS1_93:4:325:352:67 163 chr2 622 99 35M = 794 207 ATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAA ==================<========<=<;-=== MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:4:83:731:540 99 chr2 623 99 35M = 804 216 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<;<<<<<<<<<<<<<:<7<*;&;<;;9 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:5:74:329:459 163 chr2 623 99 35M = 795 207 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<;<<;<<;9;599 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_FC30151:3:13:674:1717 83 chr2 623 99 35M = 445 -213 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:1:141:415:738 69 chr2 624 0 * = 624 0 TTACCTAGTTGCTCTGTAGTCTCAATTAATTGTTT <<<<<<<<<<<<<<<<<<<<<<<;<<<<<;<8<<< MF:i:192 ++-EAS1_105:1:141:415:738 137 chr2 624 76 35M = 624 0 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<<<<<<<<<<<<<<<<<<<<:<<;<<;<<<<6: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:1:154:118:488 83 chr2 624 99 35M = 447 -212 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<;58<<95:<<;<;<<<;<<<;;<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:5:198:929:684 83 chr2 624 99 35M = 471 -188 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<;<<<<<:<<<<<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:4:215:246:640 147 chr2 624 99 36M = 447 -213 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<<,<<<96<<:<:<9<6<97<<<<<9<<<<9<<9< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:2:234:167:381 83 chr2 625 99 35M = 443 -217 
AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT <<;<;<<<<;<<<<7<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:7:96:489:453 147 chr2 625 99 35M = 445 -215 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<;;;<<<<5:<<:<<<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:3:79:879:15 99 chr2 626 99 35M = 790 199 AGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<2<;<<1< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_53:2:59:286:290 147 chr2 628 99 35M = 467 -196 TCAAGTCTTTCCTGACAAGCAAATGCTAAGATAAT 77<<<<7<<<97<<,7<<<;<<<;<9<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_95:5:263:511:936 99 chr2 629 99 35M = 801 207 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<<<<<<<<<<;<<<<<;<<:<:<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_30:3:181:582:435 147 chr2 629 99 35M = 471 -193 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<;<<<<<;<<4<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:6:107:636:642 83 chr2 630 99 35M = 458 -207 AAGTCTTTCCTGACAAGCAAATGCTAAGATAATTC <<<<<<<;<<<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:4:12:273:89 83 chr2 631 99 35M = 477 -189 AGTCTTTCCTGACAAGCAAATGCTAAGATAATTCA <:737<288<<<7<<<<<<<<<:9<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_59:2:239:1001:406 147 chr2 634 99 35M = 450 -219 CTTTCCTGACAAGCAAATGCTAAGATAATTCATCA 0':.71;;:9==9=;====;=;============= MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_4:7:96:899:106 147 chr2 636 99 35M = 462 -209 TTCCTGACAAGCAAATGCTAAGATAATTCATCATC ;;;;<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_65:6:67:56:806 147 chr2 637 99 35M = 464 -208 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA 844:8;7<88;8<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:71:408:741 83 chr2 637 99 35M = 457 -215 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA ;7;<;<0<<<<<<<<:;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 ++-EAS56_53:6:180:695:621 147 chr2 637 99 35M = 453 -219 TACTGAAAAGCAAATGCTAAGATAATTCATCATCA ;&377<&<<;7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:2 UQ:i:10 H0:i:1 H1:i:0 ++-EAS114_30:6:49:656:507 147 chr2 637 99 35M = 468 -204 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA %44;;<:<<;<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:3:4:854:140 147 chr2 638 72 35M = 458 -215 CCTGACAAGCAAATGCTAAGATAATTCATCATCAC :9':<;<<<;<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:85:1521:58 99 chr2 639 99 40M = 813 214 CTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<9<<<<<<<<<<<<<7<<<<<<<<<<<<;;:7: MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:2:57:1064:925 137 chr2 640 76 35M * 0 0 TGACAAGCAAATGCTAAGATAATTCATCATCACTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:3:323:196:855 163 chr2 642 99 35M = 809 202 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<7<<<<<<:<<<<<<<<<<<<<<<<<;7: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:117:33:262 163 chr2 642 99 35M = 814 207 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<; MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:1:59:742:549 99 chr2 642 99 35M = 816 209 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:55:562:403 163 chr2 643 99 36M = 825 218 CAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<;<<;<<<<<<<;<;: MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:7:97:743:602 163 chr2 644 99 35M = 821 211 AAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:2:167:905:852 83 chr2 647 99 36M = 445 -238 CAAATGCTAAGATAATTCATCATCACTAAACCAGTC +<<<9;7;<<+<<<<<39<;9<;9<<7<<<<<<<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS112_34:6:130:865:838 83 chr2 649 99 35M = 448 -236 
AATGCTAAGATAATTCATCATCACTAAACCAGTCC ;<:84<<<4<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:60:1420:660 163 chr2 649 99 35M = 808 194 AATGCTAAGATAATTCATCATCACTAAACCAGTCC <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<8<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:3:180:89:582 99 chr2 650 99 36M = 809 195 ATGCTAAGATAATTCATCATCACTAAACCAGTCCTA <<<<<<<<<7<<<<<<<<<7<<<:<<<:<<::77:< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_99:1:86:871:319 147 chr2 651 71 35M = 494 -192 TGCTAAGATAATTCATCATCACTAAACCAGTCCTA 7;+1;<:<<<<<<<<;<<;<<9<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:2:236:841:20 83 chr2 652 99 35M = 467 -220 GCTAAGATAATTCATCATCACTAAACCAGTCCTAT 7;<<<;<<<<;;<<<<<<<<<<<<<<<<<<<<;<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_62:2:133:8:379 83 chr2 653 99 35M = 470 -218 ATAAGATAATTCATCATCACTAAACCAGTCCTATA &=========='==7==0=2====28===00==== MF:i:18 Aq:i:70 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 ++-EAS1_105:8:96:720:940 83 chr2 654 99 35M = 467 -222 TAAGATAATTCATCATCACTAAACCAGTCCTATAA *<<<<;<<<9<<;,<;0<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:5:71:994:576 99 chr2 655 99 35M = 805 185 AAGATAATTCATCATCACTAAACCAGTCCTATAAG <<<<<<<<<<<<<<<<<<<<<<<;<<5<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_103:4:164:79:134 147 chr2 656 99 35M = 488 -203 AGATAATTCATCATCACTAAACCAGTCCTATAAGA <;<;<<<;<<;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:6:78:1029:512 83 chr2 656 99 40M = 500 -196 AGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGC ;;;;;<;;<<<.<<6;<<;<;8<<<<::<<<<<<<<;<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_93:1:214:784:690 147 chr2 657 99 35M = 472 -220 GATAATTCATCATCACTAAACCAGTCCTATAAGAA -<7<<7<:<<2<<<<;<<<<<;<<<<3<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-EAS220_1:4:6:1178:1105 99 chr2 657 93 35M = 830 208 GATAATTCATCATCACTAAACCAGTCCTATAAGAA <<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 ++-EAS1_99:7:171:196:287 83 chr2 658 99 35M = 485 -208 ATAATTCATCATCACTAAACCAGTCCTATAAGAAA <;;;98;<;&<;;<<<<<<<;<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_28:1:220:801:282 99 chr2 660 99 36M = 837 213 AATTCATCATCACTAAACCAGTCCTATAAGAAATGC <<<<<<<<<<<<<<<<<<<<<;<+<;<<<<<::<<: MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-EAS221_1:2:73:955:728 163 chr2 660 44 35M = 823 198 AATTCATCATCACTAAACCAGTCCTATAAGAAATG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:14 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-EAS1_105:1:3:903:957 147 chr2 661 99 35M = 516 -180 ATTCATCATCACTAAACCAGTCCTATAAGAAATGC <%12<&<<<;<:<<<<<<<<<7<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 ++-EAS56_65:2:224:579:433 83 chr2 662 99 35M = 485 -212 TTCATCATCACTAAACCAGTCCTATAAGAAATGCT '<08/8<+<>===> MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_105:2:146:374:692 99 chr2 690 99 35M = 874 219 AAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAA <<<<<<<<<<<<<<<=>>>==>>===>==> MF:i:130 Aq:i:74 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS1_108:6:159:493:275 99 chr2 760 72 35M = 939 214 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA =====3============================= MF:i:130 Aq:i:72 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS139_11:1:81:1019:558 163 chr2 760 77 35M = 926 201 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA <<<<<<<<<<<6<<<<<<<<<<<<<<<<<<<<<7< MF:i:130 Aq:i:77 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS51_62:7:162:195:761 163 chr2 767 30 18M4I13M = 922 190 TCACAGGTTTTATAAAACAATTAATTGAGACTACA <<<<<<<<<<<<<<<<<<<<<<<<<<;<:<<<<;; MF:i:130 Aq:i:30 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-B7_597:3:115:646:430 147 chr2 768 45 17M4I14M = 582 -217 CACAGGTTTTATAAAACAATTAATTGAGACTACAG 5;5<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:45 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS114_30:6:243:209:110 163 chr2 768 48 17M4I14M = 920 187 CACAGGTTTTATAAAACAATTAATTGAGACTACAG <<<<<;<;<<<;<<<<<<<<<<<;<:;<<:;;+85 MF:i:130 Aq:i:48 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 ++-EAS1_108:2:266:994:429 147 chr2 769 76 16M4I15M = 612 -188 ACAGGTTTTATAAAACAATTAATTGAGACTACAGA 
<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:1:85:1521:58 147 chr2 813 99 40M = 639 -214 AAATTAACATTACAACAGGAACAAAACCTCATATATCAAT :::86<<:<<8<<<<;<<8<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS188_7:4:164:719:947 99 chr2 813 99 35M = 1005 227 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<;<<<<<<<<<;<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:1:50:257:341 163 chr2 813 99 35M = 971 193 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<7<6<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_67:5:117:33:262 83 chr2 814 99 35M = 642 -207 AATTAACATTACAACAGGAACAAAACCTCATATAT <<;;<<;<:8<7<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:8:90:706:1276 163 chr2 814 99 35M = 980 201 AATTAACATTACAACAGGAACAAAACCTCATATAT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<:<:< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:116:966:193 163 chr2 815 99 35M = 967 187 ATTAACATTACAACAGGAACAAAACCTCATATATC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:1:59:742:549 147 chr2 816 99 35M = 642 -209 TTAACATTACAACAGGAACAAAACCTCATATATCA -<<<3<<<<6<<6<<<<<6<<<<6<<<<<<<<<<< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:7:74:213:877 99 chr2 816 99 35M = 996 215 TTAACATTACAACAGGAACAAAACCTCATATATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 ++-B7_610:3:85:219:371 163 chr2 817 99 35M = 967 185 TAACATTACAACAGGAACAAAACCTCATATATCAA <<<<<<<<<<<<<<<<<<<<<:<<<<<<<<<<;<; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:2:176:653:957 163 chr2 819 82 35M = 982 198 ACATTACAACAGGAACAAAACCTCATATATCAATA ????????????<==>=>=>=>>>==>>>=>>> MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_19:5:57:366:844 83 chr2 877 99 40M = 708 -209 AAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGAT ;;;7:8&555<,;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_32:3:236:475:254 163 
chr2 880 99 35M = 1051 206 TTCCCCCACTTAAGAGATATAGATTGGCAGAACAG <<<<<<<<<<<<<<<<<<<<<<<<<:::<:;>=>>>>==>=>>>==>=>=:=====;=:=6:::6 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS218_1:2:10:686:1024 163 chr2 947 99 35M = 1103 191 ACAAGAAACTCATTAATAAAGACATGAGTTCAGGT <:<<<<:<<<<<<<<<<:<:<<<<<<<<<<<5<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_73:5:53:61:31 163 chr2 949 99 35M = 1122 208 AAGAAACTCATTAATAAAGACATGAGTTCAGATAA <<<7;<7<<<;7<;;<7<7<7<;5<73<<<;>588>9<>7:<0<9; MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-B7_589:2:30:644:942 99 chr2 1045 83 35M = 1229 219 TATATCAGATAAAGCACACTTTAAATCAACAACAG <<<<<<<<<<<<<<<<<<<<<<<9<<<<<<9;<9< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 ++-B7_591:2:123:924:645 83 chr2 1045 84 36M = 861 -220 TATATCAGATAAAGCACACTTTAAATCAACAACAGT ;<<<<*<<<<<<<<8<<<<<><<<<<><<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS51_62:4:308:614:911 99 chr2 1319 90 35M = 1493 209 TGCGCTTGTACTTCTAAATCTATAACAAAATTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<8< MF:i:18 Aq:i:43 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS54_65:3:155:541:234 83 chr2 1319 99 35M = 1151 -203 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 78;<7<<<<<<<<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:175:289:351 147 chr2 1319 99 35M = 1144 -210 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 9;;:+<<<<<;<<:<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_593:7:283:186:707 83 chr2 1321 99 36M = 1154 -203 CGCTTGTACTTCTAAATCTATAACAAAATTAAAATT 889;<7;<7<<7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_105:3:308:66:538 147 chr2 1321 99 35M = 1138 -218 CGCTTGTACTTCTAAATCTATAACAAAATTAAAAT 996999;<9;<:<<<<<:<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_108:5:11:555:330 163 chr2 1321 99 35M = 1492 206 CGCTTGTACTTCTAAATCTATAAAAAAATTAAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<4<;< MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS51_66:7:84:411:336 73 chr2 1322 75 35M * 0 0 
GCTTGTACTTCTAAATCTATAAAAAAATTAAAATT <<<;<<<;<<<<<<<<<<<<:<<;<<<<<<;8<;< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS139_11:5:52:1278:1478 163 chr2 1322 47 35M = 1513 226 GCTTGTACTTCTAAATCTATAACAAAATTAAAATT <<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<9<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS56_53:3:101:809:776 147 chr2 1326 99 35M = 1160 -201 GTACTTCTAAATCTATAAAAAAATTAAAATTTAAC <<<-<;7;<<<<:;<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS192_3:3:221:881:916 147 chr2 1327 96 35M = 1168 -194 TAATTCTAAATCTAGAACAAAATTAAAATTTAACA 44%-4(5<;9/,:<68:1<:8<:<<84;<<<<<;< MF:i:18 Aq:i:24 NM:i:3 UQ:i:41 H0:i:0 H1:i:0 ++-EAS1_105:1:28:745:352 147 chr2 1329 99 35M = 1159 -205 CTTCTAAATCTATAACAAAATTAAAATTTAACAAA 4;;*;<<<;;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS114_45:2:23:1754:796 99 chr2 1329 99 35M = 1488 194 CTTCTAAATCTATAAAAAAATTAAAATTTAACAAA ;<<;<;<;<;<;<<;;;;;<<<<;;<<<<<97999 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:2:96:419:327 147 chr2 1331 99 35M = 1149 -217 TCTAAATCTATAACAAAATTAAAATTTAACAAAAG ;1<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 ++-EAS1_97:4:274:287:423 163 chr2 1332 75 35M = 1515 218 CTAAATCTATAAAAAAATTAAAATTTAACAAAAGT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS219_1:7:35:392:2042 83 chr2 1332 99 35M = 1168 -199 ATAAATCTATAAAAAAATTAAAATTTAACAAAAGT +<<<<4<>>>>;>>&>->9>9;4>->>>>,4>9>,<1> MF:i:18 Aq:i:27 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 ++-EAS139_19:2:82:154:1333 99 chr2 1349 77 40M = 1511 202 TTAAAATTTAACAAAAGTAAATAAAACACACAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;<;;:;: MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:1 H1:i:0 ++-EAS188_7:1:290:286:763 99 chr2 1349 75 35M = 1515 201 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<7<<<<<<<<<<<<<<<8<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS221_1:4:3:248:1491 73 chr2 1349 99 35M * 0 0 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:8:< 
MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS114_39:3:6:1064:1805 99 chr2 1350 99 35M = 1502 187 TAAAATTTAACAAAAGTAAATAAAACACATAGCTA <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_595:6:137:811:130 83 chr2 1351 99 35M = 1175 -211 AAAATTTAACAAAAGTAAATAAAACACATAGCTAA <<<<<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:1:155:809:543 83 chr2 1352 99 35M = 1156 -231 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<+0<<<9<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_63:3:41:468:459 99 chr2 1352 75 35M = 1513 196 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<;;7 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_108:4:31:622:216 73 chr2 1354 99 35M * 0 0 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<96<7 MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS54_71:8:105:854:975 163 chr2 1354 71 35M = 1523 202 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<7:<;;;;5 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-B7_610:7:26:749:174 147 chr2 1357 78 35M = 1183 -209 TAACAAAAGTAAATAAAACACATAGCTAAAACTAA (<<)<<<<6<<<<<<<<<<&:<3<<<6<<<)<:<< MF:i:18 Aq:i:11 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS56_57:6:21:553:57 147 chr2 1358 99 35M = 1197 -196 AACAAAAGTAAATAAAACACATAGCTAAAACTAAA <<+<<<<<<<<<;<<<<8<<<<<<8<<<<<;<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 ++-EAS1_97:2:128:629:484 83 chr2 1359 96 35M = 1185 -209 AAAAAAGTAAATAAAACACATAGCTAAAACTAAAA :(::<<<<<<<<<< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:82 H1:i:85 ++-B7_589:6:33:356:636 73 chr2 1520 0 35M * 0 0 TTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTT <<<<<<<8;<<<<<<<<<<<<<7<<<<<<<;;3&3 MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:14 H1:i:85 ++-EAS114_45:6:86:859:1779 137 chr2 1520 0 35M * 0 0 TTTTTTTCATTTCTCTTTTTTTTTTTTTTTTTTTT ;;;;;;;;;;;;;;;;;;;;;;;;;;;8;;)7699 MF:i:32 Aq:i:0 NM:i:1 UQ:i:26 H0:i:0 H1:i:15 ++-EAS54_71:8:105:854:975 83 chr2 1523 71 33M = 1354 -202 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG 
<<<<;<:<<;<&<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:85 H1:i:85 ++-EAS51_62:4:187:907:145 153 chr2 1524 28 35M * 0 0 TTTCTTCTCTCTCTTTTTTTTTTTTTTTATTGCAT <<<+;;,6<<<<6<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:28 NM:i:3 UQ:i:59 H0:i:0 H1:i:0 ++-EAS54_71:4:284:269:882 73 chr2 1524 0 34M * 0 0 TTTCTTTTCTCTTTTTTTTTTTTTTGTTTTTGCA <;<<<<<8<7<8;<<<;<7<<<<<;272;73&&) MF:i:32 Aq:i:0 NM:i:1 UQ:i:17 H0:i:0 H1:i:85 ++-EAS56_63:4:141:9:811 137 chr2 1524 10 35M * 0 0 TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT <<<;<<<<<<<;<;<:<<<;<<<<<<<<..));;. MF:i:32 Aq:i:0 NM:i:3 UQ:i:47 H0:i:2 H1:i:27 ++-EAS114_30:6:277:397:932 73 chr2 1524 0 35M * 0 0 TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT <<<<<<<<<<<<<<<<<<<<<<<<<<<<:8(,0%( MF:i:32 Aq:i:0 NM:i:3 UQ:i:42 H0:i:2 H1:i:85 ++-EAS139_11:7:50:1229:1313 83 chr2 1528 77 35M = 1376 -187 TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA <<<<,<&<7<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:11 H0:i:3 H1:i:7 ++-EAS54_65:3:320:20:250 147 chr2 1532 77 35M = 1367 -200 TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA +'''/<<<<7:;+<;::<<<;;<<<<<<<<<<<<< MF:i:18 Aq:i:6 NM:i:2 UQ:i:24 H0:i:1 H1:i:2 ++-EAS114_26:7:37:79:581 83 chr2 1533 68 35M = 1349 -219 TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA 3,,,===6===<===<;=====-============ MF:i:18 Aq:i:27 NM:i:2 UQ:i:23 H0:i:0 H1:i:1 ++--- python-pysam.orig/tests/pysam_data/example_user_header.sam +++++ /dev/null ++@@ -1,8 +0,0 @@ ++-@HD VN:1.0 ++-@SQ SN:chr1 LN:1575 ++-@SQ SN:chr2 LN:1584 ++-@x1 A:2 B:5 ++-@x2 A:4 B:5 ++-@x3 A:6 B:5 ++-read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 ++-read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 ++--- python-pysam.orig/tests/pysam_data/Makefile +++++ python-pysam/tests/pysam_data/Makefile ++@@ -14,7 +14,6 @@ ++ $(BAM) $(BAI) \ ++ $(CRAM) $(CRAI) \ ++ example_bai.bam \ ++- rg_with_tab.bam \ ++ ex2_truncated.bam \ ++ 
empty.bam empty.bam.bai \ ++ explicit_index.bam explicit_index.cram \ ++--- python-pysam.orig/pysam/alternatives.py.obsolete +++++ python-pysam/pysam/alternatives.py.obsolete ++@@ -12,7 +12,6 @@ ++ int bam_merge(int argc, char *argv[]) ++ int bam_index(int argc, char *argv[]) ++ int bam_sort(int argc, char *argv[]) ++- int bam_tview_main(int argc, char *argv[]) ++ int bam_mating(int argc, char *argv[]) ++ int bam_rmdup(int argc, char *argv[]) ++ int bam_rmdupse(int argc, char *argv[]) ++--- python-pysam.orig/tests/AlignmentFile_test.py +++++ python-pysam/tests/AlignmentFile_test.py ++@@ -1382,19 +1382,19 @@ ++ os.unlink(tmpfilename) ++ ++ ++-class TestDeNovoConstructionUserTags(TestDeNovoConstruction): ++- ++- '''test de novo construction with a header that contains lower-case tags.''' ++- ++- header = {'HD': {'VN': '1.0'}, ++- 'SQ': [{'LN': 1575, 'SN': 'chr1'}, ++- {'LN': 1584, 'SN': 'chr2'}], ++- 'x1': {'A': 2, 'B': 5}, ++- 'x3': {'A': 6, 'B': 5}, ++- 'x2': {'A': 4, 'B': 5}} ++- ++- bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") ++- samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") +++# class TestDeNovoConstructionUserTags(TestDeNovoConstruction): +++# +++# '''test de novo construction with a header that contains lower-case tags.''' +++# +++# header = {'HD': {'VN': '1.0'}, +++# 'SQ': [{'LN': 1575, 'SN': 'chr1'}, +++# {'LN': 1584, 'SN': 'chr2'}], +++# 'x1': {'A': 2, 'B': 5}, +++# 'x3': {'A': 6, 'B': 5}, +++# 'x2': {'A': 4, 'B': 5}} +++# +++# bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") +++# samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") ++ ++ ++ class TestEmptyHeader(unittest.TestCase): ++--- python-pysam.orig/tests/samtools_test.py +++++ python-pysam/tests/samtools_test.py ++@@ -78,7 +78,7 @@ ++ # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam", ++ "sort ex1.bam -o %(out)s_ex1.sort.bam", ++ "mpileup ex1.bam > %(out)s_ex1.pileup", ++- "depth ex1.bam > %(out)s_ex1.depth", +++ #"depth ex1.bam > 
%(out)s_ex1.depth", ++ # TODO: issues with file naming ++ # "faidx ex1.fa; %(out)s_ex1.fa.fai", ++ "index ex1.bam %(out)s_ex1.bam.fai", ++@@ -100,8 +100,8 @@ ++ "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam", ++ "targetcut ex1.bam > %(out)s_ex1.targetcut", ++ "phase ex1.bam > %(out)s_ex1.phase", ++- "import ex1.fa.fai ex1.sam.gz %(out)s_ex1.bam", ++- "bam2fq ex1.bam > %(out)s_ex1.bam2fq", +++ #"view -bt ex1.fa.fai -o %(out)s_ex1.bam ex1.sam.gz", +++ #"bam2fq ex1.bam > %(out)s_ex1.bam2fq", ++ # TODO: not the same ++ # "pad2unpad -T ex1.fa ex2.bam > %(out)s_ex2.unpad", ++ # TODO: command line option problem diff --cc debian/patches/series index 0000000,0000000..6d0edd0 new file mode 100644 --- /dev/null +++ b/debian/patches/series @@@ -1,0 -1,0 +1,8 @@@ ++skip_test_remote.patch ++#spelling ++#hts1.10 ++#samtools_v1.10_full ++# samtools_v1.10 ++#bcftools_v1.10_full ++clean_less ++skip-test-on-32-bit.patch diff --cc debian/patches/skip-test-on-32-bit.patch index 0000000,0000000..3ce4e32 new file mode 100644 --- /dev/null +++ b/debian/patches/skip-test-on-32-bit.patch @@@ -1,0 -1,0 +1,29 @@@ ++--- a/tests/samtools_test.py +++++ b/tests/samtools_test.py ++@@ -20,6 +20,7 @@ ++ check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \ ++ make_data_files, BAM_DATADIR ++ +++import platform ++ ++ IS_PYTHON3 = sys.version_info[0] >= 3 ++ ++@@ -92,7 +93,7 @@ ++ "flagstat ex1.bam > %(out)s_ex1.flagstat", ++ # Fails python 3.3 on linux, passes on OsX and when ++ # run locally ++- "calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam", +++ #"calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam", ++ # use -s option, otherwise the following error in samtools 1.2: ++ # Samtools-htslib-API: bam_get_library() not yet implemented ++ # causes downstream problems ++@@ -118,6 +119,9 @@ ++ # ("addreplacerg -r 'RG\tID:ga\tSM:hs' ex1.bam > %(out)s_ex1.addreplacerg", ++ ] ++ +++ if platform.architecture()[0] == '64bit': +++ statements.append("calmd ex1.bam ex1.fa > %(out)s_ex1.calmd.bam") +++ ++ 
map_command = { ++ } ++ diff --cc debian/patches/skip_test_remote.patch index 0000000,0000000..7148f8e new file mode 100644 --- /dev/null +++ b/debian/patches/skip_test_remote.patch @@@ -1,0 -1,0 +1,90 @@@ ++Author: Andreas Tille ++Last-Update: Tue, 11 Sep 2018 14:12:55 +0200 ++Description: Skip tests trying to access remote site ++ ++--- a/tests/AlignmentFile_test.py +++++ b/tests/AlignmentFile_test.py ++@@ -503,17 +503,6 @@ ++ "tmp_ex2.sam", ++ "rb", "wh") ++ ++- def testBAM2CRAM(self): ++- # ignore header (md5 sum) ++- self.checkEcho("ex2.bam", ++- "ex2.cram", ++- "tmp_ex2.cram", ++- "rb", "wc", ++- sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"), ++- checkf=partial( ++- check_samtools_view_equal, ++- without_header=True)) ++- ++ def testCRAM2BAM(self): ++ # ignore header (md5 sum) ++ self.checkEcho("ex2.cram", ++@@ -1638,6 +1627,7 @@ ++ reference_filename = os.path.join(BAM_DATADIR, 'ex1.fa') ++ ++ +++@unittest.skip ++ class TestRemoteFileFTP(unittest.TestCase): ++ ++ '''test remote access. 
++--- a/tests/tabix_test.py +++++ b/tests/tabix_test.py ++@@ -1017,6 +1017,7 @@ ++ globals()[n] = type(n, (TestVCFFromVariantFile,), dict(filename=vcf_file,)) ++ ++ +++@unittest.skip ++ class TestRemoteFileHTTP(unittest.TestCase): ++ ++ url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example.gtf.gz" ++@@ -1056,25 +1057,28 @@ ++ self.assertEqual(list(self.local_file.header), []) ++ ++ ++-class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): ++- ++- url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_comments.gtf.gz" ++- region = "chr1:1-1000" ++- local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") ++- ++- def setUp(self): ++- if not pysam.config.HAVE_LIBCURL or not check_url(self.url): ++- self.remote_file = None ++- else: ++- self.remote_file = pysam.TabixFile(self.url, "r") ++- self.local_file = pysam.TabixFile(self.local, "r") ++- ++- def testHeader(self): ++- if self.remote_file is None: ++- return ++- ++- self.assertEqual(list(self.local_file.header), ["# comment at start"]) ++- self.assertEqual(list(self.local_file.header), self.remote_file.header) +++#@unittest.skip # this leads to +++ # E TypeError: Error when calling the metaclass bases +++ # E function() argument 1 must be code, not str +++#class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): +++# +++# url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_comments.gtf.gz" +++# region = "chr1:1-1000" +++# local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") +++# +++# def setUp(self): +++# if not pysam.config.HAVE_LIBCURL or not check_url(self.url): +++# self.remote_file = None +++# else: +++# self.remote_file = pysam.TabixFile(self.url, "r") +++# self.local_file = pysam.TabixFile(self.local, "r") +++# +++# def testHeader(self): +++# if self.remote_file is None: +++# return +++# +++# self.assertEqual(list(self.local_file.header), ["# comment at start"]) +++# self.assertEqual(list(self.local_file.header), self.remote_file.header) ++ ++ ++ class 
TestIndexArgument(unittest.TestCase): diff --cc debian/patches/spelling index 0000000,0000000..34e82d6 new file mode 100644 --- /dev/null +++ b/debian/patches/spelling @@@ -1,0 -1,0 +1,115 @@@ ++From: Michael R. Crusoe ++Subject: Fix spelling typos, courtesy of lintian ++--- a/bcftools/filter.c +++++ b/bcftools/filter.c ++@@ -1053,7 +1053,7 @@ static void filters_set_nmissing(filter_ ++ } ++ static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++- if ( nstack==0 ) error("Error parsing the expresion\n"); +++ if ( nstack==0 ) error("Error parsing the expression\n"); ++ token_t *tok = stack[nstack - 1]; ++ if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); ++ ++--- a/bcftools/filter.c.pysam.c +++++ b/bcftools/filter.c.pysam.c ++@@ -1055,7 +1055,7 @@ static void filters_set_nmissing(filter_ ++ } ++ static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) ++ { ++- if ( nstack==0 ) error("Error parsing the expresion\n"); +++ if ( nstack==0 ) error("Error parsing the expression\n"); ++ token_t *tok = stack[nstack - 1]; ++ if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); ++ ++--- a/pysam/libcalignedsegment.pyx +++++ b/pysam/libcalignedsegment.pyx ++@@ -2242,7 +2242,7 @@ cdef class AlignedSegment: ++ *value*. ++ ++ An existing value of the same *tag* will be overwritten unless ++- *replace* is set to False. This is usually not recommened as a +++ *replace* is set to False. This is usually not recommended as a ++ tag may only appear once in the optional alignment section. ++ ++ If *value* is None, the tag will be deleted. ++--- a/pysam/libcalignmentfile.pyx +++++ b/pysam/libcalignmentfile.pyx ++@@ -1029,7 +1029,7 @@ cdef class AlignmentFile(HTSFile): ++ ++ See :meth:`~pysam.HTSFile.parse_region` for more information ++ on how genomic regions can be specified. 
:term:`reference` and ++- `end` are also accepted for backward compatiblity as synonyms +++ `end` are also accepted for backward compatibility as synonyms ++ for :term:`contig` and `stop`, respectively. ++ ++ Without a `contig` or `region` all mapped reads in the file ++@@ -1212,7 +1212,7 @@ cdef class AlignmentFile(HTSFile): ++ """perform a :term:`pileup` within a :term:`region`. The region is ++ specified by :term:`contig`, `start` and `stop` (using ++ 0-based indexing). :term:`reference` and `end` are also accepted for ++- backward compatiblity as synonyms for :term:`contig` and `stop`, +++ backward compatibility as synonyms for :term:`contig` and `stop`, ++ respectively. Alternatively, a samtools 'region' string ++ can be supplied. ++ ++@@ -1354,7 +1354,7 @@ cdef class AlignmentFile(HTSFile): ++ ++ The region is specified by :term:`contig`, `start` and `stop`. ++ :term:`reference` and `end` are also accepted for backward ++- compatiblity as synonyms for :term:`contig` and `stop`, +++ compatibility as synonyms for :term:`contig` and `stop`, ++ respectively. Alternatively, a :term:`samtools` :term:`region` ++ string can be supplied. ++ ++@@ -1458,7 +1458,7 @@ cdef class AlignmentFile(HTSFile): ++ ++ The region is specified by :term:`contig`, `start` and `stop`. ++ :term:`reference` and `end` are also accepted for backward ++- compatiblity as synonyms for :term:`contig` and `stop`, +++ compatibility as synonyms for :term:`contig` and `stop`, ++ respectively. Alternatively, a :term:`samtools` :term:`region` ++ string can be supplied. The coverage is computed per-base [ACGT]. 
++ ++--- a/pysam/libchtslib.pxd +++++ b/pysam/libchtslib.pxd ++@@ -2511,7 +2511,7 @@ cdef extern from "htslib/cram.h" nogil: ++ # 2 if the file is a stream and thus unseekable ++ # 1 if the file contains an EOF block ++ # 0 if the file does not contain an EOF block ++- # -1 if an error occured whilst reading the file or we could not seek back to where we were +++ # -1 if an error occurred whilst reading the file or we could not seek back to where we were ++ # ++ # ++ int cram_check_EOF(cram_fd *fd) ++--- a/pysam/libchtslib.pyx +++++ b/pysam/libchtslib.pyx ++@@ -585,7 +585,7 @@ cdef class HTSFile(object): ++ rval = hts_opt_apply(self.htsfile, opts) ++ if rval != 0: ++ hts_opt_free(opts) ++- raise RuntimeError('An error occured while applying the requested format options') +++ raise RuntimeError('An error occurred while applying the requested format options') ++ hts_opt_free(opts) ++ ++ def parse_region(self, contig=None, start=None, stop=None, ++@@ -595,7 +595,7 @@ cdef class HTSFile(object): ++ either be specified by :term:`contig`, `start` and ++ `stop`. `start` and `stop` denote 0-based, half-open ++ intervals. :term:`reference` and `end` are also accepted for ++- backward compatiblity as synonyms for :term:`contig` and +++ backward compatibility as synonyms for :term:`contig` and ++ `stop`, respectively. ++ ++ Alternatively, a samtools :term:`region` string can be ++--- a/pysam/libcutils.pyx +++++ b/pysam/libcutils.pyx ++@@ -179,7 +179,7 @@ cpdef parse_region(contig=None, ++ `end`. `start` and `end` denote 0-based, half-open intervals. ++ ++ :term:`reference` and `end` are also accepted for backward ++- compatiblity as synonyms for :term:`contig` and `stop`, +++ compatibility as synonyms for :term:`contig` and `stop`, ++ respectively. ++ ++ Alternatively, a samtools :term:`region` string can be supplied. 
diff --cc debian/python-pysam-tests.README.Debian index 0000000,0000000..0c7ed43 new file mode 100644 --- /dev/null +++ b/debian/python-pysam-tests.README.Debian @@@ -1,0 -1,0 +1,8 @@@ ++Pysam for Debian ++================ ++ ++To verify whether your python-pysam and python3-pysam modules are working ++correctly you can run the test suite manually by running the scripts ++run-nose-tests and run-nose3-tests in this directory. ++ ++ -- Jorge Soares Fri, 28 Nov 2014 14:29:40 +0100 diff --cc debian/python-pysam-tests.install index 0000000,0000000..a99a578 new file mode 100644 --- /dev/null +++ b/debian/python-pysam-tests.install @@@ -1,0 -1,0 +1,1 @@@ ++tests usr/share/doc/python-pysam diff --cc debian/python-pysam-tests.lintian-overrides index 0000000,0000000..1dfb2c6 new file mode 100644 --- /dev/null +++ b/debian/python-pysam-tests.lintian-overrides @@@ -1,0 -1,0 +1,5 @@@ ++# These files are intentionally empty to be used for testing ++zero-byte-file-in-doc-directory usr/share/doc/python-pysam/tests/cbcf_data/example_empty.vcf ++zero-byte-file-in-doc-directory usr/share/doc/python-pysam/tests/tabix_data/empty.bed.gz ++# The duplicate is used for testing and needs to be there ++duplicated-compressed-file usr/share/doc/python-pysam/tests/tabix_data/example.bed.gz diff --cc debian/python-pysam.links index 0000000,0000000..8cc3b2c new file mode 100755 --- /dev/null +++ b/debian/python-pysam.links @@@ -1,0 -1,0 +1,4 @@@ ++#!/usr/bin/dh-exec ++/usr/lib/python2.7/dist-packages/pysam/libcsamtools.${DEB_HOST_GNU_TYPE}.so /usr/lib/python2.7/dist-packages/pysam/libcsamtools.so ++/usr/lib/python2.7/dist-packages/pysam/libcbcftools.${DEB_HOST_GNU_TYPE}.so /usr/lib/python2.7/dist-packages/pysam/libcbcftools.so ++/usr/lib/python2.7/dist-packages/pysam/libchtslib.${DEB_HOST_GNU_TYPE}.so /usr/lib/python2.7/dist-packages/pysam/libchtslib.so diff --cc debian/python-pysam.lintian-overrides index 0000000,0000000..785680d new file mode 100644 --- /dev/null +++ 
b/debian/python-pysam.lintian-overrides @@@ -1,0 -1,0 +1,4 @@@ ++# These are apparently false-positives ++# Inspection of the build logs shows that these are indeed compiled with ++# -D_FORTIFY_SOURCE=2 ++hardening-no-fortify-functions usr/lib/python2.7/dist-packages/pysam/lib*.so diff --cc debian/python3-pysam.lintian-overrides index 0000000,0000000..288316d new file mode 100644 --- /dev/null +++ b/debian/python3-pysam.lintian-overrides @@@ -1,0 -1,0 +1,4 @@@ ++# These are apparently false-positives ++# Inspection of the build logs shows that these are indeed compiled with ++# -D_FORTIFY_SOURCE=2 ++hardening-no-fortify-functions usr/lib/python3/dist-packages/pysam/lib*.so diff --cc debian/rules index 0000000,0000000..306e730 new file mode 100755 --- /dev/null +++ b/debian/rules @@@ -1,0 -1,0 +1,73 @@@ ++#!/usr/bin/make -f ++ ++include /usr/share/dpkg/default.mk ++ ++export PYBUILD_NAME=pysam ++export LC_ALL = C.UTF-8 ++ ++export DEB_BUILD_MAINT_OPTIONS = hardening=+all ++export DEB_LDFLAGS_MAINT_APPEND=-Wl,--as-needed ++ ++TESTPKG := $(DEB_SOURCE)-tests ++ ++export HTSLIB_MODE=external ++HTSLIBDIR := /usr/lib/$(DEB_HOST_MULTIARCH) ++export HTSLIB_LIBRARY_DIR=$(HTSLIBDIR) ++export HTSLIB_INCLUDE_DIR=/usr/include ++ ++# unfortunately this does not work - any hint to fix this would be really welcome ++#export PYBUILD_TEST_ARGS_python3=-k-XTestRemoteFileHTTP -k-XTestRemoteFileHTTPWithHeader ++ ++clean: clean-tests ++%: ++ dh $@ --with python3 --buildsystem=pybuild ++ ++override_dh_install: clean-tests ++ dh_install -Xtest.gtf.gz ++ find debian -name log.txt -delete ++ ++ifeq (,$(findstring nocheck, $(DEB_BUILD_OPTIONS))) ++override_dh_auto_test: pysam_data.all cbcf_data.all ++ dh_auto_test ++else ++override_dh_auto_test: ++endif ++ ++override_dh_auto_clean: ++ dh_auto_clean ++ $(RM) \ ++ pysam/config.py \ ++ pysam/config.h \ ++ pysam/lib*.c \ ++ samtools/config.h \ ++ bcftools/config.h ++ rm -rf pysam.egg-info ++ ++.PHONY: pysam_data.% cbcf_data.% ++cbcf_data.%: ++ cd 
tests/$(basename $@) && $(MAKE) $* ++pysam_data.%: ++ cd tests/$(basename $@) && $(MAKE) $* ++ ++.PHONY: clean-tests ++clean-tests: pysam_data.clean cbcf_data.clean ++ find . -name "*.pyc" -delete ++ find . -name "*.pyxbldc" -delete ++ find . -name "tmp_*.bam" -delete ++ find . -name "*.bai*" -delete ++ find . -name "*.cram*" -delete ++ find . -name "pysam_uncompressed.bam" -delete ++ rm -rf tests/pysam_test_work \ ++ tests/example_htslib.gtf.gz.tbi \ ++ tests/log.txt \ ++ tests/test.bam \ ++ tests/_compile_test.c \ ++ tests/pysam_ex2.sam \ ++ tests/__pycache__ \ ++ pysam/calignmentfile.c \ ++ pysam/cbcf.c \ ++ tests/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai \ ++ tests/pysam_data/ex1.fa.gz \ ++ tests/pysam_data/ex1.fa.gz.gzi \ ++ tests/pysam_data/ex1_csi.bam.csi ++ rm -rf .pytest_cache/ diff --cc debian/salsa-ci.yml index 0000000,0000000..33c3a64 new file mode 100644 --- /dev/null +++ b/debian/salsa-ci.yml @@@ -1,0 -1,0 +1,4 @@@ ++--- ++include: ++ - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml ++ - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff --cc debian/source/format index 0000000,0000000..163aaf8 new file mode 100644 --- /dev/null +++ b/debian/source/format @@@ -1,0 -1,0 +1,1 @@@ ++3.0 (quilt) diff --cc debian/tests/control index 0000000,0000000..6dc4058 new file mode 100644 --- /dev/null +++ b/debian/tests/control @@@ -1,0 -1,0 +1,3 @@@ ++Test-Command: export HOME=$PWD && make -C tests/pysam_data && make -C tests/cbcf_data && pytest-3 ++Depends: @builddeps@, python3-pysam, ++Restrictions: allow-stderr, rw-build-tree diff --cc debian/upstream/metadata index 0000000,0000000..95241be new file mode 100644 --- /dev/null +++ b/debian/upstream/metadata @@@ -1,0 -1,0 +1,13 @@@ ++Bug-Submit: https://github.com/pysam-developers/pysam/issues/new ++Registry: ++- Name: OMICtools ++ Entry: OMICS_19073 ++- Name: SciCrunch ++ Entry: NA ++- Name: bio.tools ++ Entry: pysam ++- Name: conda:bioconda ++ Entry: 
pysam ++Repository: https://github.com/pysam-developers/pysam.git ++Bug-Database: https://github.com/pysam-developers/pysam/issues ++Repository-Browse: https://github.com/pysam-developers/pysam diff --cc debian/watch index 0000000,0000000..c714a54 new file mode 100644 --- /dev/null +++ b/debian/watch @@@ -1,0 -1,0 +1,5 @@@ ++version=4 ++ ++opts="repacksuffix=+ds,dversionmangle=s/\+ds//g,filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%python-pysam-$1.tar.gz%" \ ++ https://github.com/pysam-developers/pysam/tags \ ++ (?:.*/)?v?(\d[\d\.]*)\.tar\.gz