--- /dev/null
+name: CI
+
+# on: [push, pull_request]
+on: [pull_request]
+
+jobs:
+  # Build pysam directly from the checkout and run the test suite against
+  # the in-tree build, on each OS / Python combination in the matrix.
+  direct:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu, macos]
+        # Versions are quoted so YAML keeps them as strings; as unquoted
+        # floats a future 3.10 would be read as 3.1.
+        python-version: ['2.7', '3.6', '3.7', '3.8', '3.9']
+        exclude:
+          # Run only the latest 2.x and 3.x on macOS
+          - os: macos
+            python-version: '3.6'
+          - os: macos
+            python-version: '3.7'
+          - os: macos
+            python-version: '3.8'
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install prerequisite Python libraries
+        run: pip install cython pytest pytest-pep8
+
+      - name: Install build prerequisites
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+      - name: Build (directly from checkout)
+        run: python setup.py build
+
+      - name: Install test prerequisites
+        run: |
+          case $RUNNER_OS in
+          Linux)
+            sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix
+            ;;
+          macOS)
+            brew install -q samtools bcftools
+            ;;
+          esac
+
+      - name: Run tests
+        run: |
+          # Point Python at the in-tree build output rather than an installed package.
+          export PYTHONPATH=$(echo $GITHUB_WORKSPACE/build/lib.*)
+          export REF_PATH=':'
+          pytest
+
+
+ # Build a source distribution, install pysam from that tarball, and run the
+ # test suite against the installed package.
+ sdist:
+ runs-on: ${{ matrix.os }}-latest
+ strategy:
+ matrix:
+ os: [ubuntu, macos]
+ python-version: [3.9]
+
+ steps:
+ - name: Checkout pysam
+ uses: actions/checkout@v2
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install prerequisite Python libraries
+ run: pip install cython pytest pytest-pep8
+
+ - name: Install build prerequisites
+ if: runner.os == 'Linux'
+ run: |
+ sudo apt-get update
+ sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+ - name: Create source distribution
+ run: python setup.py sdist --owner=root --group=root
+
+ # Runs inside dist/ (see working-directory) so the glob matches the tarball
+ # created by the previous step; --no-binary=':all:' tells pip to build from
+ # source instead of using a wheel.
+ - name: Build (via sdist tarball)
+ run: pip install --verbose --no-deps --no-binary=':all:' pysam-*.tar.gz
+ working-directory: dist
+
+ - name: Install test prerequisites
+ run: |
+ case $RUNNER_OS in
+ Linux)
+ sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix
+ ;;
+ macOS)
+ brew install -q samtools bcftools
+ ;;
+ esac
+
+ - name: Run tests
+ run: REF_PATH=':' pytest
+
+ # Only the Linux runner uploads the tarball; the artifact is kept for 14 days.
+ - name: Upload sdist tarball
+ if: runner.os == 'Linux'
+ uses: actions/upload-artifact@v2
+ with:
+ name: sdist
+ path: dist/pysam-*.tar.gz
+ retention-days: 14
+
+
+ # Build and test pysam inside a Conda environment named "testenv", with
+ # samtools/bcftools/htslib taken from Conda packages.
+ conda:
+ timeout-minutes: 20
+ runs-on: ${{ matrix.os }}-latest
+ strategy:
+ matrix:
+ os: [ubuntu]
+ python-version: [3.7]
+ defaults:
+ run:
+ shell: bash -l {0} # needed for conda activation
+ env:
+ # Passed on to the bundled htslib configure script; here it turns libcurl off.
+ HTSLIB_CONFIGURE_OPTIONS: "--disable-libcurl"
+
+ steps:
+ - name: Checkout pysam
+ uses: actions/checkout@v2
+
+ - uses: conda-incubator/setup-miniconda@v2
+ with:
+ channel-priority: strict
+ activate-environment: testenv
+ auto-activate-base: false
+ use-only-tar-bz2: true
+
+ - name: Set up Conda and Python ${{ matrix.python-version }}
+ run: |
+ conda config --add channels bioconda --add channels conda-forge
+ conda install python=${{ matrix.python-version }} cython
+
+ - name: Build (directly from checkout)
+ run: python setup.py install
+
+ # The version specs are quoted so the shell does not treat ">=" as a redirection.
+ - name: Install test prerequisites via Conda
+ run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest
+
+ - name: Run tests
+ run: REF_PATH=':' pytest
--- /dev/null
+name: Publish pysam wheels to PyPI and TestPyPI
+
+on:
+ push:
+ branches:
+ - v[0-9]+.[0-9]+.x
+ tags:
+ - v*
+ release:
+ types:
+ - published
+
+jobs:
+  # Build binary wheels with cibuildwheel on each runner image and upload
+  # them as a workflow artifact.
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, macos-10.15]  # windows-2019,
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      # This job's matrix has no python-version entry, so the host
+      # interpreter version is spelled out instead of interpolated.
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+
+      - name: Install prerequisite Python libraries
+        run: |
+          python -m pip install --upgrade pip
+          pip install cython pytest pytest-pep8
+
+      - name: Build wheels for linux
+        if: runner.os == 'Linux'
+        uses: pypa/cibuildwheel@v2.1.2
+        env:
+          CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+          CIBW_BEFORE_BUILD: yum install -y libcurl-devel zlib-devel bzip2-devel xz-devel && pip install cython
+          CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
+          CIBW_MANYLINUX_I686_IMAGE: manylinux1
+
+      - name: Build wheels for macos
+        if: runner.os != 'Linux'
+        uses: pypa/cibuildwheel@v2.1.2
+        env:
+          CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+          CIBW_BEFORE_BUILD: pip install cython
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+
+ # Create the source distribution and upload it as a workflow artifact.
+ # No artifact name is given, so the upload action's default name is used.
+ # NOTE(review): the matrix runs this on both ubuntu and macos, so a tarball
+ # with the same file name is uploaded twice -- confirm this is intended.
+ build_sdist:
+
+ runs-on: ${{ matrix.os }}-latest
+ strategy:
+ matrix:
+ os: [ubuntu, macos]
+ python-version: [3.9]
+
+ steps:
+ - name: Checkout pysam
+ uses: actions/checkout@v2
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install prerequisite Python libraries
+ run: pip install cython pytest pytest-pep8
+
+ - name: Install build prerequisites
+ if: runner.os == 'Linux'
+ run: |
+ sudo apt-get update
+ sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+ - name: Create source distribution
+ run: python setup.py sdist
+
+ - uses: actions/upload-artifact@v2
+ with:
+ path: dist/*.tar.gz
+
+  # Collect the wheels and sdist built by the jobs above and publish them:
+  # tag pushes go to TestPyPI, published GitHub releases go to PyPI.
+  upload_pypi:
+
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Get Artifacts
+        uses: actions/download-artifact@v2
+        with:
+          name: artifact
+          path: dist
+
+      # The publish action is pinned to its maintained release branch rather
+      # than the mutable, retired "master" ref, since it receives the upload
+      # token.
+      - name: Publish distribution to Test PyPI
+        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          repository_url: https://test.pypi.org/legacy/
+
+      - name: Publish distribution to PyPI
+        if: github.event_name == 'release' && github.event.action == 'published'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
+
tests/tabix_data
samtools/config.h
+samtools/samtools_config_vars.h
bcftools/config.h
htslib/config.status
htslib/config.h
htslib/config.log
htslib/config.mk
+htslib/config_vars.h
+htslib/htscodecs.mk
htslib/htslib.pc.tmp
htslib/htslib-uninstalled.pc
pysam/config.py
--- /dev/null
+os:
+ - linux
+ - osx
+
+language: c
+
+stages:
+ - test
+ - name: deploy
+ if: tag IS present
+
+env:
+ matrix:
+ - CONDA_PY=2.7
+ - CONDA_PY=3.6
+ - CONDA_PY=3.7
+ - CONDA_PY=3.8
+ global:
+ - PYSAM_LINKING_TEST=1
+ - TWINE_USERNAME=grepall
+ - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
+
+# Shared install/script steps for every cibuildwheel deploy job; merged into
+# the job definitions below via the cibw_common anchor.
+_cibw_common: &cibw_common
+  addons: {}
+  install:
+    # The requirement is quoted: unquoted, the shell parses ">=1.1.0" as a
+    # redirection of stdout to a file named "=1.1.0" and the version bound
+    # is silently dropped.
+    - python3 -m pip install 'cibuildwheel>=1.1.0' twine
+  script:
+    - set -e
+    - cibuildwheel --output-dir dist
+    - twine check dist/*
+    - twine upload --skip-existing dist/*
+
+_cibw_linux: &cibw_linux
+ stage: deploy
+ os: linux
+ language: python
+ python: '3.5'
+ services:
+ - docker
+ <<: *cibw_common
+
+_cibw_linux_aarch64: &cibw_linux_aarch64
+ stage: deploy
+ os: linux
+ arch: arm64
+ language: python
+ python: '3.9'
+ services:
+ - docker
+ <<: *cibw_common
+
+matrix:
+ include:
+ - stage: deploy
+ os: linux
+ language: python
+ python: '3.5'
+ addons:
+ apt:
+ packages:
+ - gcc
+ - g++
+ - libcurl4-openssl-dev # for libcurl support in sdist
+ - libssl-dev # for s3 support in sdist
+ install:
+ - python3 -m pip install Cython twine
+ script:
+ - set -e
+ - python3 setup.py build_ext --inplace
+ - python3 setup.py sdist
+ - twine check dist/*
+ - twine upload --skip-existing dist/*
+ - <<: *cibw_linux
+ env:
+ - CIBW_BUILD="*_x86_64"
+ - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+ - CIBW_TEST_COMMAND='python -c "import pysam"'
+ - <<: *cibw_linux
+ env:
+ - CIBW_BUILD="*_i686"
+ - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+ - CIBW_TEST_COMMAND='python -c "import pysam"'
+ - <<: *cibw_linux_aarch64
+ env:
+ - CIBW_BUILD="*_aarch64"
+ - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+ - CIBW_TEST_COMMAND='python -c "import pysam"'
+ - stage: deploy
+ os: osx
+ language: generic
+ env:
+ - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ - CIBW_TEST_COMMAND='python -c "import pysam"'
+ <<: *cibw_common
+
+addons:
+ apt:
+ packages:
+ - gcc
+ - g++
+
+script:
+ - ./devtools/run_tests_travis.sh
+
+notifications:
+ email:
+ - andreas.heger@gmail.com
+++ /dev/null
-os:
- - linux
- - osx
-
-language: c
-
-stages:
- - test
- - name: deploy
- if: tag IS present
-
-env:
- matrix:
- - CONDA_PY=2.7
- - CONDA_PY=3.6
- - CONDA_PY=3.7
- - CONDA_PY=3.8
- global:
- - PYSAM_LINKING_TEST=1
- - TWINE_USERNAME=grepall
- - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
-
-_cibw_common: &cibw_common
- addons: {}
- install:
- - python3 -m pip install cibuildwheel>=1.1.0 twine
- script:
- - set -e
- - cibuildwheel --output-dir dist
- - twine check dist/*
- - twine upload --skip-existing dist/*
-
-_cibw_linux: &cibw_linux
- stage: deploy
- os: linux
- language: python
- python: '3.5'
- services:
- - docker
- <<: *cibw_common
-
-matrix:
- include:
- - stage: deploy
- os: linux
- language: python
- python: '3.5'
- addons:
- apt:
- packages:
- - gcc
- - g++
- - libcurl4-openssl-dev # for libcurl support in sdist
- - libssl-dev # for s3 support in sdist
- install:
- - python3 -m pip install Cython twine
- script:
- - set -e
- - python3 setup.py build_ext --inplace
- - python3 setup.py sdist
- - twine check dist/*
- - twine upload --skip-existing dist/*
- - <<: *cibw_linux
- env:
- - CIBW_BUILD="*_x86_64"
- - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- - <<: *cibw_linux
- env:
- - CIBW_BUILD="*_i686"
- - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- - stage: deploy
- os: osx
- language: generic
- env:
- - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
- - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
- - CIBW_TEST_COMMAND='python -c "import pysam"'
- <<: *cibw_common
-
-addons:
- apt:
- packages:
- - gcc
- - g++
-
-script:
- - ./devtools/run_tests_travis.sh
-
-notifications:
- email:
- - andreas.heger@gmail.com
+Many people have contributed to pysam. The list of github contributors
+is the best place to get a full list of authors and their contributions.
+The list and summary below are out of date and represent the earlier
+stages of the project.
+
List of contributors:
Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo
Goodstadt, Martin Goodson all contributed code to pysam.
+John Marshall has been looking after pysam and its community for
+several years, as well as making many code contributions and improving
+the engineering of pysam.
+
Kevin B. Jacobs implemented a Cython wrapper for the VCF/BCF
reader/writer in htslib.
dependencies (`libcurl`, `libcrypto`), it will fall back to
conservative defaults.
-Options can be passed to the configure script explicitely by
+Options can be passed to the configure script explicitly by
setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
For example::
include pysam/libc*.c
include pysam/*.c
include pysam/*.h
+exclude pysam/config.py
+
+include win32/*.[ch]
# exclude tests from pypi tar-ball - they
# require additional data
prune tests/
# samtools
-include samtools/configure
-include samtools/config.mk.in
-include samtools/config.h.in
-include samtools/*.h
-include samtools/*.c
-exclude samtools/config.h
-include samtools/*/*.h
+include samtools/LICENSE samtools/README samtools/lz4/LICENSE
+recursive-include samtools *.[ch]
+include samtools/version.sh
+exclude samtools/*config*.h
# bcftools
-include bcftools/*.h
-include bcftools/*.c
-exclude bcftools/config.h
+include bcftools/LICENSE bcftools/README
+include bcftools/*.[ch]
+include bcftools/version.sh
+exclude bcftools/*config*.h
# htslib
-include htslib/*.c
-include htslib/*.h
-include htslib/INSTALL
-include htslib/NEWS
-exclude htslib/config.h
-include htslib/Makefile
-include htslib/htslib_vars.mk
-include htslib/configure
-include htslib/config.mk.in
-include htslib/config.h.in
-include htslib/htslib.pc.in
-include htslib/htslib/*.h
-include htslib/cram/*.c
-include htslib/cram/*.h
-include htslib/os/*.c
-include htslib/os/*.h
+include htslib/LICENSE htslib/README
+recursive-include htslib *.[ch]
+exclude htslib/*config*.h
+
+include htslib/configure.ac htslib/m4/*.m4 htslib/*.in
+include htslib/configure htslib/version.sh
+include htslib/Makefile htslib/*.mk
+exclude htslib/config.mk htslib/htscodecs.mk
+
include cy_build.py
-include pysam.py
include requirements.txt
# documentation
-include doc/*
+include doc/*.py doc/*.rst
+include doc/Makefile doc/make.bat
==============
This release wraps htslib/samtools/bcftools versions 1.4.1 in response
-to a security fix in these libraries. Additionaly the following
+to a security fix in these libraries. Additionally the following
issues have been fixed:
* [#452] add GFF3 support for tabix parsers
--------
The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enchancements and bugfixes. See below for a detailed list.
+enhancements and bugfixes. See below for a detailed list.
`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
comes with additional capabilities for remote file access which depend
and code bloat.
* run configure for the builtin htslib library in order to detect
optional libraries such as libcurl. Configure behaviour can be
- controlled by setting the environmet variable
+ controlled by setting the environment variable
HTSLIB_CONFIGURE_OPTIONS.
* get_reference_sequence() now returns the reference sequence and not
something looking like it. This bug had effects on
Backwards incompatible changes
-* Empty cigarstring now returns None (intstead of '')
+* Empty cigarstring now returns None (instead of '')
* Empty cigar now returns None (instead of [])
* When using the extension classes in cython modules, AlignedRead
needs to be substituted with AlignedSegment.
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.10.2, samtools-1.10, and bcftools-1.10.2.
+The current version of pysam wraps 3rd-party code from htslib-1.13, samtools-1.13, and bcftools-1.13.
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
/* The MIT License
- Copyright (c) 2014-2015 Genome Research Ltd.
+ Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* The MIT License
- Copyright (c) 2014-2015 Genome Research Ltd.
+ Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* The MIT License
- Copyright (c) 2014-2015 Genome Research Ltd.
+ Copyright (c) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
The MIT/Expat License
-Copyright (C) 2012-2014 Genome Research Ltd.
+Copyright (C) 2012-2021 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+
+-----------------------------------------------------------------------------
+
+LICENSE for utlist.h
+
+Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
to replace the Perl-based tools from vcftools.
See INSTALL for building and installation instructions.
+
+Please cite this paper when using BCFtools for your publications:
+
+Twelve years of SAMtools and BCFtools
+Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li
+GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008
+
+@article{10.1093/gigascience/giab008,
+ author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
+ title = "{Twelve years of SAMtools and BCFtools}",
+ journal = {GigaScience},
+ volume = {10},
+ number = {2},
+ year = {2021},
+ month = {02},
+ abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods.The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines.Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\>1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
+ issn = {2047-217X},
+ doi = {10.1093/gigascience/giab008},
+ url = {https://doi.org/10.1093/gigascience/giab008},
+ note = {giab008},
+ eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
+}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2021 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <assert.h>
+#include <strings.h>
+#include <htslib/vcf.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "abuf.h"
+#include "rbuf.h"
+
+// How _split_table_set_info fills the ALT value of a Number=R INFO tag in a
+// split record. With M_SUM (integer tags only) the values of the later source
+// ALT alleles flagged 1 in the split-table row are added to the value of the
+// atom's own source allele; any other mode just copies that allele's value.
+typedef enum
+{
+ M_FIRST, M_SUM
+}
+merge_rule_t;
+
+// One atomic REF>ALT change extracted from a single ALT allele by
+// _atomize_allele. The output record for an atom is placed at rec->pos + beg.
+typedef struct
+{
+ kstring_t ref, alt;
+ int ial; // the index of the original ALT allele, 1-based
+ int beg, end; // 0-based inclusive offsets to ref,alt
+}
+atom_t;
+
+// Working state for splitting one input record into several output records.
+// Table cells are set to 0 (allele does not contribute to the row), 1 (allele
+// carries the same atom as the row) or 2 (allele overlaps the row with a
+// different atom); see _split_table_new and _split_table_overlap.
+typedef struct
+{
+ bcf1_t *rec;
+ int nori, nout; // number of ALTs in the input, and VCF rows on output
+ uint8_t *tbl; // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
+ uint8_t *overlaps; // is the star allele needed for this variant?
+ // one pointer per output row, set by _split_table_new to the atom that defines the row
+ atom_t **atoms;
+ // allocated sizes of atoms, tbl and overlaps, maintained by hts_expand in _split_table_init
+ int matoms, mtbl, moverlaps;
+ // INFO tag name stored by abuf_set(INFO_TAG); the pointer is kept as given and is not freed by abuf_destroy
+ char *info_tag;
+}
+split_t;
+
+// Buffer state behind the abuf_t handle returned by abuf_init.
+struct _abuf_t
+{
+ // mode passed to abuf_init; only SPLIT gets past the check there
+ abuf_opt_t mode;
+ // state of the record currently being split
+ split_t split;
+ // atoms of the current record: natoms in use, matoms allocated (grown with hts_expand0)
+ atom_t *atoms;
+ int natoms, matoms;
+ // header used to read the input record
+ const bcf_hdr_t *hdr;
+ // header used when updating output records; starts as hdr, replaceable via abuf_set(BCF_HDR)
+ bcf_hdr_t *out_hdr;
+ bcf1_t **vcf; // dimensions stored in rbuf
+ rbuf_t rbuf;
+
+ // scratch string used to build the history INFO value
+ kstring_t tmps;
+ // scratch value buffers; mtmp and mtmp2 hold their sizes in bytes
+ void *tmp, *tmp2;
+ // gt: genotypes read from the input record; tmpi: rewritten genotypes for one output record
+ int32_t *gt, *tmpi;
+ int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2;
+ // non-zero (the default set in abuf_init) allows the "*" allele on overlapping rows
+ int star_allele;
+};
+
+/*
+    Allocate and initialise an atomization buffer.
+
+    hdr  .. header of the records that will be processed; also used as the
+            output header until abuf_set(BCF_HDR) replaces it
+    mode .. only SPLIT is implemented, anything else ends in error()
+
+    Returns a zero-initialised buffer with the star allele enabled. The caller
+    releases it with abuf_destroy().
+*/
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
+{
+    if ( mode!=SPLIT ) error("todo\n");
+    abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
+    // Report allocation failure explicitly instead of dereferencing NULL below
+    if ( !buf ) error("Failed to alloc %d bytes\n", (int)sizeof(abuf_t));
+    buf->hdr = hdr;
+    buf->out_hdr = (bcf_hdr_t*) hdr;
+    buf->mode = mode;
+    buf->star_allele = 1;
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+/*
+    Release a buffer created by abuf_init() together with everything it owns:
+    the atoms and their strings, the split tables, the buffered output records
+    and all scratch buffers. split.info_tag is not freed here. Passing NULL is
+    a no-op, as with free().
+*/
+void abuf_destroy(abuf_t *buf)
+{
+    if ( !buf ) return;
+
+    int i;
+    // All matoms slots are visited: hts_expand0 zero-fills new slots, so
+    // never-used atoms have NULL strings and free() ignores them.
+    for (i=0; i<buf->matoms; i++)
+    {
+        free(buf->atoms[i].ref.s);
+        free(buf->atoms[i].alt.s);
+    }
+    free(buf->atoms);
+    free(buf->split.atoms);
+    free(buf->split.overlaps);
+    free(buf->split.tbl);
+    for (i=0; i<buf->rbuf.m; i++)
+        if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]);
+    free(buf->vcf);
+    free(buf->gt);
+    free(buf->tmpi);
+    free(buf->tmp);
+    free(buf->tmp2);
+    free(buf->tmps.s);
+    free(buf);
+}
+
+/*
+    Set one option of the buffer. "value" always points to the option value:
+      BCF_HDR     .. bcf_hdr_t*, header used for updating output records
+      INFO_TAG    .. char*, name of the INFO tag recording the original
+                     variant; its definition is added to the output header
+      STAR_ALLELE .. int, non-zero to allow the "*" allele in the output
+    Other keys are silently ignored.
+*/
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
+{
+    if ( key==BCF_HDR )
+        buf->out_hdr = *((bcf_hdr_t**)value);
+    else if ( key==INFO_TAG )
+    {
+        char *tag = *((char**)value);
+        buf->split.info_tag = tag;
+        bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",tag);
+    }
+    else if ( key==STAR_ALLELE )
+        buf->star_allele = *((int*)value);
+}
+
+/*
+    Append a new single-base atom refb>altb at offset pos, originating from
+    ALT allele ial, and return it. The atoms array may be reallocated here, so
+    previously obtained atom pointers must not be used afterwards.
+*/
+static atom_t *_atomize_new_atom(abuf_t *buf, int ial, int pos, char refb, char altb)
+{
+    buf->natoms++;
+    hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+    atom_t *atom = &buf->atoms[buf->natoms-1];
+    atom->ref.l = 0;
+    atom->alt.l = 0;
+    kputc(refb, &atom->ref);
+    kputc(altb, &atom->alt);
+    atom->beg = atom->end = pos;
+    atom->ial = ial;
+    return atom;
+}
+
+/*
+    Split alleles into primitives, e.g.
+        CC>TT       becomes  C>T,C>T
+        GCGT>GTGA   becomes  C>T,T>A
+
+    There is no sequence alignment, just trimming and hungry matching
+    from left side.
+*/
+static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
+{
+    const char *ref = rec->d.allele[0];
+    const char *alt = rec->d.allele[ial];
+    int rlen = strlen(ref);
+    int alen = strlen(alt);
+
+    // Trim identical sequence from right, always leaving at least one base
+    while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] )
+    {
+        rlen--;
+        alen--;
+    }
+    int maxlen = rlen > alen ? rlen : alen;
+
+    atom_t *cur = NULL;
+    int pos;
+    for (pos=0; pos<maxlen; pos++)
+    {
+        // '-' stands for "past the end of this allele"
+        char refb = pos<rlen ? ref[pos] : '-';
+        char altb = pos<alen ? alt[pos] : '-';
+
+        if ( refb==altb )
+        {
+            // A matching base opens an atom only when one of the alleles
+            // ends right after it (the base anchoring an indel).
+            if ( pos+1>=rlen || pos+1>=alen )
+                cur = _atomize_new_atom(buf, ial, pos, refb, altb);
+            continue;
+        }
+
+        if ( refb!='-' && altb!='-' )
+        {
+            // Plain mismatch: a new single-base substitution
+            cur = _atomize_new_atom(buf, ial, pos, refb, altb);
+            continue;
+        }
+
+        // One allele has run out: extend the current atom with the extra base
+        assert(cur);
+        if ( altb!='-' ) kputc(altb, &cur->alt);
+        if ( refb!='-' )
+        {
+            kputc(refb, &cur->ref);
+            cur->end++;
+        }
+    }
+}
+/*
+    Compare two atoms by start offset, then case-insensitively by REF and by
+    ALT. Returns 0 when the atoms describe the same change, otherwise a
+    negative or positive value giving their order.
+*/
+static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg != b->beg ) return a->beg < b->beg ? -1 : 1;
+    int diff = strcasecmp(a->ref.s,b->ref.s);
+    return diff ? diff : strcasecmp(a->alt.s,b->alt.s);
+}
+/*
+    Comparison callback giving atoms a total order: first by
+    _atoms_inconsistent(), then by the index of the source ALT allele.
+
+    The tie-break keeps tests reproducible across platforms: identical atoms
+    coming from different source ALTs are consistent, yet the VCF annotations
+    are currently taken from whichever comes first, so their order must be
+    deterministic.
+*/
+static int _cmp_atoms(const void *aptr, const void *bptr)
+{
+    const atom_t *a = (const atom_t*) aptr;
+    const atom_t *b = (const atom_t*) bptr;
+    int diff = _atoms_inconsistent(a,b);
+    if ( diff ) return diff;
+    return (a->ial > b->ial) - (a->ial < b->ial);
+}
+/*
+    Reset the split table for a new record: remember the record, size the
+    table for up to natoms output rows and clear the per-row overlap flags.
+*/
+static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
+{
+    split_t *split = &buf->split;
+
+    split->rec  = rec;
+    split->nori = rec->n_allele - 1;
+    split->nout = 0;
+
+    hts_expand(uint8_t,split->nori*natoms,split->mtbl,split->tbl);
+    hts_expand(atom_t*,natoms,split->matoms,split->atoms);
+    hts_expand(uint8_t,natoms,split->moverlaps,split->overlaps);
+    memset(split->overlaps,0,sizeof(*split->overlaps)*natoms);
+}
+/*
+    Start a new output row defined by "atom": the row is zeroed and only the
+    atom's own source allele is flagged as contributing (1).
+*/
+static void _split_table_new(abuf_t *buf, atom_t *atom)
+{
+    split_t *split = &buf->split;
+    int irow = split->nout++;
+    split->atoms[irow] = atom;
+
+    uint8_t *row = split->tbl + irow*split->nori;
+    int icol;
+    for (icol=0; icol<split->nori; icol++)
+        row[icol] = 0;
+    row[atom->ial-1] = 1;
+}
+/*
+    Record that "atom" overlaps output row iout. Its source allele is flagged
+    1 when the atom is the same change as the row's atom and 2 when it
+    differs; the row is marked as having an overlap either way.
+*/
+static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
+{
+    uint8_t *row = buf->split.tbl + iout*buf->split.nori;
+    int differs = _atoms_inconsistent(atom,buf->split.atoms[iout]) != 0;
+    row[atom->ial-1] = differs ? 2 : 1;
+    buf->split.overlaps[iout] = 1;
+}
+// Debugging helpers, compiled out. They dump the split table and the atom
+// list to stderr.
+#if 0
+// Print one line per output row: 1-based position, REF, ALT, then the table
+// cell of every original ALT allele.
+static void _split_table_print(abuf_t *buf)
+{
+ int i,j;
+ for (i=0; i<buf->split.nout; i++)
+ {
+ atom_t *atom = buf->split.atoms[i];
+ uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+ fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
+ for (j=0; j<buf->split.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]);
+ fprintf(stderr,"\n");
+ }
+}
+// Print every atom of the current record: its address, source ALT index,
+// REF>ALT and the beg-end offsets.
+static void _split_table_print_atoms(abuf_t *buf)
+{
+ int i;
+ for (i=0; i<buf->natoms; i++)
+ {
+ atom_t *atom = &buf->atoms[i];
+ fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
+ }
+}
+#endif
+/*
+    Non-zero when output row iout needs the "*" allele: star alleles must be
+    enabled and the row must have been marked as overlapped.
+*/
+static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
+{
+    return buf->star_allele ? buf->split.overlaps[iout] : 0;
+}
+/*
+    Translate allele index ial of the original record into the allele index
+    used in output row irow. REF (0) stays 0; for an ALT the value is the
+    table cell of that allele in the row (0, 1 or 2).
+*/
+static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
+{
+    if ( ial==0 ) return 0;
+    const uint8_t *row = buf->split.tbl + irow*buf->split.nori;
+    return row[ial - 1];
+}
+/*
+    Append one output record per split-table row to the ring buffer and fill
+    in its fixed columns: CHROM, POS (shifted by the atom's offset), ID,
+    REF/ALT (plus "*" when the row needs the star allele), QUAL and FILTER.
+*/
+static void _split_table_set_chrom_qual(abuf_t *buf)
+{
+    bcf1_t *src = buf->split.rec;
+    int irow;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        // Get the next slot of the ring buffer, reusing its record if present
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        int slot = rbuf_append(&buf->rbuf);
+        if ( !buf->vcf[slot] ) buf->vcf[slot] = bcf_init1();
+        bcf1_t *dst = buf->vcf[slot];
+        bcf_clear1(dst);
+
+        atom_t *atom = buf->split.atoms[irow];
+        dst->rid = src->rid;
+        dst->pos = src->pos + atom->beg;
+        bcf_update_id(buf->out_hdr, dst, src->d.id);
+
+        // The third allele is passed on only when the star allele is needed
+        const char *als[3] = { atom->ref.s, atom->alt.s, "*" };
+        int nals = _has_star_allele(buf,irow) ? 3 : 2;
+        bcf_update_alleles(buf->out_hdr, dst, als, nals);
+
+        if ( !bcf_float_is_missing(src->qual) )
+            dst->qual = src->qual;
+        else
+            bcf_float_set_missing(dst->qual);
+
+        bcf_update_filter(buf->out_hdr, dst, src->d.flt, src->d.n_flt);
+    }
+}
+// Copy one INFO annotation of the record being split to every output record.
+// Number=fixed/variable values are copied as they are, Number=A and Number=R
+// values are reduced to the entries that belong to the output row (plus a
+// missing value for the star allele when the row has one). "mode" only
+// affects integer Number=R tags, see the M_SUM branch below. Any failure to
+// update the output record ends in error().
+static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
+{
+ const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
+ int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
+ int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags
+ if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
+ if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
+
+ bcf1_t *rec = buf->split.rec;
+ // buf->mtmp is kept in bytes; for int/float the capacity is converted to
+ // a count of 4-byte values for the call and back to bytes afterwards
+ // (presumably bcf_get_info_values counts values for these types -- confirm).
+ int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp;
+ int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4;
+
+ // Check for incorrect number of values. Note this check does not consider all values missing
+ // and will remove annotations that don't pass.
+ if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+
+ // Keep the second scratch buffer at least as large as the first
+ if ( buf->mtmp2 < buf->mtmp )
+ {
+ buf->tmp2 = realloc(buf->tmp2, buf->mtmp);
+ if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp);
+ buf->mtmp2 = buf->mtmp;
+ }
+
+ // 4-byte "missing" value: the int32 missing value, overwritten with the
+ // float missing value when the tag is a float
+ int32_t missing = bcf_int32_missing;
+ void *missing_ptr = (void*)&missing;
+ if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+ int iout,i;
+ for (iout=0; iout<buf->split.nout; iout++)
+ {
+ bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+ int star_allele = _has_star_allele(buf,iout);
+ int ret = 0;
+ if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+ ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
+ else if ( len==BCF_VL_A )
+ {
+ // One value per ALT: take the value of the atom's source ALT
+ // (4 bytes per value) and append a missing value for the star allele
+ int iori = buf->split.atoms[iout]->ial - 1;
+ assert( iori<nval );
+ memcpy(buf->tmp2,buf->tmp+4*iori,4);
+ if ( star_allele )
+ memcpy(buf->tmp2+4,missing_ptr,4);
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records
+ int iori = buf->split.atoms[iout]->ial;
+ assert( iori<nval && iori<=buf->split.nori );
+ memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+ if ( type==BCF_HT_INT && mode==M_SUM )
+ {
+ // Add the values of the later source ALTs that the table
+ // flags (1) as carrying the same atom as this row
+ uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+ for (i=iori; i<buf->split.nori; i++)
+ {
+ if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
+ }
+ }
+ if ( star_allele )
+ memcpy(buf->tmp2+8,missing_ptr,4);
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
+ }
+ if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+ }
+}
+/*
+    Annotate every output record with the user-selected INFO tag describing
+    the original variant, in the format CHR|POS|REF|ALT|USED_ALT_IDX. The
+    last field lists the 1-based indexes of the original ALT alleles flagged
+    1 in the record's split-table row.
+*/
+static void _split_table_set_history(abuf_t *buf)
+{
+    bcf1_t *rec = buf->split.rec;
+    kstring_t *str = &buf->tmps;
+    int ial, irow;
+
+    // Common prefix "CHR|POS|REF|ALT1,ALT2,...|"; the trailing comma written
+    // after the last ALT is turned into the field separator.
+    str->l = 0;
+    ksprintf(str,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
+    for (ial=1; ial<rec->n_allele; ial++)
+    {
+        kputs(rec->d.allele[ial],str);
+        kputc(',',str);
+    }
+    int prefix_len = str->l;
+    str->s[str->l-1] = '|';
+
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        // Rewind to the end of the prefix and append this row's ALT indexes
+        str->l = prefix_len;
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,irow)];
+        const uint8_t *row = buf->split.tbl + irow*buf->split.nori;
+        for (ial=0; ial<buf->split.nori; ial++)
+        {
+            if ( row[ial]==1 )
+            {
+                kputw(ial+1,str);
+                kputc(',',str);
+            }
+        }
+        str->s[--str->l] = 0;   // drop the last character and terminate the string
+        if ( bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, str->s)!=0 )
+            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+    }
+}
+/*
+    Translate FORMAT/GT of the original record for each output record. Every allele index is
+    mapped through the split table: 0 stays REF, 1 is the output ALT and 2 (the original allele
+    overlaps the atom but does not match it) becomes the star allele if the output record
+    carries one, or a missing allele otherwise. Missing alleles and vector-end padding are
+    copied unchanged. Nothing is done when there are no samples or no genotypes.
+*/
+static void _split_table_set_gt(abuf_t *buf)
+{
+    const int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    bcf1_t *rec = buf->split.rec;
+    buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
+    if ( buf->ngt<=0 ) return;
+    hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
+
+    const int max_ploidy = buf->ngt/nsmpl;
+    int irow, ismpl, k;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,irow)];
+        const int star_allele = _has_star_allele(buf,irow);
+        for (ismpl=0; ismpl<nsmpl; ismpl++)
+        {
+            const int32_t *src = buf->gt + ismpl*max_ploidy;
+            int32_t *dst = buf->tmpi + ismpl*max_ploidy;
+            for (k=0; k<max_ploidy; k++)
+            {
+                const int32_t gt = src[k];
+                if ( gt==bcf_int32_vector_end || bcf_gt_is_missing(gt) )
+                {
+                    dst[k] = gt;
+                    continue;
+                }
+                int iori = bcf_gt_allele(gt);
+                if ( iori<0 || iori>=rec->n_allele )
+                    error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
+
+                int ial = _split_table_get_ial(buf,irow,iori);
+                if ( ial==2 && !star_allele )
+                    dst[k] = bcf_gt_missing;
+                else if ( bcf_gt_is_phased(gt) )
+                    dst[k] = bcf_gt_phased(ial);
+                else
+                    dst[k] = bcf_gt_unphased(ial);
+            }
+        }
+        bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt);
+    }
+}
+/*
+    Transfer one FORMAT annotation from the original record (buf->split.rec) to all output
+    records. FORMAT/GT is delegated to _split_table_set_gt(). For other tags the action depends
+    on the Number declared in the header:
+      - fixed or variable length: the values are copied unchanged
+      - Number=A: each sample keeps the value of the atom's source ALT
+      - Number=R: each sample keeps the REF value and the value of the atom's source ALT. With
+        mode==M_SUM and integer type, values of subsequent source ALTs marked with 1 in the
+        split table are added to the ALT value
+      - Number=G: each sample keeps the values of genotypes 0/0,0/a,a/a (diploid) or 0,a (haploid),
+        where a is the atom's source ALT
+    When the output record carries the star allele, the values corresponding to it are set to
+    missing. String tags with Number=A,R,G and 64-bit integers are not supported yet and are
+    skipped, as are Number=A,R,G tags with an unexpected number of values.
+
+    buf:  the buffer with the split table filled and the output records already created
+    fmt:  the FORMAT field of the original record
+    mode: how to combine values of several source ALTs which contribute to the same output allele
+*/
+static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
+    if ( tag[0]=='G' && tag[1]=='T' && !tag[2] )    // FORMAT/GT
+    {
+        _split_table_set_gt(buf);
+        return;
+    }
+
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;    // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;    // todo: 64bit integers
+
+    // Integers and floats are both moved around as opaque 4-byte items
+    const int num_size = 4;
+    assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp;  // number of items
+    int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size;     // number of bytes
+
+    if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return;     // not haploid nor diploid
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+
+    // Increase buffer size to accommodate star allele
+    int nval1 = nval / nsmpl;   // number of values per sample
+    mtmp = buf->mtmp;
+    if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1);     // +1 for the possibility of the star allele
+    else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+
+    if ( buf->mtmp2 < mtmp )
+    {
+        buf->tmp2 = realloc(buf->tmp2, mtmp);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
+        buf->mtmp2 = mtmp;
+    }
+
+    int iout, i, j;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        int star_allele = _has_star_allele(buf,iout);
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int ret = 0;
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            int iori = buf->split.atoms[iout]->ial - 1;     // 0-based index into the per-sample ALT values
+            assert( iori<nval1 );
+            for (i=0; i<nsmpl; i++)
+            {
+                // byte-wise addressing; arithmetic on void* is not standard C
+                char *src = (char*)buf->tmp + nval1*num_size*i;
+                char *dst = (char*)buf->tmp2 + num_size*i*(star_allele+1);
+                memcpy(dst,src+iori*num_size,num_size);
+                if ( star_allele )
+                    memcpy(dst+num_size,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            int iori = buf->split.atoms[iout]->ial;         // index into the per-sample REF,ALT values
+            assert( iori<nval1 );
+            for (i=0; i<nsmpl; i++)
+            {
+                char *src = (char*)buf->tmp + nval1*num_size*i;
+                char *dst = (char*)buf->tmp2 + num_size*i*(star_allele+2);
+                memcpy(dst,src,num_size);       // REF contributes to all records
+                memcpy(dst+num_size,src+iori*num_size,num_size);
+
+                if ( type==BCF_HT_INT && mode==M_SUM )
+                {
+                    // tbl[j] corresponds to the original ALT j+1, hence the loop covers
+                    // the source ALTs that follow the atom's own one
+                    uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                    for (j=iori; j<buf->split.nori; j++)
+                        if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
+                }
+                if ( star_allele )
+                    memcpy(dst+num_size*2,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
+        }
+        else if ( len==BCF_VL_G )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            int i01  = bcf_alleles2gt(0,iori);
+            int i11  = bcf_alleles2gt(iori,iori);
+            assert( iori<nval );
+            // Each sample gets 3 output values, or 6 with the star allele; shorter vectors
+            // are padded with vector-end. Items are copied with the size of the element
+            // type type_t (not of the `type` variable, which holds the BCF_HT_* code).
+            #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    type_t *src = (type_t*)buf->tmp + i*nval1; \
+                    type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
+                    int n=0;  /* determine ploidy of this genotype */ \
+                    while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
+                    src = (type_t*)buf->tmp + i*nval1; \
+                    memcpy(dst++,src,sizeof(type_t)); \
+                    int nmiss = 0, nend = 0; \
+                    if ( n==rec->n_allele ) /* haploid */ \
+                    { \
+                        memcpy(dst++,src+iori,sizeof(type_t)); \
+                        if ( star_allele ) { nmiss = 1; nend = 3; } \
+                        else nend = 1; \
+                    } \
+                    else if ( n==nval1 ) \
+                    { \
+                        memcpy(dst++,src+i01,sizeof(type_t)); \
+                        memcpy(dst++,src+i11,sizeof(type_t)); \
+                        if ( star_allele ) nmiss = 3; \
+                    } \
+                    else if ( n==1 && is_missing ) \
+                    { \
+                        if ( star_allele ) nend = 5; \
+                        else nend = 2; \
+                    } \
+                    else \
+                        error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
+                            bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
+                    for (j=0; j<nmiss; j++) { set_missing; dst++; } \
+                    for (j=0; j<nend; j++) { set_vector_end; dst++; } \
+                } \
+            }
+            switch (type)
+            {
+                case BCF_HT_INT:  BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
+                case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
+                default: error("Unexpected case: %d\n", type);
+            }
+            #undef BRANCH
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+    }
+}
+/*
+    Returns 1 if seq consists only of the characters A,C,G,T,N (in either case) and 0 otherwise.
+    An empty string yields 1.
+*/
+static inline int _is_acgtn(char *seq)
+{
+    for (; *seq; seq++)
+    {
+        // toupper() is undefined for negative arguments other than EOF, which a plain
+        // char can hold on platforms where char is signed; hence the cast
+        int c = toupper((unsigned char)*seq);
+        if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
+    }
+    return 1;
+}
+/*
+ The atomization works as follows:
+ - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
+ alignment is performed, just greedy trimming of the end, then from left. This operation returns
+ a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
+ to the original REF allele
+ - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
+ with a single ALT.
+ - For each new VCF record determine how to translate the original allele index (iori) to this new
+ record:
+ - 1: the original allele matches the atom
+ - 0: the original allele does not overlap this atom or the overlapping part matches the REF
+ allele
+ - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
+ The mapping is encoded in a table with columns corresponding to the original ALTs and rows
+ to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
+ atoms and 2's for overlapping mismatching atoms.
+
+ Note that different ALT alleles can result in the same atom (the same output line) and this code
+ does not know how to reconcile possibly conflicting VCF annotations. This could be improved
+ and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
+ be summed for the same atomized output allele. However, this level of complexity is not addressed
+ in this initial draft. Higher priority for now is to provide the inverse "join" operation.
+
+ Update 2021-04-09:
+ Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
+ Note that the code will fail on missing values (todo) and it needs to be generalized and
+ made customizable.
+*/
+void _abuf_split(abuf_t *buf, bcf1_t *rec) // Atomize rec and queue the resulting records in buf->rbuf; records that are not split are queued as a copy. See the description above.
+{
+ int i,j;
+ if ( rec->n_allele < 2 ) // no ALT allele, nothing to split: queue a copy of the record
+ {
+ rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+ int j = rbuf_append(&buf->rbuf); // note: shadows the function-scope j
+ if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); // the slot may hold a record from an earlier use
+ buf->vcf[j] = bcf_dup(rec);
+ return;
+ }
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( _is_acgtn(rec->d.allele[i]) ) continue;
+ rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); // an ALT with characters other than ACGTN: queue a copy of the whole record unsplit
+ int j = rbuf_append(&buf->rbuf); // note: shadows the function-scope j
+ if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
+ buf->vcf[j] = bcf_dup(rec);
+ return;
+ }
+
+ buf->natoms = 0;
+ for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i); // collect atoms of all ALT alleles
+ qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms); // identical atoms become adjacent
+ _split_table_init(buf,rec,buf->natoms);
+ for (i=0; i<buf->natoms; i++)
+ {
+ if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue; // same atom as the previous one, not a new output row
+ _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom
+ }
+ for (i=0; i<buf->natoms; i++)
+ {
+ // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles
+ atom_t *atom = &buf->atoms[i];
+ for (j=0; j<buf->split.nout; j++)
+ {
+ atom_t *out = buf->split.atoms[j];
+ if ( atom == out ) continue; // table already set to 1
+ if ( atom->beg > out->end ) continue; // cannot overlap this output atom
+ if ( atom->end < out->beg ) break; // this atom is ahead of all subsequent output records
+ _split_table_overlap(buf, j, atom); // sets the table cell to 1 (same atom) or 2 (mismatch) and flags the row as overlapping
+ }
+ }
+ assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
+
+ // Create the output records, transferring all annotations:
+ // CHROM-QUAL
+ _split_table_set_chrom_qual(buf);
+
+ // INFO
+ for (i=0; i<rec->n_info; i++)
+ {
+ // this implementation of merging rules is temporary: it should be generalized and made customizable through the API
+ merge_rule_t mode = M_FIRST;
+ const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key);
+ if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; // only these two tags are summed across source ALTs
+
+ _split_table_set_info(buf, &rec->d.info[i], mode);
+ }
+
+ // Set INFO tag showing the original record
+ if ( buf->split.info_tag )
+ _split_table_set_history(buf);
+
+ // FORMAT
+ for (i=0; i<rec->n_fmt; i++)
+ {
+ // this implementation of merging rules is temporary: it should be generalized and made customizable through the API
+ merge_rule_t mode = M_FIRST;
+ const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id);
+ if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM; // only these two tags are summed across source ALTs
+
+ _split_table_set_format(buf, &rec->d.fmt[i], mode);
+ }
+}
+
+/*
+    Push a record into the buffer. The record is fully unpacked and, in the SPLIT mode,
+    atomized; the results are retrieved with abuf_flush().
+*/
+void abuf_push(abuf_t *buf, bcf1_t *rec)
+{
+    bcf_unpack(rec, BCF_UN_ALL);
+    if ( buf->mode!=SPLIT ) return;
+    _abuf_split(buf,rec);
+}
+
+/*
+    Returns the next buffered record, or NULL when the buffer is empty. The record stays
+    owned by the buffer. The flush_all argument currently has no effect on the result.
+*/
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
+{
+    (void) flush_all;
+    if ( !buf->rbuf.n ) return NULL;
+    return buf->vcf[rbuf_shift(&buf->rbuf)];
+}
+
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2021 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <assert.h>
+#include <strings.h>
+#include <htslib/vcf.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "abuf.h"
+#include "rbuf.h"
+
+typedef enum // rule for combining values of source ALT alleles that map to the same output allele
+{
+ M_FIRST, M_SUM // M_FIRST: the default, keep the value of the atom's own source ALT; M_SUM: for integer Number=R tags also add values of the other contributing ALTs
+}
+merge_rule_t;
+
+typedef struct // one primitive variant obtained by atomizing an ALT allele
+{
+ kstring_t ref, alt; // REF and ALT sequence of the atom
+ int ial; // the index of the original ALT allele, 1-based
+ int beg, end; // 0-based inclusive offsets to ref,alt
+}
+atom_t;
+
+typedef struct // state of the record currently being split
+{
+ bcf1_t *rec; // the original record; set by _split_table_init, not owned
+ int nori, nout; // number of ALTs in the input, and VCF rows on output
+ uint8_t *tbl; // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
+ uint8_t *overlaps; // is the star allele needed for this variant?
+ atom_t **atoms; // the atom of each output row, pointers into the atoms array of abuf_t
+ int matoms, mtbl, moverlaps; // allocated sizes of atoms, tbl and overlaps
+ char *info_tag; // name of the INFO tag recording the original variant, or NULL; set via abuf_set(INFO_TAG), the string is not copied
+}
+split_t;
+
+struct _abuf_t // the atomization buffer
+{
+ abuf_opt_t mode; // operating mode; abuf_init accepts only SPLIT
+ split_t split; // split table of the current record
+ atom_t *atoms; // atoms of all ALT alleles of the current record
+ int natoms, matoms; // number of atoms in use and number of allocated atoms
+ const bcf_hdr_t *hdr; // header of the input records
+ bcf_hdr_t *out_hdr; // header used to update output records; defaults to hdr, can be changed via abuf_set(BCF_HDR)
+ bcf1_t **vcf; // dimensions stored in rbuf
+ rbuf_t rbuf; // round buffer indexing the output records in vcf
+
+ kstring_t tmps; // scratch string, used for the value of the history INFO tag
+ void *tmp, *tmp2; // scratch buffers: tmp receives values read from the input record, tmp2 the values written to an output record
+ int32_t *gt, *tmpi; // genotypes read from the input record and their translation for an output record
+ int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2; // ngt: number of genotype values; mgt,mtmpi: allocated items of gt,tmpi; mtmp,mtmp2: sizes of tmp,tmp2 in bytes; ntmpi: not used in the code shown here
+ int star_allele; // if non-zero (the default), overlapping output records get the star allele
+};
+
+/*
+    Create a new atomization buffer.
+
+    hdr:  header of the records that will be pushed in; it is also used for the output records
+          unless replaced with abuf_set(BCF_HDR)
+    mode: only SPLIT is implemented, anything else terminates via error()
+
+    Returns the new buffer, to be released with abuf_destroy(). The star allele is enabled by
+    default. A failed allocation is reported via error() rather than dereferencing NULL.
+*/
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
+{
+    if ( mode!=SPLIT ) error("todo\n");
+    abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
+    if ( !buf ) error("Failed to alloc %d bytes\n", (int)sizeof(abuf_t));
+    buf->hdr = hdr;
+    buf->out_hdr = (bcf_hdr_t*) hdr;
+    buf->mode = mode;
+    buf->star_allele = 1;
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+/*
+    Release the buffer and everything it owns: the atoms with their strings, the split table,
+    the buffered records and all scratch arrays. The headers are not touched.
+*/
+void abuf_destroy(abuf_t *buf)
+{
+    int i;
+
+    // All allocated atom slots are visited, not only the natoms currently in use
+    for (i=0; i<buf->matoms; i++)
+    {
+        atom_t *atom = &buf->atoms[i];
+        free(atom->ref.s);
+        free(atom->alt.s);
+    }
+    free(buf->atoms);
+
+    free(buf->split.atoms);
+    free(buf->split.overlaps);
+    free(buf->split.tbl);
+
+    for (i=0; i<buf->rbuf.m; i++)
+    {
+        if ( !buf->vcf[i] ) continue;
+        bcf_destroy(buf->vcf[i]);
+    }
+    free(buf->vcf);
+
+    free(buf->gt);
+    free(buf->tmpi);
+    free(buf->tmp);
+    free(buf->tmp2);
+    free(buf->tmps.s);
+    free(buf);
+}
+
+/*
+    Set an option. The value is passed by pointer and its type depends on the key:
+      BCF_HDR     .. bcf_hdr_t**, the header to use when updating the output records
+      INFO_TAG    .. char**, name of the INFO tag recording the original variant. The string is
+                     not copied; the corresponding header line is added to the output header
+      STAR_ALLELE .. int*, whether to add the star allele to overlapping records
+    Any other key is ignored.
+*/
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
+{
+    if ( key==BCF_HDR )
+        buf->out_hdr = *((bcf_hdr_t**)value);
+    else if ( key==INFO_TAG )
+    {
+        char *info_tag = *((char**)value);
+        buf->split.info_tag = info_tag;
+        bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",info_tag);
+    }
+    else if ( key==STAR_ALLELE )
+        buf->star_allele = *((int*)value);
+}
+
+/*
+ Split alleles into primitives, e.g.
+ CC>TT becomes C>T,C>T
+ GCGT>GTGA becomes C>T,T>A
+
+ There is no sequence alignment, just trimming of the identical sequence
+ from the right and greedy matching from the left side.
+*/
+// Append a new atom of the ALT allele ial, starting with one REF and one ALT base at the offset pos
+static atom_t *_atom_append(abuf_t *buf, int ial, int pos, char refb, char altb)
+{
+    buf->natoms++;
+    hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+    atom_t *atom = &buf->atoms[buf->natoms-1];
+    atom->ref.l = 0;
+    atom->alt.l = 0;
+    kputc(refb, &atom->ref);
+    kputc(altb, &atom->alt);
+    atom->beg = atom->end = pos;
+    atom->ial = ial;
+    return atom;
+}
+/*
+    Break the ALT allele ial of rec into atoms and append them to buf->atoms. REF and ALT are
+    compared base by base after trimming their common suffix (at least one base of each is
+    always kept):
+      - a mismatching pair of bases starts a new atom
+      - a matching pair starts a new atom only if one of the alleles ends right after it,
+        i.e. the base anchors an indel
+      - bases past the end of the shorter allele extend the most recent atom
+*/
+static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
+{
+    char *ref = rec->d.allele[0];
+    char *alt = rec->d.allele[ial];
+
+    // Trim identical sequence from right
+    int rlen = strlen(ref);
+    int alen = strlen(alt);
+    while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] )
+    {
+        rlen--;
+        alen--;
+    }
+    const int maxlen = rlen > alen ? rlen : alen;
+
+    atom_t *atom = NULL;
+    int pos;
+    for (pos=0; pos<maxlen; pos++)
+    {
+        // '-' stands for a position past the end of the allele
+        char refb = pos<rlen ? ref[pos] : '-';
+        char altb = pos<alen ? alt[pos] : '-';
+
+        if ( refb==altb )
+        {
+            // is the next base a deletion?
+            if ( pos+1>=rlen || pos+1>=alen ) atom = _atom_append(buf,ial,pos,refb,altb);
+            continue;
+        }
+        if ( refb!='-' && altb!='-' )
+        {
+            atom = _atom_append(buf,ial,pos,refb,altb);
+            continue;
+        }
+
+        // One of the alleles has ended, extend the current atom
+        assert(atom);
+        if ( altb!='-' ) kputc(altb, &atom->alt);
+        if ( refb!='-' )
+        {
+            kputc(refb, &atom->ref);
+            atom->end++;
+        }
+    }
+}
+/*
+    Compare two atoms by the start offset, then by REF and ALT (case-insensitively). Returns 0
+    when the atoms are identical in these three respects, otherwise a negative or positive
+    value usable for sorting.
+*/
+static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg != b->beg ) return a->beg < b->beg ? -1 : 1;
+    int rcmp = strcasecmp(a->ref.s,b->ref.s);
+    return rcmp ? rcmp : strcasecmp(a->alt.s,b->alt.s);
+}
+/*
+ For reproducibility of tests on different platforms, we need to guarantee the same order of identical
+ atoms originating from different source ALTs. Even though they are consistent, different values can be
+ picked for VCF annotations as currently the values from the one that comes first are used.
+*/
+// qsort callback: order atoms by position, REF and ALT; ties are broken by the index of the source ALT
+static int _cmp_atoms(const void *aptr, const void *bptr)
+{
+    const atom_t *a = (const atom_t*) aptr;
+    const atom_t *b = (const atom_t*) bptr;
+    int rcmp = _atoms_inconsistent(a,b);
+    if ( rcmp ) return rcmp;
+    return (a->ial > b->ial) - (a->ial < b->ial);
+}
+/*
+    Prepare the split table for a new record: remember the record, reset the number of output
+    rows and make room for up to natoms rows. Only the overlap flags are cleared here, the
+    table rows are initialized as they are added by _split_table_new().
+*/
+static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
+{
+    split_t *split = &buf->split;
+    split->rec  = rec;
+    split->nori = rec->n_allele - 1;
+    split->nout = 0;
+    hts_expand(uint8_t,split->nori*natoms,split->mtbl,split->tbl);
+    hts_expand(atom_t*,natoms,split->matoms,split->atoms);
+    hts_expand(uint8_t,natoms,split->moverlaps,split->overlaps);
+    memset(split->overlaps,0,sizeof(*split->overlaps)*natoms);
+}
+// Add a new output row for the atom: all columns are set to 0 except for the atom's source ALT, which is set to 1
+static void _split_table_new(abuf_t *buf, atom_t *atom)
+{
+    const int iout = buf->split.nout++;
+    uint8_t *row = buf->split.tbl + iout*buf->split.nori;
+    buf->split.atoms[iout] = atom;
+    memset(row,0,sizeof(*row)*buf->split.nori);
+    row[atom->ial-1] = 1;
+}
+/*
+    Record that the atom overlaps the output row iout: the column of the atom's source ALT is
+    set to 1 if the atom is identical to the row's atom and to 2 otherwise, and the row is
+    flagged as overlapping.
+*/
+static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
+{
+    uint8_t *row = buf->split.tbl + iout*buf->split.nori;
+    const int differs = _atoms_inconsistent(atom,buf->split.atoms[iout]) != 0;
+    row[atom->ial-1] = differs ? 2 : 1;
+    buf->split.overlaps[iout] = 1;
+}
+#if 0 // debugging helpers, compiled out
+static void _split_table_print(abuf_t *buf) // print one line per output row: 1-based position, REF, ALT and the table row
+{
+ int i,j;
+ for (i=0; i<buf->split.nout; i++)
+ {
+ atom_t *atom = buf->split.atoms[i];
+ uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+ fprintf(bcftools_stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
+ for (j=0; j<buf->split.nori; j++) fprintf(bcftools_stderr,"\t%d",(int)ptr[j]);
+ fprintf(bcftools_stderr,"\n");
+ }
+}
+static void _split_table_print_atoms(abuf_t *buf) // print all atoms of the current record, one per line
+{
+ int i;
+ for (i=0; i<buf->natoms; i++)
+ {
+ atom_t *atom = &buf->atoms[i];
+ fprintf(bcftools_stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
+ }
+}
+#endif
+// Returns the overlap flag of the output row iout, or 0 when the star allele is disabled
+static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
+{
+    return buf->star_allele ? buf->split.overlaps[iout] : 0;
+}
+// Translate the original allele index ial (0 for REF) to the value stored for it in the output row irow; REF always maps to 0
+static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
+{
+    return ial ? buf->split.tbl[irow*buf->split.nori + ial - 1] : 0;
+}
+/*
+    Create one output record per row of the split table and fill in the fixed columns.
+    CHROM, ID, QUAL and FILTER are taken from the original record, POS is shifted by the atom's
+    offset, and the alleles are the atom's REF and ALT, followed by the star allele when the
+    row needs it. Records already present in the buffer slots are cleared and reused.
+*/
+static void _split_table_set_chrom_qual(abuf_t *buf)
+{
+    bcf1_t *rec = buf->split.rec;
+    int irow;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        int slot = rbuf_append(&buf->rbuf);
+        if ( !buf->vcf[slot] ) buf->vcf[slot] = bcf_init1();
+        bcf1_t *out = buf->vcf[slot];
+        bcf_clear1(out);
+
+        atom_t *atom = buf->split.atoms[irow];
+        out->rid = rec->rid;
+        out->pos = rec->pos + atom->beg;
+        bcf_update_id(buf->out_hdr, out, rec->d.id);
+
+        const char *als[3] = { atom->ref.s, atom->alt.s, "*" };
+        bcf_update_alleles(buf->out_hdr, out, als, _has_star_allele(buf,irow) ? 3 : 2);
+
+        if ( bcf_float_is_missing(rec->qual) )
+            bcf_float_set_missing(out->qual);
+        else
+            out->qual = rec->qual;
+
+        bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt);
+    }
+}
+static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode) // Transfer one INFO annotation of the original record to all output records, subsetting Number=A,R values to the atom's source ALT
+{
+ const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
+ int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
+ int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
+ if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags
+ if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
+ if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
+
+ bcf1_t *rec = buf->split.rec;
+ int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp; // buf->mtmp is in bytes; for 4-byte numeric types convert to the number of items
+ int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4; // back to bytes
+
+ // Check for incorrect number of values. Note this check does not consider all values missing
+ // and will remove annotations that don't pass.
+ if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+
+ if ( buf->mtmp2 < buf->mtmp ) // keep the output scratch buffer at least as big as the input one
+ {
+ buf->tmp2 = realloc(buf->tmp2, buf->mtmp);
+ if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp);
+ buf->mtmp2 = buf->mtmp;
+ }
+
+ int32_t missing = bcf_int32_missing; // the 4-byte missing value, integer by default
+ void *missing_ptr = (void*)&missing;
+ if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); // overwritten with the float missing value for Type=Float
+
+ int iout,i;
+ for (iout=0; iout<buf->split.nout; iout++)
+ {
+ bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+ int star_allele = _has_star_allele(buf,iout);
+ int ret = 0;
+ if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) // copied unchanged; flags carry no values
+ ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
+ else if ( len==BCF_VL_A )
+ {
+ int iori = buf->split.atoms[iout]->ial - 1; // 0-based index into the ALT values
+ assert( iori<nval );
+ memcpy(buf->tmp2,buf->tmp+4*iori,4); // byte offsets; NOTE(review): arithmetic on void* relies on a compiler extension
+ if ( star_allele )
+ memcpy(buf->tmp2+4,missing_ptr,4); // the star allele gets a missing value
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
+ }
+ else if ( len==BCF_VL_R )
+ {
+ memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records
+ int iori = buf->split.atoms[iout]->ial; // index into the REF,ALT values
+ assert( iori<nval && iori<=buf->split.nori );
+ memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+ if ( type==BCF_HT_INT && mode==M_SUM )
+ {
+ uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+ for (i=iori; i<buf->split.nori; i++) // tbl[i] corresponds to the original ALT i+1, so this covers the ALTs following the atom's own one
+ {
+ if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
+ }
+ }
+ if ( star_allele )
+ memcpy(buf->tmp2+8,missing_ptr,4); // the star allele gets a missing value
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
+ }
+ if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+ }
+}
+/*
+    Annotate every output record with the INFO tag named by buf->split.info_tag. The value has
+    the form CHROM|POS|REF|ALT1,ALT2,..|IDX1,IDX2,.. where the first four fields describe the
+    original record and the last field lists the 1-based indexes of the original ALT alleles
+    which the split table marks with 1 for the given output row.
+*/
+static void _split_table_set_history(abuf_t *buf)
+{
+    bcf1_t *rec = buf->split.rec;
+    kstring_t *str = &buf->tmps;
+    int irow, ial;
+
+    // The part shared by all output rows: CHROM|POS|REF|ALT1,ALT2,..|
+    str->l = 0;
+    ksprintf(str,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
+    for (ial=1; ial<rec->n_allele; ial++)
+    {
+        kputs(rec->d.allele[ial],str);
+        kputc(',',str);
+    }
+    int prefix_len = str->l;
+    str->s[str->l-1] = '|';     // the last written character becomes the field separator
+
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,irow)];
+        const uint8_t *row = buf->split.tbl + irow*buf->split.nori;
+
+        // Rewind to the shared prefix and append the indexes of the contributing ALTs
+        str->l = prefix_len;
+        for (ial=0; ial<buf->split.nori; ial++)
+        {
+            if ( row[ial]==1 )
+            {
+                kputw(ial+1,str);
+                kputc(',',str);
+            }
+        }
+        str->s[--str->l] = 0;   // remove the last written character (the trailing comma)
+
+        if ( bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, str->s)!=0 )
+            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+    }
+}
+/*
+    Translate FORMAT/GT of the original record for each output record. Every allele index is
+    mapped through the split table: 0 stays REF, 1 is the output ALT and 2 (the original allele
+    overlaps the atom but does not match it) becomes the star allele if the output record
+    carries one, or a missing allele otherwise. Missing alleles and vector-end padding are
+    copied unchanged. Nothing is done when there are no samples or no genotypes.
+*/
+static void _split_table_set_gt(abuf_t *buf)
+{
+    const int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    bcf1_t *rec = buf->split.rec;
+    buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
+    if ( buf->ngt<=0 ) return;
+    hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
+
+    const int max_ploidy = buf->ngt/nsmpl;
+    int irow, ismpl, k;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,irow)];
+        const int star_allele = _has_star_allele(buf,irow);
+        for (ismpl=0; ismpl<nsmpl; ismpl++)
+        {
+            const int32_t *src = buf->gt + ismpl*max_ploidy;
+            int32_t *dst = buf->tmpi + ismpl*max_ploidy;
+            for (k=0; k<max_ploidy; k++)
+            {
+                const int32_t gt = src[k];
+                if ( gt==bcf_int32_vector_end || bcf_gt_is_missing(gt) )
+                {
+                    dst[k] = gt;
+                    continue;
+                }
+                int iori = bcf_gt_allele(gt);
+                if ( iori<0 || iori>=rec->n_allele )
+                    error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
+
+                int ial = _split_table_get_ial(buf,irow,iori);
+                if ( ial==2 && !star_allele )
+                    dst[k] = bcf_gt_missing;
+                else if ( bcf_gt_is_phased(gt) )
+                    dst[k] = bcf_gt_phased(ial);
+                else
+                    dst[k] = bcf_gt_unphased(ial);
+            }
+        }
+        bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt);
+    }
+}
+/*
+    Transfer one FORMAT annotation from the original record (buf->split.rec) to all output
+    records. FORMAT/GT is delegated to _split_table_set_gt(). For other tags the action depends
+    on the Number declared in the header:
+      - fixed or variable length: the values are copied unchanged
+      - Number=A: each sample keeps the value of the atom's source ALT
+      - Number=R: each sample keeps the REF value and the value of the atom's source ALT. With
+        mode==M_SUM and integer type, values of subsequent source ALTs marked with 1 in the
+        split table are added to the ALT value
+      - Number=G: each sample keeps the values of genotypes 0/0,0/a,a/a (diploid) or 0,a (haploid),
+        where a is the atom's source ALT
+    When the output record carries the star allele, the values corresponding to it are set to
+    missing. String tags with Number=A,R,G and 64-bit integers are not supported yet and are
+    skipped, as are Number=A,R,G tags with an unexpected number of values.
+
+    buf:  the buffer with the split table filled and the output records already created
+    fmt:  the FORMAT field of the original record
+    mode: how to combine values of several source ALTs which contribute to the same output allele
+*/
+static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
+    if ( tag[0]=='G' && tag[1]=='T' && !tag[2] )    // FORMAT/GT
+    {
+        _split_table_set_gt(buf);
+        return;
+    }
+
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;    // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;    // todo: 64bit integers
+
+    // Integers and floats are both moved around as opaque 4-byte items
+    const int num_size = 4;
+    assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp;  // number of items
+    int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size;     // number of bytes
+
+    if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return;     // not haploid nor diploid
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+
+    // Increase buffer size to accommodate star allele
+    int nval1 = nval / nsmpl;   // number of values per sample
+    mtmp = buf->mtmp;
+    if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1);     // +1 for the possibility of the star allele
+    else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+
+    if ( buf->mtmp2 < mtmp )
+    {
+        buf->tmp2 = realloc(buf->tmp2, mtmp);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
+        buf->mtmp2 = mtmp;
+    }
+
+    int iout, i, j;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        int star_allele = _has_star_allele(buf,iout);
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int ret = 0;
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            int iori = buf->split.atoms[iout]->ial - 1;     // 0-based index into the per-sample ALT values
+            assert( iori<nval1 );
+            for (i=0; i<nsmpl; i++)
+            {
+                // byte-wise addressing; arithmetic on void* is not standard C
+                char *src = (char*)buf->tmp + nval1*num_size*i;
+                char *dst = (char*)buf->tmp2 + num_size*i*(star_allele+1);
+                memcpy(dst,src+iori*num_size,num_size);
+                if ( star_allele )
+                    memcpy(dst+num_size,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            int iori = buf->split.atoms[iout]->ial;         // index into the per-sample REF,ALT values
+            assert( iori<nval1 );
+            for (i=0; i<nsmpl; i++)
+            {
+                char *src = (char*)buf->tmp + nval1*num_size*i;
+                char *dst = (char*)buf->tmp2 + num_size*i*(star_allele+2);
+                memcpy(dst,src,num_size);       // REF contributes to all records
+                memcpy(dst+num_size,src+iori*num_size,num_size);
+
+                if ( type==BCF_HT_INT && mode==M_SUM )
+                {
+                    // tbl[j] corresponds to the original ALT j+1, hence the loop covers
+                    // the source ALTs that follow the atom's own one
+                    uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                    for (j=iori; j<buf->split.nori; j++)
+                        if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
+                }
+                if ( star_allele )
+                    memcpy(dst+num_size*2,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
+        }
+        else if ( len==BCF_VL_G )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            int i01  = bcf_alleles2gt(0,iori);
+            int i11  = bcf_alleles2gt(iori,iori);
+            assert( iori<nval );
+            // Each sample gets 3 output values, or 6 with the star allele; shorter vectors
+            // are padded with vector-end. Items are copied with the size of the element
+            // type type_t (not of the `type` variable, which holds the BCF_HT_* code).
+            #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    type_t *src = (type_t*)buf->tmp + i*nval1; \
+                    type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
+                    int n=0;  /* determine ploidy of this genotype */ \
+                    while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
+                    src = (type_t*)buf->tmp + i*nval1; \
+                    memcpy(dst++,src,sizeof(type_t)); \
+                    int nmiss = 0, nend = 0; \
+                    if ( n==rec->n_allele ) /* haploid */ \
+                    { \
+                        memcpy(dst++,src+iori,sizeof(type_t)); \
+                        if ( star_allele ) { nmiss = 1; nend = 3; } \
+                        else nend = 1; \
+                    } \
+                    else if ( n==nval1 ) \
+                    { \
+                        memcpy(dst++,src+i01,sizeof(type_t)); \
+                        memcpy(dst++,src+i11,sizeof(type_t)); \
+                        if ( star_allele ) nmiss = 3; \
+                    } \
+                    else if ( n==1 && is_missing ) \
+                    { \
+                        if ( star_allele ) nend = 5; \
+                        else nend = 2; \
+                    } \
+                    else \
+                        error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
+                            bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
+                    for (j=0; j<nmiss; j++) { set_missing; dst++; } \
+                    for (j=0; j<nend; j++) { set_vector_end; dst++; } \
+                } \
+            }
+            switch (type)
+            {
+                case BCF_HT_INT:  BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
+                case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
+                default: error("Unexpected case: %d\n", type);
+            }
+            #undef BRANCH
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+    }
+}
+/*
+    Returns 1 if seq consists only of the characters A,C,G,T,N (in either case) and 0 otherwise.
+    An empty string yields 1.
+*/
+static inline int _is_acgtn(char *seq)
+{
+    for (; *seq; seq++)
+    {
+        // toupper() is undefined for negative arguments other than EOF, which a plain
+        // char can hold on platforms where char is signed; hence the cast
+        int c = toupper((unsigned char)*seq);
+        if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
+    }
+    return 1;
+}
+/*
+ The atomization works as follows:
+ - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
+ alignment is performed, just greedy trimming of the end, then from left. This operation returns
+ a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
+ to the original REF allele
+ - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
+ with a single ALT.
+ - For each new VCF record determine how to translate the original allele index (iori) to this new
+ record:
+ - 1: the original allele matches the atom
+ - 0: the original allele does not overlap this atom or the overlapping part matches the REF
+ allele
+ - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
+ The mapping is encoded in a table with columns corresponding to the original ALTs and rows
+ to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
+ atoms and 2's for overlapping mismatching atoms.
+
+ Note that different ALT alleles can result in the same atom (the same output line) and this code
+ does not know how to reconcile possibly conflicting VCF annotations. This could be improved
+ and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
+ be summed for the same atomized output allele. However, this level of complexity is not addressed
+ in this initial draft. Higher priority for now is to provide the inverse "join" operation.
+
+ Update 2021-04-09:
+ Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
+ Note that the code will fail on missing values (todo) and it needs to be generalized and
+ made customizable.
+*/
+// _abuf_split() - atomize one unpacked record and queue the result in buf.
+// Records with no ALT allele, or with any ALT that is not a plain ACGTN string,
+// are duplicated into the ring buffer unchanged. Otherwise the atoms are collected,
+// sorted, the allele translation table is filled and the annotations (CHROM-QUAL,
+// INFO, FORMAT) are transferred by the _split_table_* helpers.
+void _abuf_split(abuf_t *buf, bcf1_t *rec)
+{
+    int i,j;
+
+    // Nothing to atomize unless there is at least one ALT and all ALTs are ACGTN sequences
+    int pass_through = rec->n_allele < 2 ? 1 : 0;
+    for (i=1; !pass_through && i<rec->n_allele; i++)
+        if ( !_is_acgtn(rec->d.allele[i]) ) pass_through = 1;
+    if ( pass_through )
+    {
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        j = rbuf_append(&buf->rbuf);
+        if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
+        buf->vcf[j] = bcf_dup(rec);
+        // a NULL slot would later be returned by abuf_flush() and read as "no record ready"
+        if ( !buf->vcf[j] ) error("Failed to duplicate the VCF record\n");
+        return;
+    }
+
+    buf->natoms = 0;
+    for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i);
+    qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms);
+    _split_table_init(buf,rec,buf->natoms);
+    for (i=0; i<buf->natoms; i++)
+    {
+        // after sorting, duplicates are adjacent: only the first of each run creates an output row
+        if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
+        _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
+    }
+    for (i=0; i<buf->natoms; i++)
+    {
+        // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles
+        atom_t *atom = &buf->atoms[i];
+        for (j=0; j<buf->split.nout; j++)
+        {
+            atom_t *out = buf->split.atoms[j];
+            if ( atom == out ) continue;            // table already set to 1
+            if ( atom->beg > out->end ) continue;   // cannot overlap this output atom
+            if ( atom->end < out->beg ) break;      // this atom is ahead of all subsequent output records
+            _split_table_overlap(buf, j, atom);
+        }
+    }
+    assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
+
+    // Create the output records, transferring all annotations:
+    //  CHROM-QUAL
+    _split_table_set_chrom_qual(buf);
+
+    //  INFO
+    for (i=0; i<rec->n_info; i++)
+    {
+        // this implementation of merging rules is temporary: generalize and make customizable through the API
+        merge_rule_t mode = M_FIRST;
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key);
+        if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
+
+        _split_table_set_info(buf, &rec->d.info[i], mode);
+    }
+
+    // Set INFO tag showing the original record
+    if ( buf->split.info_tag )
+        _split_table_set_history(buf);
+
+    //  FORMAT
+    for (i=0; i<rec->n_fmt; i++)
+    {
+        // this implementation of merging rules is temporary: generalize and make customizable through the API
+        merge_rule_t mode = M_FIRST;
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id);
+        if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
+
+        _split_table_set_format(buf, &rec->d.fmt[i], mode);
+    }
+}
+
+// abuf_push() - fully unpack the record and pass it on for processing.
+// Only the SPLIT mode is acted upon; in any other mode the record is
+// unpacked and nothing else happens.
+void abuf_push(abuf_t *buf, bcf1_t *rec)
+{
+    bcf_unpack(rec, BCF_UN_ALL);
+    if ( buf->mode!=SPLIT ) return;
+    _abuf_split(buf,rec);
+}
+
+// abuf_flush() - hand out the oldest buffered record, or NULL when the buffer is empty.
+// The returned pointer stays in buf->vcf, it is not removed from that array here.
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
+{
+    // flush_all makes no difference at present: both settings take the same path
+    (void) flush_all;
+
+    if ( !buf->rbuf.n ) return NULL;
+    return buf->vcf[ rbuf_shift(&buf->rbuf) ];
+}
+
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2021 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Atomize/deatomize complex variants
+*/
+
+#ifndef __ABUF_H__
+#define __ABUF_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _abuf_t abuf_t;
+
+// Modes of operation and option keys.
+// The same enum serves two purposes: SPLIT/JOIN select the mode given to
+// abuf_init(), the remaining values are keys for abuf_set()/abuf_set_opt().
+typedef enum
+{
+    NONE,
+
+    // mode of operation, to be passed to abuf_init
+    SPLIT,
+    JOIN,
+
+    // option keys, to be passed to abuf_set
+    BCF_HDR,        // should the records be annotated, a writable bcf header is required
+    INFO_TAG,       // set BCF_HDR first
+    STAR_ALLELE     // 1: use STAR allele (the default), 0: set overlaps to missing
+}
+abuf_opt_t;
+
+/*
+ *  abuf_set_opt() - typed convenience wrapper around abuf_set()
+ *  @buf:    the buffer
+ *  @type:   C type of @value
+ *  @key:    one of the abuf_opt_t option keys
+ *  @value:  the value to set; it is copied into a temporary of the given type
+ *           and the address of that temporary is passed to abuf_set()
+ *
+ *  Wrapped in do { } while (0) so that the macro expands to a single statement
+ *  and can be used safely in unbraced if/else bodies. The temporary has a
+ *  deliberately unusual name so that it cannot shadow a variable used in @value.
+ */
+#define abuf_set_opt(buf,type,key,value) do { type abuf_set_opt_tmp_ = (value); abuf_set((buf), (key), (void*)&abuf_set_opt_tmp_); } while (0)
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value);
+
+/*
+ *  abuf_init() - init buffer
+ *  @hdr:   VCF/BCF header; presumably that of the records which will be
+ *          pushed (the definition is not visible here - TODO confirm)
+ *  @mode:  mode of operation, SPLIT or JOIN (see abuf_opt_t)
+ *
+ *  abuf_destroy() is the counterpart of abuf_init().
+ */
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode);
+void abuf_destroy(abuf_t *buf);
+
+/*
+ *  abuf_push() - Push a new site for analysis
+ *  @rec:  the record to process
+ */
+void abuf_push(abuf_t *buf, bcf1_t *rec);
+
+/*
+ *  abuf_flush() - Return next buffered record
+ *  @flush_all:  Set to 1 if no more overlapping records are coming (e.g. end of chromosome or end of file),
+ *               the buffer can be emptied.
+ *  return: The next atomized/deatomized VCF record or NULL if no record is ready. The returned
+ *          structure will be cleaned by abuf.
+ */
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all);
+
+#endif
+
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#define CAP_DIST 25
-bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+ int delta_baseQ)
{
bcf_callaux_t *bca;
if (theta <= 0.) theta = CALL_DEFTHETA;
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
+ bca->max_baseQ = max_baseQ;
+ bca->delta_baseQ = delta_baseQ;
bca->e = errmod_init(1. - theta);
bca->min_frac = 0.002;
bca->min_support = 1;
bca->npos = 100;
bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
+ bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
bca->nqual = 60;
bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
{
if (bca == 0) return;
errmod_destroy(bca->e);
- if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
- free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+ if (bca->npos) {
+ free(bca->ref_pos); free(bca->alt_pos);
+ free(bca->iref_pos); free(bca->ialt_pos);
+ bca->npos = 0;
+ }
+ free(bca->ref_mq); free(bca->alt_mq);
+ free(bca->iref_mq); free(bca->ialt_mq);
+ free(bca->ref_bq); free(bca->alt_bq);
free(bca->fwd_mqs); free(bca->rev_mqs);
bca->nqual = 0;
free(bca->bases); free(bca->inscns); free(bca);
}
// position in the sequence with respect to the aligned part of the read
-static int get_position(const bam_pileup1_t *p, int *len)
-{
-    int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
-    for (icig=0; icig<p->b->core.n_cigar; icig++)
-    {
-        int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
-        int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
-        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
-            continue;
-        }
-        if ( cig==BAM_CINS )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
+//
+// Returns the 1-based query position (p->qpos+1) minus the length of the
+// leading soft-clip. Hard-clips at either end are skipped.
+//   *len      read length (l_qseq) minus the leading and trailing soft-clips
+//   *sc_len   length of the soft-clip nearest to the current position, 0 if the read has none
+//   *sc_dist  distance from the current position to that soft-clip, 0 if the read has none
+// All three outputs are always written.
+static int get_position(const bam_pileup1_t *p, int *len,
+                        int *sc_len, int *sc_dist) {
+    int i, j, edist = p->qpos + 1;
+    int sc_left = 0, sc_right = 0;
+    int sc_left_dist = -1, sc_right_dist = -1;
+
+    // left end
+    for (i = 0; i < p->b->core.n_cigar; i++) {
+        int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
             continue;
-        }
-        if ( cig==BAM_CSOFT_CLIP )
-        {
-            iread += ncig;
-            if ( iread<=p->qpos ) edist -= ncig;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_left)
+        sc_left_dist = p->qpos+1 - sc_left;
+    edist -= sc_left;
+
+    // right end
+    for (j = p->b->core.n_cigar-1; j >= i; j--) {
+        int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
             continue;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_right)
+        sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
+
+    // Distance to nearest soft-clips and length of that clip.
+    if (sc_left_dist >= 0) {
+        if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
+            *sc_len  = sc_left;
+            *sc_dist = sc_left_dist;
+        } else {
+            // Both ends are clipped and the right clip is at least as close.
+            // Without this branch the outputs were left unassigned.
+            *sc_len  = sc_right;
+            *sc_dist = sc_right_dist;
         }
-        if ( cig==BAM_CDEL ) continue;
-        if ( cig==BAM_CHARD_CLIP ) continue;
-        if ( cig==BAM_CPAD ) continue;
-        if ( cig==BAM_CREF_SKIP ) continue;
-        fprintf(stderr,"todo: cigar %d\n", cig);
-        assert(0);
-    }
-    *len = n_tot_bases;
+    } else if (sc_right_dist >= 0) {
+        *sc_len  = sc_right;
+        *sc_dist = sc_right_dist;
+    } else {
+        *sc_len  = 0;
+        *sc_dist = 0;
+    }
+
+    *len = p->b->core.l_qseq - sc_left - sc_right;
     return edist;
 }
{
memset(bca->ref_pos,0,sizeof(int)*bca->npos);
memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+ memset(bca->iref_pos,0,sizeof(int)*bca->npos);
+ memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
+ memset(bca->ref_scl, 0, 100*sizeof(int));
+ memset(bca->alt_scl, 0, 100*sizeof(int));
+ memset(bca->iref_scl, 0, 100*sizeof(int));
+ memset(bca->ialt_scl, 0, 100*sizeof(int));
}
/*
Notes:
- - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
- which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
- Later it's used for multiallelic calling by bcftools -m
+ - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
+ which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
+ Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
- ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
*/
/*
// clean from previous run
r->ori_depth = 0;
r->mq0 = 0;
- memset(r->qsum,0,sizeof(float)*4);
memset(r->anno,0,sizeof(double)*16);
memset(r->p,0,sizeof(float)*25);
r->SCR = 0;
kroundup32(bca->max_bases);
bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
}
+
// fill the bases array
+ double nqual_over_60 = bca->nqual / 60.0;
+ int ADR_ref_missed[4] = {0};
+ int ADF_ref_missed[4] = {0};
for (i = n = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
if (p->is_del && !is_indel) continue;
++ori_depth;
if (is_indel)
{
- b = p->aux>>16&0x3f;
- baseQ = q = p->aux&0xff;
- // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
- // still only an approximation, but gives more accurate AD counts and calls correctly
- // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
- if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
- seqQ = p->aux>>8&0xff;
+ b = p->aux>>16&0x3f;
+ seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+ if (q < bca->min_baseQ)
+ {
+ if (!p->indel && b < 4)
+ {
+ if (bam_is_rev(p->b))
+ ADR_ref_missed[b]++;
+ else
+ ADF_ref_missed[b]++;
+ }
+ continue;
+ }
+ if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+ // high quality indel calls without p->indel set aren't
+ // particularly indicative of being a good REF match either,
+ // at least not in low coverage. So require solid coverage
+ // before we start utilising such quals.
+ b = 0;
+ q = (int)bam_get_qual(p->b)[p->qpos];
+ seqQ = (3*seqQ + 2*q)/8;
+ }
+ if (_n > 20 && seqQ > 40) seqQ = 40;
+ baseQ = p->aux>>8&0xff;
+
is_diff = (b != 0);
}
else
{
b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
- baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+
+ // Lowest of this and neighbour quality values
+ uint8_t *qual = bam_get_qual(p->b);
+ q = qual[p->qpos];
+ if (p->qpos > 0 &&
+ q > qual[p->qpos-1]+bca->delta_baseQ)
+ q = qual[p->qpos-1]+bca->delta_baseQ;
+ if (p->qpos+1 < p->b->core.l_qseq &&
+ q > qual[p->qpos+1]+bca->delta_baseQ)
+ q = qual[p->qpos+1]+bca->delta_baseQ;
+
if (q < bca->min_baseQ) continue;
+ if (q > bca->max_baseQ) q = bca->max_baseQ;
+ baseQ = q;
seqQ = 99;
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
}
if (q > 63) q = 63;
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
- if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
// collect annotations
if (b < 4)
{
- r->qsum[b] += q;
+ r->QS[b] += q;
if ( r->ADF )
{
if ( bam_is_rev(p->b) )
// collect for bias tests
if ( baseQ > 59 ) baseQ = 59;
if ( mapQ > 59 ) mapQ = 59;
- int len, epos = 0;
- if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) )
+ int len, epos = 0, sc_len = 0, sc_dist = 0;
+ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
{
- int pos = get_position(p, &len);
+ int pos = get_position(p, &len, &sc_len, &sc_dist);
epos = (double)pos/(len+1) * bca->npos;
+
+ if (sc_len) {
+ sc_len = 15.0*sc_len / sc_dist;
+ if (sc_len > 99) sc_len = 99;
+ }
}
- int ibq = baseQ/60. * bca->nqual;
- int imq = mapQ/60. * bca->nqual;
- if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
- else bca->fwd_mqs[imq]++;
+
+ int imq = mapQ * nqual_over_60;
+ int ibq = baseQ * nqual_over_60;
+
+ if ( bam_is_rev(p->b) )
+ bca->rev_mqs[imq]++;
+ else
+ bca->fwd_mqs[imq]++;
+
if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
+ bca->ref_scl[sc_len]++;
}
else
{
bca->alt_pos[epos]++;
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
+ bca->alt_scl[sc_len]++;
}
}
+
+ // Compensate for AD not being counted on low quality REF indel matches.
+ if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
+ {
+ for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+ {
+ r->ADR[i] += ADR_ref_missed[i];
+ r->ADF[i] += ADF_ref_missed[i];
+ }
+ }
+ else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
+ {
+ int dp = 0, dp_ambig = 0;
+ for (i=0; i<4; i++) dp += r->ADR[i];
+ for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
+ if ( dp )
+ for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
+ dp = 0, dp_ambig = 0;
+ for (i=0; i<4; i++) dp += r->ADF[i];
+ for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
+ if ( dp )
+ for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
+ }
+
r->ori_depth = ori_depth;
// glfgen
errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
return pval>1 ? 1 : pval;
}
-double calc_mwu_bias(int *a, int *b, int n)
+double calc_mwu_bias(int *a, int *b, int n, int left)
{
int na = 0, nb = 0, i;
double U = 0, ties = 0;
if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
double mean = ((double)na*nb)*0.5;
+ if (left && U > mean) return 1; // for MQB which is asymmetrical
if ( na==2 || nb==2 )
{
// Linear approximation
return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
}
+// A Z-score version of the above function.
+//
+// See "Normal approximation and tie correction" at
+// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
+//
+// The Z score is the number of standard deviations above or below the mean
+// with 0 being equality of the two distributions and +ve/-ve from there.
+//
+// This is a more robust score to filter on.
+//
+//   a, b       histograms of n bins each; a[i] and b[i] are the numbers of
+//              observations of the two groups falling in bin i
+//   left_only  used only when do_Z is 0: return HUGE_VAL when U is above its mean
+//   do_Z       non-zero: return the Z-score; zero: return the U-based score
+//
+// Returns HUGE_VAL when the score cannot be calculated (fewer than two
+// observations in total or a non-positive variance).
+//
+// The pair counts and the tie term grow as the square and the cube of the
+// depth, so they are accumulated in 64-bit integers and the products are
+// formed in double precision; plain int arithmetic overflows at high depth.
+double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
+    int i;
+    int64_t t;
+
+    // Optimisation
+    for (i = 0; i < n; i++)
+        if (b[i])
+            break;
+    int b_empty = (i == n);
+
+    // Count equal (e) and less-than (l) pairs.
+    int64_t e = 0, l = 0, na = 0, nb = 0;
+    if (b_empty) {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            int64_t ai = a[i];
+            na += ai;
+            t += (ai*ai-1)*ai;  // adjustment score for ties
+        }
+    } else {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            int64_t ai = a[i], bi = b[i];
+
+            // Combinations of a[i] and b[j] for i==j
+            e += ai*bi;
+
+            // nb is running total of b[i+1]..b[n-1].
+            // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
+            // for all i < j.
+            l += ai*nb;    // a<b
+
+            na += ai;
+            nb += bi;
+            int64_t p = ai+bi;
+            t += (p*p-1)*p;  // adjustment score for ties
+        }
+    }
+
+    if (na+nb <= 1)
+        return HUGE_VAL;
+
+    double U, m;
+    U = l + e*0.5; // Mann-Whitney U score
+    m = (double)na*nb / 2.0;
+
+    // With ties adjustment
+    double var2 = ((double)na*nb)/12.0 * ((na+nb+1) - (double)t/((double)(na+nb)*(na+nb-1)));
+    // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
+    if (var2 <= 0)
+        return HUGE_VAL;
+
+    if (do_Z) {
+        // S.D. normalised Z-score
+        //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
+        return (U - m) / sqrt(var2);
+    }
+
+    // Else U score, which can be asymmetric for some data types.
+    if (left_only && U > m)
+        return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
+
+    if (na >= 8 || nb >= 8) {
+        // Normal approximation, very good for na>=8 && nb>=8 and
+        // reasonable if na<8 or nb<8
+        return exp(-0.5*(U-m)*(U-m)/var2);
+    }
+
+    // Exact calculation; na and nb are both below 8 here
+    if (na==1 || nb == 1)
+        return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
+    else
+        return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
+}
+
static inline double logsumexp2(double a, double b)
{
if ( a>b )
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
{
int ref4, i, j;
- float qsum[5] = {0,0,0,0,0};
+ float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
if (ref_base >= 0) {
call->ori_ref = ref4 = seq_nt16_int[ref_base];
if (ref4 > 4) ref4 = 4;
for (i = 0; i < n; ++i)
{
float sum = 0;
- for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+ for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
if ( sum )
- for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+ for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
}
// sort qsum in ascending order (insertion sort)
// Set the reference allele and alternative allele(s)
for (i=0; i<5; i++) call->a[i] = -1;
- for (i=0; i<5; i++) call->qsum[i] = 0;
+ for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
call->unseen = -1;
call->a[0] = ref4;
for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
adf += B2B_MAX_ALLELES;
}
}
+ if ( bca->fmt_flag & B2B_FMT_QS )
+ {
+ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+
+ // reorder QS to match the allele ordering at this site
+ int32_t tmp[B2B_MAX_ALLELES];
+ int32_t *qs = call->QS, *qs_out = call->QS;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
+ for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
+ qs_out += call->n_alleles;
+ qs += B2B_MAX_ALLELES;
+ }
+ }
// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
call->shift = (int)(sum_min + .499);
// calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
// calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
- call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
- call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
- call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ // U z-normalised as +/- number of standard deviations from mean.
+ if (call->ori_ref < 0) {
+ if (bca->fmt_flag & B2B_INFO_RPB)
+ call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
+ bca->npos, 0, 1);
+ call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
+ bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_SCB )
+ call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
+ 100, 0,1);
+ } else {
+ if (bca->fmt_flag & B2B_INFO_RPB)
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+ bca->npos, 0, 1);
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
+ bca->nqual,1,1);
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
+ bca->nqual,0,1);
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+ bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_SCB )
+ call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
+ 100, 0,1);
+ }
+ } else {
+ // Old method; U as probability between 0 and 1
+ if ( bca->fmt_flag & B2B_INFO_RPB )
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+ bca->npos, 0, 0);
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
+ bca->nqual, 1, 0);
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
+ bca->nqual, 0, 0);
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+ bca->nqual, 0, 0);
+ }
#if CDF_MWU_TESTS
// CDF version of MWU tests is not calculated by default
call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
#endif
- if ( bca->fmt_flag & B2B_INFO_VDB )
+ if ( bca->fmt_flag & B2B_INFO_VDB )
call->vdb = calc_vdb(bca->alt_pos, bca->npos);
return 0;
if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
- if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ } else {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ }
+
+ if ( bc->strand_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+
#if CDF_MWU_TESTS
if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
}
if ( fmt_flag&B2B_FMT_SCR )
bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
+ if ( fmt_flag&B2B_FMT_QS )
+ bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
return 0;
}
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#define CAP_DIST 25
-bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+ int delta_baseQ)
{
bcf_callaux_t *bca;
if (theta <= 0.) theta = CALL_DEFTHETA;
bca->capQ = 60;
bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
bca->min_baseQ = min_baseQ;
+ bca->max_baseQ = max_baseQ;
+ bca->delta_baseQ = delta_baseQ;
bca->e = errmod_init(1. - theta);
bca->min_frac = 0.002;
bca->min_support = 1;
bca->npos = 100;
bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
+ bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
bca->nqual = 60;
bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
{
if (bca == 0) return;
errmod_destroy(bca->e);
- if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
- free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+ if (bca->npos) {
+ free(bca->ref_pos); free(bca->alt_pos);
+ free(bca->iref_pos); free(bca->ialt_pos);
+ bca->npos = 0;
+ }
+ free(bca->ref_mq); free(bca->alt_mq);
+ free(bca->iref_mq); free(bca->ialt_mq);
+ free(bca->ref_bq); free(bca->alt_bq);
free(bca->fwd_mqs); free(bca->rev_mqs);
bca->nqual = 0;
free(bca->bases); free(bca->inscns); free(bca);
}
// position in the sequence with respect to the aligned part of the read
-static int get_position(const bam_pileup1_t *p, int *len)
-{
-    int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
-    for (icig=0; icig<p->b->core.n_cigar; icig++)
-    {
-        int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
-        int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
-        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
-            continue;
-        }
-        if ( cig==BAM_CINS )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
+//
+// Returns the 1-based query position (p->qpos+1) minus the length of the
+// leading soft-clip. Hard-clips at either end are skipped.
+//   *len      read length (l_qseq) minus the leading and trailing soft-clips
+//   *sc_len   length of the soft-clip nearest to the current position, 0 if the read has none
+//   *sc_dist  distance from the current position to that soft-clip, 0 if the read has none
+// All three outputs are always written.
+static int get_position(const bam_pileup1_t *p, int *len,
+                        int *sc_len, int *sc_dist) {
+    int i, j, edist = p->qpos + 1;
+    int sc_left = 0, sc_right = 0;
+    int sc_left_dist = -1, sc_right_dist = -1;
+
+    // left end
+    for (i = 0; i < p->b->core.n_cigar; i++) {
+        int cig = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
             continue;
-        }
-        if ( cig==BAM_CSOFT_CLIP )
-        {
-            iread += ncig;
-            if ( iread<=p->qpos ) edist -= ncig;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_left)
+        sc_left_dist = p->qpos+1 - sc_left;
+    edist -= sc_left;
+
+    // right end
+    for (j = p->b->core.n_cigar-1; j >= i; j--) {
+        int cig = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
             continue;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_right)
+        sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
+
+    // Distance to nearest soft-clips and length of that clip.
+    if (sc_left_dist >= 0) {
+        if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
+            *sc_len  = sc_left;
+            *sc_dist = sc_left_dist;
+        } else {
+            // Both ends are clipped and the right clip is at least as close.
+            // Without this branch the outputs were left unassigned.
+            *sc_len  = sc_right;
+            *sc_dist = sc_right_dist;
         }
-        if ( cig==BAM_CDEL ) continue;
-        if ( cig==BAM_CHARD_CLIP ) continue;
-        if ( cig==BAM_CPAD ) continue;
-        if ( cig==BAM_CREF_SKIP ) continue;
-        fprintf(bcftools_stderr,"todo: cigar %d\n", cig);
-        assert(0);
-    }
-    *len = n_tot_bases;
+    } else if (sc_right_dist >= 0) {
+        *sc_len  = sc_right;
+        *sc_dist = sc_right_dist;
+    } else {
+        *sc_len  = 0;
+        *sc_dist = 0;
+    }
+
+    *len = p->b->core.l_qseq - sc_left - sc_right;
     return edist;
 }
{
memset(bca->ref_pos,0,sizeof(int)*bca->npos);
memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+ memset(bca->iref_pos,0,sizeof(int)*bca->npos);
+ memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
+ memset(bca->ref_scl, 0, 100*sizeof(int));
+ memset(bca->alt_scl, 0, 100*sizeof(int));
+ memset(bca->iref_scl, 0, 100*sizeof(int));
+ memset(bca->ialt_scl, 0, 100*sizeof(int));
}
/*
Notes:
- - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
- which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
- Later it's used for multiallelic calling by bcftools -m
+ - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
+ which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
+ Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
- ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
*/
/*
// clean from previous run
r->ori_depth = 0;
r->mq0 = 0;
- memset(r->qsum,0,sizeof(float)*4);
memset(r->anno,0,sizeof(double)*16);
memset(r->p,0,sizeof(float)*25);
r->SCR = 0;
kroundup32(bca->max_bases);
bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
}
+
// fill the bases array
+ double nqual_over_60 = bca->nqual / 60.0;
+ int ADR_ref_missed[4] = {0};
+ int ADF_ref_missed[4] = {0};
for (i = n = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
if (p->is_del && !is_indel) continue;
++ori_depth;
if (is_indel)
{
- b = p->aux>>16&0x3f;
- baseQ = q = p->aux&0xff;
- // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
- // still only an approximation, but gives more accurate AD counts and calls correctly
- // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
- if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
- seqQ = p->aux>>8&0xff;
+ b = p->aux>>16&0x3f;
+ seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+ if (q < bca->min_baseQ)
+ {
+ if (!p->indel && b < 4)
+ {
+ if (bam_is_rev(p->b))
+ ADR_ref_missed[b]++;
+ else
+ ADF_ref_missed[b]++;
+ }
+ continue;
+ }
+ if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+ // high quality indel calls without p->indel set aren't
+ // particularly indicative of being a good REF match either,
+ // at least not in low coverage. So require solid coverage
+ // before we start utilising such quals.
+ b = 0;
+ q = (int)bam_get_qual(p->b)[p->qpos];
+ seqQ = (3*seqQ + 2*q)/8;
+ }
+ if (_n > 20 && seqQ > 40) seqQ = 40;
+ baseQ = p->aux>>8&0xff;
+
is_diff = (b != 0);
}
else
{
b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
- baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+
+ // Lowest of this and neighbour quality values
+ uint8_t *qual = bam_get_qual(p->b);
+ q = qual[p->qpos];
+ if (p->qpos > 0 &&
+ q > qual[p->qpos-1]+bca->delta_baseQ)
+ q = qual[p->qpos-1]+bca->delta_baseQ;
+ if (p->qpos+1 < p->b->core.l_qseq &&
+ q > qual[p->qpos+1]+bca->delta_baseQ)
+ q = qual[p->qpos+1]+bca->delta_baseQ;
+
if (q < bca->min_baseQ) continue;
+ if (q > bca->max_baseQ) q = bca->max_baseQ;
+ baseQ = q;
seqQ = 99;
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
}
if (q > 63) q = 63;
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
- if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
// collect annotations
if (b < 4)
{
- r->qsum[b] += q;
+ r->QS[b] += q;
if ( r->ADF )
{
if ( bam_is_rev(p->b) )
// collect for bias tests
if ( baseQ > 59 ) baseQ = 59;
if ( mapQ > 59 ) mapQ = 59;
- int len, epos = 0;
- if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) )
+ int len, epos = 0, sc_len = 0, sc_dist = 0;
+ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
{
- int pos = get_position(p, &len);
+ int pos = get_position(p, &len, &sc_len, &sc_dist);
epos = (double)pos/(len+1) * bca->npos;
+
+ if (sc_len) {
+ sc_len = 15.0*sc_len / sc_dist;
+ if (sc_len > 99) sc_len = 99;
+ }
}
- int ibq = baseQ/60. * bca->nqual;
- int imq = mapQ/60. * bca->nqual;
- if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
- else bca->fwd_mqs[imq]++;
+
+ int imq = mapQ * nqual_over_60;
+ int ibq = baseQ * nqual_over_60;
+
+ if ( bam_is_rev(p->b) )
+ bca->rev_mqs[imq]++;
+ else
+ bca->fwd_mqs[imq]++;
+
if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
+ bca->ref_scl[sc_len]++;
}
else
{
bca->alt_pos[epos]++;
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
+ bca->alt_scl[sc_len]++;
}
}
+
+ // Compensate for AD not being counted on low quality REF indel matches.
+ if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
+ {
+ for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+ {
+ r->ADR[i] += ADR_ref_missed[i];
+ r->ADF[i] += ADF_ref_missed[i];
+ }
+ }
+ else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
+ {
+ int dp = 0, dp_ambig = 0;
+ for (i=0; i<4; i++) dp += r->ADR[i];
+ for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
+ if ( dp )
+ for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
+ dp = 0, dp_ambig = 0;
+ for (i=0; i<4; i++) dp += r->ADF[i];
+ for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
+ if ( dp )
+ for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
+ }
+
r->ori_depth = ori_depth;
// glfgen
errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
return pval>1 ? 1 : pval;
}
-double calc_mwu_bias(int *a, int *b, int n)
+double calc_mwu_bias(int *a, int *b, int n, int left)
{
int na = 0, nb = 0, i;
double U = 0, ties = 0;
if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
double mean = ((double)na*nb)*0.5;
+ if (left && U > mean) return 1; // for MQB which is asymmetrical
if ( na==2 || nb==2 )
{
// Linear approximation
return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
}
+// A Z-score version of the above function.
+//
+// See "Normal approximation and tie correction" at
+// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
+//
+// The Z score is the number of standard deviations above or below the mean
+// with 0 being equality of the two distributions and +ve/-ve from there.
+//
+// This is a more robust score to filter on.
+//
+// a, b      : histograms with n bins each (count of observations per bin)
+// left_only : only used when do_Z is zero; HUGE_VAL is returned if U is
+//             above its mean (one-sided test)
+// do_Z      : non-zero returns the Z-score, zero returns the U based score
+//
+// Returns HUGE_VAL when no value can be computed (fewer than two
+// observations in total, or a non-positive variance).
+//
+// All products of counts are evaluated in 64-bit integer or double
+// arithmetic: with deep pileups a single bin can hold thousands of reads
+// and p*p*p or na*nb no longer fit in an int.
+double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
+    int i;
+    int64_t t;
+
+    // Optimisation
+    for (i = 0; i < n; i++)
+        if (b[i])
+            break;
+    int b_empty = (i == n);
+
+    // Count equal (e) and less-than (l) pairings of a and b observations.
+    int64_t e = 0, l = 0;
+    int na = 0, nb = 0;
+    if (b_empty) {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            int64_t p = a[i];
+            na += a[i];
+            t += (p*p-1)*p;  // adjustment score for ties
+        }
+    } else {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            // Combinations of a[i] and b[j] for i==j
+            e += (int64_t)a[i]*b[i];
+
+            // nb is running total of b[i+1]..b[n-1].
+            // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
+            // for all i < j.
+            l += (int64_t)a[i]*nb;    // a<b
+
+            na += a[i];
+            nb += b[i];
+            int64_t p = (int64_t)a[i]+b[i];
+            t += (p*p-1)*p;  // adjustment score for ties
+        }
+    }
+
+    int64_t ntot = (int64_t)na + nb;
+    if (ntot <= 1)
+        return HUGE_VAL;
+
+    double nanb = (double)na * nb;
+    double U, m;
+    U = l + e*0.5; // Mann-Whitney U score
+    m = nanb / 2.0;
+
+    // With ties adjustment
+    double var2 = nanb/12.0 * ((ntot+1) - t/(double)(ntot*(ntot-1)));
+    // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
+    if (var2 <= 0)
+        return HUGE_VAL;
+
+    if (do_Z) {
+        // S.D. normalised Z-score
+        //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
+        return (U - m) / sqrt(var2);
+    }
+
+    // Else U score, which can be asymmetric for some data types.
+    if (left_only && U > m)
+        return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
+
+    if (na >= 8 || nb >= 8) {
+        // Normal approximation, very good for na>=8 && nb>=8 and
+        // reasonable if na<8 or nb<8
+        return exp(-0.5*(U-m)*(U-m)/var2);
+    }
+
+    // Exact calculation
+    if (na==1 || nb == 1)
+        return mann_whitney_1947_(na, nb, U) * sqrt(2*M_PI*var2);
+    else
+        return mann_whitney_1947(na, nb, U) * sqrt(2*M_PI*var2);
+}
+
static inline double logsumexp2(double a, double b)
{
if ( a>b )
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
{
int ref4, i, j;
- float qsum[5] = {0,0,0,0,0};
+ float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
if (ref_base >= 0) {
call->ori_ref = ref4 = seq_nt16_int[ref_base];
if (ref4 > 4) ref4 = 4;
for (i = 0; i < n; ++i)
{
float sum = 0;
- for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+ for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
if ( sum )
- for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+ for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
}
// sort qsum in ascending order (insertion sort)
// Set the reference allele and alternative allele(s)
for (i=0; i<5; i++) call->a[i] = -1;
- for (i=0; i<5; i++) call->qsum[i] = 0;
+ for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
call->unseen = -1;
call->a[0] = ref4;
for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
adf += B2B_MAX_ALLELES;
}
}
+ if ( bca->fmt_flag & B2B_FMT_QS )
+ {
+ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+
+ // reorder QS to match the allele ordering at this site
+ int32_t tmp[B2B_MAX_ALLELES];
+ int32_t *qs = call->QS, *qs_out = call->QS;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
+ for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
+ qs_out += call->n_alleles;
+ qs += B2B_MAX_ALLELES;
+ }
+ }
// if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
call->shift = (int)(sum_min + .499);
// calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
// calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
- call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
- call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
- call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ // U z-normalised as +/- number of standard deviations from mean.
+ if (call->ori_ref < 0) {
+ if (bca->fmt_flag & B2B_INFO_RPB)
+ call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
+ bca->npos, 0, 1);
+ call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
+ bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_SCB )
+ call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
+ 100, 0,1);
+ } else {
+ if (bca->fmt_flag & B2B_INFO_RPB)
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+ bca->npos, 0, 1);
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
+ bca->nqual,1,1);
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
+ bca->nqual,0,1);
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+ bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_SCB )
+ call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
+ 100, 0,1);
+ }
+ } else {
+ // Old method; U as probability between 0 and 1
+ if ( bca->fmt_flag & B2B_INFO_RPB )
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+ bca->npos, 0, 0);
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
+ bca->nqual, 1, 0);
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
+ bca->nqual, 0, 0);
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+ bca->nqual, 0, 0);
+ }
#if CDF_MWU_TESTS
// CDF version of MWU tests is not calculated by default
call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
#endif
- if ( bca->fmt_flag & B2B_INFO_VDB )
+ if ( bca->fmt_flag & B2B_INFO_VDB )
call->vdb = calc_vdb(bca->alt_pos, bca->npos);
return 0;
if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
- if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ } else {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ }
+
+ if ( bc->strand_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+
#if CDF_MWU_TESTS
if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
}
if ( fmt_flag&B2B_FMT_SCR )
bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
+ if ( fmt_flag&B2B_FMT_QS )
+ bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
return 0;
}
/* bam2bcf.h -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2014,2016 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#define B2B_FMT_SCR (1<<13)
#define B2B_INFO_VDB (1<<14)
#define B2B_INFO_RPB (1<<15)
+#define B2B_FMT_QS (1<<16)
+#define B2B_INFO_SCB (1<<17)
+#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised
#define B2B_MAX_ALLELES 5
+#define B2B_DROP 0
+#define B2B_INC_AD 1
+#define B2B_INC_AD0 2
+
#define PLP_HAS_SOFT_CLIP(i) ((i)&1)
-#define PLP_SAMPLE_ID(i) ((i)>>1)
+#define PLP_HAS_INDEL(i) ((i)&2)
+#define PLP_SAMPLE_ID(i) ((i)>>2)
+
+#define PLP_SET_SOFT_CLIP(i) ((i)|=1)
+#define PLP_SET_INDEL(i) ((i)|=2)
+#define PLP_SET_SAMPLE_ID(i,n) ((i)|=(n)<<2)
typedef struct __bcf_callaux_t {
- int fmt_flag;
- int capQ, min_baseQ;
+ int fmt_flag, ambig_reads;
+ int capQ, min_baseQ, max_baseQ, delta_baseQ;
int openQ, extQ, tandemQ; // for indels
uint32_t min_support, max_support; // for collecting indel candidates
double min_frac; // for collecting indel candidates
float max_frac; // for collecting indel candidates
int per_sample_flt; // indel filtering strategy
int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
+ int *iref_pos, *ialt_pos, *iref_mq, *ialt_mq; // for indels
+ int ref_scl[100], alt_scl[100]; // soft-clip length bias; SNP
+ int iref_scl[100], ialt_scl[100]; // soft-clip length bias; INDEL
// for internal uses
int max_bases;
int indel_types[4]; // indel lengths
uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
errmod_t *e;
void *rghash;
+ float indel_bias; // adjusts indel score threshold; lower => call more.
} bcf_callaux_t;
// per-sample values
typedef struct {
- uint32_t ori_depth;
+ uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied
unsigned int mq0;
- int32_t *ADF, *ADR, SCR;
- float qsum[4];
+ int32_t *ADF, *ADR, SCR, *QS; // FMT/QS
// The fields are:
// depth fwd .. ref (0) and non-ref (2)
// depth rev .. ref (1) and non-ref (3)
int tid, pos;
bcf_hdr_t *bcf_hdr;
int a[5]; // alleles: ref, alt, alt2, alt3
- float qsum[5]; // for the QS tag
+ float qsum[B2B_MAX_ALLELES]; // INFO/QS tag
int n, n_alleles, shift, ori_ref, unseen;
int n_supp; // number of supporting non-reference reads
double anno[16];
unsigned int depth, ori_depth, mq0;
- int32_t *PL, *DP4, *ADR, *ADF, *SCR;
+ int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
uint8_t *fmt_arr;
float vdb; // variant distance bias
- float mwu_pos, mwu_mq, mwu_bq, mwu_mqs;
+ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
#if CDF_MWU_TESTS
float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
#endif
float seg_bias;
+ float strand_bias; // phred-scaled fisher-exact test
kstring_t tmp;
} bcf_call_t;
extern "C" {
#endif
- bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
+ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+ int delta_baseQ);
void bcf_call_destroy(bcf_callaux_t *bca);
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012-2014,2016 Genome Research Ltd.
+ Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <math.h>
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/khash_str2int.h>
#include "bam2bcf.h"
+#include "str_finder.h"
#include <htslib/ksort.h>
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 50
+#define INDEL_WINDOW_SIZE 110
+#define MAX_TYPES 64
+
+// Take a reference position tpos and convert to a query position (returned).
+// This uses the CIGAR string plus alignment c->pos to do the mapping.
+//
+// *_tpos is returned as tpos if query overlaps tpos, but for deletions
+// it'll be either the start (is_left) or end (!is_left) ref position.
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
{
+ // x = pos in ref, y = pos in query seq
int k, x = c->pos, y = 0, last_y = 0;
*_tpos = c->pos;
for (k = 0; k < c->n_cigar; ++k) {
*_tpos = x;
return last_y;
}
+
// FIXME: check if the inserted sequence is consistent with the homopolymer run
// l is the relative gap length and l_run is the length of the homopolymer on the reference
static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
return max_i - pos;
}
+// Identify soft-clip length, position in seq, and clipped seq len.
+//
+// Walks the CIGAR of p->b and fills out:
+// *sc_len_r - length of the chosen soft-clip scaled by
+// 15/(distance to that clip + 1) and capped at 99;
+// 0 if the read has no soft-clip
+// *slen_r - b->core.l_qseq minus all soft-clipped bases
+// *epos_r - p->qpos minus the left soft-clip (moved to the end of an
+// insertion when that is nearer the end of the sequence),
+// scaled by bca->npos / (slen+1)
+// *end - -1 if no soft-clip was seen, 0 if the left clip was chosen,
+// 1 if the right clip was chosen
+static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+ int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+ bam1_t *b = p->b;
+ int sc_len = 0, sc_dist = -1, at_left = 1;
+ int epos = p->qpos, slen = b->core.l_qseq;
+ int k;
+ uint32_t *cigar = bam_get_cigar(b);
+ *end = -1;
+ for (k = 0; k < b->core.n_cigar; k++) {
+ int op = bam_cigar_op(cigar[k]);
+ if (op == BAM_CSOFT_CLIP) {
+ slen -= bam_cigar_oplen(cigar[k]);
+ if (at_left) {
+ // left end
+ sc_len += bam_cigar_oplen(cigar[k]);
+ epos -= sc_len; // don't count SC in seq pos
+ sc_dist = epos;
+ *end = 0;
+ } else {
+ // right end
+ int srlen = bam_cigar_oplen(cigar[k]);
+ int rd = b->core.l_qseq - srlen - p->qpos;
+ if (sc_dist < 0 || sc_dist > rd) {
+ // closer to right end than left
+ // FIXME: compensate for indel length too?
+ sc_dist = rd;
+ sc_len = srlen;
+ *end = 1;
+ }
+ }
+ } else if (op != BAM_CHARD_CLIP) {
+ // any op other than a soft or hard clip means we have left the
+ // leading clip region
+ at_left = 0;
+ }
+ }
+
+ if (p->indel > 0 && slen - (epos+p->indel) < epos)
+ epos += p->indel-1; // end of insertion, if near end of seq
+
+ // slen is now length of sequence minus soft-clips and
+ // epos is position of indel in seq minus left-clip.
+ *epos_r = (double)epos / (slen+1) * bca->npos;
+
+ if (sc_len) {
+ // scale importance of clip by distance to closest end
+ *sc_len_r = 15.0*sc_len / (sc_dist+1);
+ if (*sc_len_r > 99) *sc_len_r = 99;
+ } else {
+ *sc_len_r = 0;
+ }
+
+ *slen_r = slen;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present.
+//
+// n is the number of samples, n_plp[s] the number of reads in sample s
+// and plp[s] its pileup entries.
+//
+// As a side effect bca->max_support and bca->max_frac are reset and then
+// set from the sample with the most indel-carrying reads.
+//
+// Returns types and fills out n_types_r, max_rd_len_r, ref_type_r and N_r,
+// or NULL on error.
+// NULL is also returned (with the output arguments untouched) when the
+// site is to be skipped: only the reference type is present, indel support
+// is below bca->min_support / bca->min_frac, there are MAX_TYPES or more
+// types, or more than half of the scanned reference bases are 'N'.
+// The returned array is allocated with calloc.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+ int pos, bcf_callaux_t *bca, const char *ref,
+ int *max_rd_len_r, int *n_types_r,
+ int *ref_type_r, int *N_r) {
+ int i, j, t, s, N, m, max_rd_len, n_types;
+ int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+ uint32_t *aux;
+ int *types;
+
+ // N is the total number of reads
+ for (s = N = 0; s < n; ++s)
+ N += n_plp[s];
+
+ bca->max_support = bca->max_frac = 0;
+ aux = (uint32_t*) calloc(N + 1, 4);
+ if (!aux)
+ return NULL;
+
+ m = max_rd_len = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+ // Fill out aux[] array with all the non-zero indel sizes.
+ // Also tally number with indels (n_alt) and total (n_tot).
+ // Sizes are offset by MINUS_CONST so that deletions (negative) sort
+ // correctly as unsigned values.
+ for (s = 0; s < n; ++s) {
+ int na = 0, nt = 0;
+ for (i = 0; i < n_plp[s]; ++i) {
+ const bam_pileup1_t *p = plp[s] + i;
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
+ }
+
+ // FIXME: cache me in pileup struct.
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
+ }
+ // per-sample support test; superseded below by the pooled test
+ // unless bca->per_sample_flt is set
+ double frac = (double)na/nt;
+ if ( !indel_support_ok && na >= bca->min_support
+ && frac >= bca->min_frac )
+ indel_support_ok = 1;
+ if ( na > bca->max_support && frac > 0 )
+ bca->max_support = na, bca->max_frac = frac;
+
+ n_alt += na;
+ n_tot += nt;
+ }
+
+ // Sort aux[] and dedup
+ ks_introsort(uint32_t, m, aux);
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+
+ // Taking totals makes it hard to call rare indels (IMF filter)
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+ || n_alt < bca->min_support )
+ ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux);
+ return NULL;
+ }
+
+ // Bail out if we have far too many types of indel
+ if (n_types >= MAX_TYPES) {
+ free(aux);
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
+ fprintf(stderr, "[%s] excessive INDEL alleles at position %d. "
+ "Skip the position.\n", __func__, pos + 1);
+ return NULL;
+ }
+
+ // To prevent long stretches of N's to be mistaken for indels
+ // (sometimes thousands of bases), check the number of N's in the
+ // sequence and skip places where half or more reference bases are Ns.
+ // The scan covers min(2*INDEL_WINDOW_SIZE, max_rd_len) bases from pos,
+ // stopping early at the end of ref.
+ int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
+ ?2*INDEL_WINDOW_SIZE : max_rd_len);
+ for (i=pos; i<i_end && ref[i]; i++)
+ nN += ref[i] == 'N';
+ if ( nN*2>(i-pos) ) {
+ free(aux);
+ return NULL;
+ }
+
+ // Finally fill out the types[] array detailing the size of insertion
+ // or deletion.
+ types = (int*)calloc(n_types, sizeof(int));
+ if (!types) {
+ free(aux);
+ return NULL;
+ }
+ t = 0;
+ types[t++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i)
+ if (aux[i] != aux[i-1])
+ types[t++] = aux[i] - MINUS_CONST;
+ free(aux);
+
+ // Find the reference type, i.e. the index t with types[t] == 0
+ for (t = 0; t < n_types; ++t)
+ if (types[t] == 0) break;
+
+ *ref_type_r = t;
+ *n_types_r = n_types;
+ *max_rd_len_r = max_rd_len;
+ *N_r = N;
+
+ return types;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Construct per-sample consensus over the reference window starting at
+// 'left' and ending before 'right'.
+//
+// Each sample gets a copy of the reference (as 4-bit base codes) in which
+// at most two positions, those with the highest ALT counts, are replaced
+// by 15 (N) unless at least 70% of the bases observed there match the
+// reference.
+//
+// pos and bca are not referenced in this function.
+//
+// Returns an array of n consensus seqs (array and seqs allocated with
+// calloc), or NULL on failure.
+static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp,
+                                 int pos, bcf_callaux_t *bca, const char *ref,
+                                 int left, int right) {
+    int i, k, s, L = right - left + 1, max_i, max2_i;
+    char **ref_sample; // returned
+    uint32_t *cns = NULL, max, max2;
+    char *ref0 = NULL, *r;
+    ref_sample = (char**) calloc(n, sizeof(char*));
+    cns = (uint32_t*) calloc(L, 4);
+    ref0 = (char*) calloc(L, 1);
+    if (!ref_sample || !cns || !ref0) {
+        n = 0;
+        goto err;
+    }
+
+    // Convert ref ASCII to 0-15.
+    for (i = 0; i < right - left; ++i)
+        ref0[i] = seq_nt16_table[(int)ref[i+left]];
+
+    // NB: one consensus per sample 'n', not per indel type.
+    // FIXME: consider fixing this.  We should compute alignments vs
+    // types, not vs samples?  Or types/sample combined?
+    for (s = 0; s < n; ++s) {
+        r = ref_sample[s] = (char*) calloc(L, 1);
+        if (!r) {
+            // Entries 0..s-1 have been allocated; make the clean-up loop
+            // free exactly those (n = s-1 would leak entry s-1).
+            n = s;
+            goto err;
+        }
+
+        memset(cns, 0, sizeof(*cns) * L);
+
+        // collect ref and non-ref counts in cns
+        // (low 16 bits: REF matches, high 16 bits: ALT mismatches)
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            bam1_t *b = p->b;
+            uint32_t *cigar = bam_get_cigar(b);
+            uint8_t *seq = bam_get_seq(b);
+            int x = b->core.pos, y = 0;
+
+            // TODO: pileup exposes pileup_ind, but we also need e.g.
+            // pileup_len to know how much of the current CIGAR op-len
+            // we've used (or have remaining).  If we had that, we
+            // could start at p->qpos without having to scan through
+            // the entire CIGAR string until we find it.
+            //
+            // Without it about all we could do is have a side channel
+            // to cache the last known coords.  Messy, so punt for now.
+            // This is no longer the bottle neck until we get to 1000s of
+            // CIGAR ops.
+
+            for (k = 0; k < b->core.n_cigar; ++k) {
+                int op = cigar[k]&0xf;
+                int j, l = cigar[k]>>4;
+                if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                    if (x + l >= left) {
+                        j = left - x > 0 ? left - x : 0;
+                        int j_end = right - x < l ? right - x : l;
+                        for (; j < j_end; j++) {
+                            // Append to cns.  Note this is ref coords,
+                            // so insertions aren't in cns and deletions
+                            // will have lower coverage.
+
+                            // FIXME: want true consensus (with ins) per
+                            // type, so we can independently compare each
+                            // seq to each consensus and see which it
+                            // matches best, so we get proper GT analysis.
+                            cns[x+j-left] +=
+                                (bam_seqi(seq, y+j) == ref0[x+j-left])
+                                ? 1        // REF
+                                : (1<<16); // ALT
+                        }
+                    }
+                    x += l; y += l;
+                } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                    x += l;
+                } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                    y += l;
+                }
+
+                if (x > right)
+                    break;
+            }
+        }
+
+        // Determine a sample specific reference.
+        for (i = 0; i < right - left; ++i)
+            r[i] = ref0[i];
+
+        // Find deepest and 2nd deepest ALT region (max & max2).
+        max = max2 = 0; max_i = max2_i = -1;
+        for (i = 0; i < right - left; ++i) {
+            if (cns[i]>>16 >= max>>16)
+                max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+            else if (cns[i]>>16 >= max2>>16)
+                max2 = cns[i], max2_i = i;
+        }
+
+        // Mask the two positions with the most mismatches with 'N',
+        // unless nREF/(nREF+nALT) >= 70% there, in which case the
+        // reference base is kept.  The effect is that positions where
+        // more than 30% of bases differ from REF use "N" in the
+        // consensus, so we don't penalise ALT or REF when aligning
+        // against it.  (A poor man IUPAC code)
+        //
+        // Why is it only done in two loci at most?
+        if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7)
+            max_i = -1;
+        if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7)
+            max2_i = -1;
+        if (max_i >= 0) r[max_i] = 15;
+        if (max2_i >= 0) r[max2_i] = 15;
+
+        //for (i = 0; i < right - left; ++i)
+        //    fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr);
+        //fputc('\n', stderr);
+    }
+
+    free(ref0);
+    free(cns);
+
+    return ref_sample;
+
+ err:
+    free(ref0);
+    free(cns);
+    if (ref_sample) {
+        for (s = 0; s < n; s++)
+            free(ref_sample[s]);
+        free(ref_sample);
+    }
+
+    return NULL;
+}
+
+// Length of the homopolymer run in ref that contains position pos+1.
+//
+// The run is extended rightwards from pos+2 until the base changes or the
+// end of ref is reached, and leftwards from pos until the base changes or
+// the start of ref is passed.  An ambiguous base (code 15) at pos+1 gives
+// a run length of 1.
+static int bcf_cgp_l_run(const char *ref, int pos) {
+    int base = seq_nt16_table[(int)ref[pos + 1]];
+    if (base == 15)
+        return 1;
+
+    // first position to the right that is not part of the run
+    int hi = pos + 2;
+    while (ref[hi] && seq_nt16_table[(int)ref[hi]] == base)
+        ++hi;
+
+    // first position to the left that is not part of the run (may be -1)
+    int lo = pos;
+    while (lo >= 0 && seq_nt16_table[(int)ref[lo]] == base)
+        --lo;
+
+    return hi - (lo + 1);
+}
+
+
+// Compute the consensus of the inserted sequence for every insertion
+// type, minus indels which get added later.
+//
+// NOTE(review): the incoming value of 's' is overwritten by the sample
+// loop below, so base counts are pooled over all n samples rather than
+// taken from one sample - confirm this is intended.
+//
+// Returns a calloc'ed array of n_types * max_ins base codes (0-3, or 4
+// where no base was counted), indexed as [t*max_ins + j],
+// or NULL on memory allocation failure.
+// Side effect: types[t] is set to 0 for insertion types whose consensus
+// contains an N.
+static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, int *types, int n_types,
+                               int max_ins, int s) {
+    int i, j, t, k;
+    int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
+    if (!inscns_aux)
+        return NULL;
+
+    // Count the number of occurrences of each base at each position for
+    // each type of insertion.
+    for (t = 0; t < n_types; ++t) {
+        if (types[t] > 0) {
+            for (s = 0; s < n; ++s) {
+                for (i = 0; i < n_plp[s]; ++i) {
+                    bam_pileup1_t *p = plp[s] + i;
+                    if (p->indel == types[t]) {
+                        uint8_t *seq = bam_get_seq(p->b);
+                        for (k = 1; k <= p->indel; ++k) {
+                            int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+                            assert(c<5);
+                            ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Use the majority rule to construct the consensus
+    char *inscns = (char *)calloc(n_types * max_ins, 1);
+    if (!inscns) {
+        // Previously unchecked: the loop below would write through NULL.
+        free(inscns_aux);
+        return NULL;
+    }
+    for (t = 0; t < n_types; ++t) {
+        for (j = 0; j < types[t]; ++j) {
+            int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+            for (k = 0; k < 5; ++k)
+                if (ia[k] > max)
+                    max = ia[k], max_k = k;
+            inscns[t*max_ins + j] = max ? max_k : 4;
+            if (max_k == 4) {
+                // discard insertions which contain N's
+                types[t] = 0;
+                break;
+            }
+        }
+    }
+    free(inscns_aux);
+
+    return inscns;
+}
+
+#ifndef MIN
+# define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.
+//
+// The read qualities for [qbeg, qend), adjusted by the ZQ aux tag when it
+// is present, are clamped to 7..30 and passed to probaln_glocal() together
+// with the consensus segment ref2 + tbeg - left (length
+// tend - tbeg + |type|) and query. The alignment band width is |type| + 3.
+//
+// Fills out score: 0xffffff if probaln_glocal() fails; otherwise the raw
+// alignment score in the bits above the low 8, and in the low 8 bits a
+// length-normalised score scaled by bca->indel_bias and then adjusted for
+// STRs spanning qpos (capped at 255).
+// The 'right' and 'max_deletion' arguments are not referenced here.
+//
+// Returns 0 on success,
+// <0 on error (failure to allocate the quality buffer)
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+ int type, uint8_t *ref2, uint8_t *query,
+ int r_start, int r_end, int long_read,
+ int tbeg, int tend,
+ int left, int right,
+ int qbeg, int qend,
+ int qpos, int max_deletion,
+ int *score) {
+ // Illumina
+ probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+ // Parameters that work better on PacBio CCS 15k.
+ // We should consider querying the header and RG PU field.
+ // See also htslib/realn.c:sam_prob_realn()
+ if (long_read) {
+ apf.d = 1e-3;
+ apf.e = 1e-1;
+ }
+
+ type = abs(type);
+ apf.bw = type + 3;
+ int l, sc;
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
+ uint8_t *qq;
+
+ // Get segment of quality, either ZQ tag or if absent QUAL.
+ if (!(qq = (uint8_t*) calloc(qend - qbeg, 1)))
+ return -1;
+ bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+ if (bq) ++bq; // skip type
+ for (l = qbeg; l < qend; ++l) {
+ int qval = bq? qual[l] + (bq[l] - 64) : qual[l];
+ if (qval > 30)
+ qval = 30;
+ if (qval < 7)
+ qval = 7;
+ qq[l - qbeg] = qval;
+ }
+
+ // The bottom 8 bits are length-normalised score while
+ // the top bits are unnormalised.
+ sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type,
+ query, qend - qbeg, qq, &apf, 0, 0);
+ if (sc < 0) {
+ // alignment failed: report a sentinel score, not an error
+ *score = 0xffffff;
+ free(qq);
+ return 0;
+ }
+
+ // used for adjusting indelQ below
+ l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias;
+ *score = sc<<8 | MIN(255, l);
+
+ rep_ele *reps, *elt, *tmp;
+ uint8_t *seg = ref2 + tbeg - left;
+ int seg_len = tend - tbeg + type;
+
+ // Note: although seg moves (tbeg varies), ref2 is reused many times
+ // so we could factor out some find_STR calls. However it's not the
+ // bottleneck for now.
+
+ // FIXME: need to make this work on IUPAC.
+ reps = find_STR((char *)seg, seg_len, 0);
+ int iscore = 0;
+
+ // Identify STRs in ref covering the indel up to
+ // (or close to) the end of the sequence.
+ // Those having an indel and right at the sequence
+ // end do not confirm the total length of indel
+ // size. Specifically a *lack* of indel at the
+ // end, where we know indels occur in other
+ // sequences, is a possible reference bias.
+ //
+ // This is emphasised further if the sequence ends with
+ // soft clipping.
+ DL_FOREACH_SAFE(reps, elt, tmp) {
+ if (elt->start <= qpos && elt->end >= qpos) {
+ iscore += (elt->end-elt->start) / elt->rep_len; // c
+ if (elt->start+tbeg <= r_start ||
+ elt->end+tbeg >= r_end)
+ iscore += 2*(elt->end-elt->start);
+ }
+
+ // every element is unlinked and freed, whether or not it scored
+ DL_DELETE(reps, elt);
+ free(elt);
+ }
+
+ // Apply STR score to existing indelQ
+ l = (*score&0xff)*.8 + iscore*2;
+ *score = (*score & ~0xff) | MIN(255, l);
+
+ free(qq);
+
+ return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// For every read, ranks the per-type alignment scores held in score[]
+// (n_types entries per read, reads in sample order) and packs the best
+// type, seqQ and indelQ into p->aux.  Then selects up to four indel types
+// (reference type first, the rest by decreasing summed quality) into
+// bca->indel_types[], copies their insertion consensus into bca->inscns,
+// and rewrites the type index held in each p->aux so that it refers to
+// bca->indel_types[].
+//
+// sc[] and sumq[] are sized by MAX_TYPES, so the caller must ensure
+// n_types <= MAX_TYPES.
+//
+// Returns n_alt (number of reads assigned a non-reference type) on success
+//         -1 on failure (bca->inscns and bca->maxins are left unchanged)
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  int *score) {
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+    memset(sumq, 0, n_types * sizeof(int));
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            int *sct = &score[K*n_types], seqQ, indelQ;
+            // low 6 bits: type index; bits above: alignment score
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            if ((sc[0]&0x3f) == ref_type) {
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+            } else {
+                for (t = 0; t < n_types; ++t) // look for the reference type
+                    if ((sc[t]&0x3f) == ref_type) break;
+                indelQ = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+            }
+            tmp = sc[0]>>6 & 0xff;
+            // reduce indelQ
+            indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // effect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (seqQ > 255) seqQ = 255;
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+            sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+            // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+        }
+    }
+    // determine bca->indel_types[] and bca->inscns
+    // Resize through a temporary: assigning realloc()'s result straight to
+    // bca->inscns would lose (leak) the old buffer when it fails.
+    char *new_inscns = (char*) realloc(bca->inscns, max_ins * 4);
+    if (max_ins && !new_inscns)
+        return -1;
+    bca->maxins = max_ins;
+    bca->inscns = new_inscns;
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+        if (bca->maxins)
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+    // update p->aux
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            // j == 4: the read's type was not selected; qualities are zeroed
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+            //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with high number of samples, do we handle IMF correctly? Is it
+fraction of indels across entire data set, or just fraction for this
+specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt.
+ */
+
/*
notes:
- - n .. number of samples
- - the routine sets bam_pileup1_t.aux of each read as follows:
- - 6: unused
- - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
- - 8: estimated sequence quality .. (aux>>8)&0xff
- - 8: indel quality .. aux&0xff
+ - n .. number of samples
+ - the routine sets bam_pileup1_t.aux of each read as follows:
+ - 6: unused
+ - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
+ - 8: estimated sequence quality .. (aux>>8)&0xff
+ - 8: indel quality .. aux&0xff
*/
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+ bcf_callaux_t *bca, const char *ref)
{
- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+ if (ref == 0 || bca == 0) return -1;
+
+ int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins;
+ int *score, max_ref2;
int N, K, l_run, ref_type, n_alt;
char *inscns = 0, *ref2, *query, **ref_sample;
- if (ref == 0 || bca == 0) return -1;
// determine if there is a gap
for (s = N = 0; s < n; ++s) {
if (plp[s][i].indel != 0) break;
if (i < n_plp[s]) break;
}
- if (s == n) return -1; // there is no indel at this position.
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
- { // find out how many types of indels are present
- bca->max_support = bca->max_frac = 0;
- int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
- uint32_t *aux;
- aux = (uint32_t*) calloc(N + 1, 4);
- m = max_rd_len = 0;
- aux[m++] = MINUS_CONST; // zero indel is always a type
- for (s = 0; s < n; ++s) {
- int na = 0, nt = 0;
- for (i = 0; i < n_plp[s]; ++i) {
- const bam_pileup1_t *p = plp[s] + i;
- ++nt;
- if (p->indel != 0) {
- ++na;
- aux[m++] = MINUS_CONST + p->indel;
- }
- j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
- if (j > max_rd_len) max_rd_len = j;
- }
- double frac = (double)na/nt;
- if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
- indel_support_ok = 1;
- if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
- n_alt += na;
- n_tot += nt;
- }
- // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
- // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
- int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
- if ( nN*2>(i-pos) ) { free(aux); return -1; }
-
- ks_introsort(uint32_t, m, aux);
- // squeeze out identical types
- for (i = 1, n_types = 1; i < m; ++i)
- if (aux[i] != aux[i-1]) ++n_types;
- // Taking totals makes it hard to call rare indels
- if ( !bca->per_sample_flt )
- indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
- if ( n_types == 1 || !indel_support_ok ) { // then skip
- free(aux); return -1;
- }
- if (n_types >= 64) {
- free(aux);
- // TODO revisit how/whether to control printing this warning
- if (hts_verbose >= 2)
- fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
- return -1;
- }
- types = (int*)calloc(n_types, sizeof(int));
- t = 0;
- types[t++] = aux[0] - MINUS_CONST;
- for (i = 1; i < m; ++i)
- if (aux[i] != aux[i-1])
- types[t++] = aux[i] - MINUS_CONST;
- free(aux);
- for (t = 0; t < n_types; ++t)
- if (types[t] == 0) break;
- ref_type = t; // the index of the reference type (0)
- }
- { // calculate left and right boundary
- left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
- right = pos + INDEL_WINDOW_SIZE;
- if (types[0] < 0) right -= types[0];
- // in case the alignments stand out the reference
- for (i = pos; i < right; ++i)
- if (ref[i] == 0) break;
- right = i;
- }
- /* The following block fixes a long-existing flaw in the INDEL
+ if (s == n)
+ // there is no indel at this position.
+ return -1;
+
+ // find out how many types of indels are present
+ types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+ &max_rd_len, &n_types, &ref_type, &N);
+ if (!types)
+ return -1;
+
+
+ // calculate left and right boundary
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
+ if (types[0] < 0) right -= types[0];
+
+ // in case the alignments stand out the reference
+ for (i = pos; i < right; ++i)
+ if (ref[i] == 0) break;
+ right = i;
+
+
+ /* The following call fixes a long-existing flaw in the INDEL
* calling model: the interference of nearby SNPs. However, it also
* reduces the power because sometimes, substitutions caused by
* indels are not distinguishable from true mutations. Multiple
*
* Masks mismatches present in at least 70% of the reads with 'N'.
*/
- { // construct per-sample consensus
- int L = right - left + 1, max_i, max2_i;
- uint32_t *cns, max, max2;
- char *ref0, *r;
- ref_sample = (char**) calloc(n, sizeof(char*));
- cns = (uint32_t*) calloc(L, 4);
- ref0 = (char*) calloc(L, 1);
- for (i = 0; i < right - left; ++i)
- ref0[i] = seq_nt16_table[(int)ref[i+left]];
- for (s = 0; s < n; ++s) {
- r = ref_sample[s] = (char*) calloc(L, 1);
- memset(cns, 0, sizeof(int) * L);
- // collect ref and non-ref counts
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- bam1_t *b = p->b;
- uint32_t *cigar = bam_get_cigar(b);
- uint8_t *seq = bam_get_seq(b);
- int x = b->core.pos, y = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op = cigar[k]&0xf;
- int j, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j)
- if (x + j >= left && x + j < right)
- cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- }
- }
- // determine the consensus
- for (i = 0; i < right - left; ++i) r[i] = ref0[i];
- max = max2 = 0; max_i = max2_i = -1;
- for (i = 0; i < right - left; ++i) {
- if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
- else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
- }
- if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
- if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
- if (max_i >= 0) r[max_i] = 15;
- if (max2_i >= 0) r[max2_i] = 15;
- //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
- }
- free(ref0); free(cns);
- }
- { // the length of the homopolymer run around the current position
- int c = seq_nt16_table[(int)ref[pos + 1]];
- if (c == 15) l_run = 1;
- else {
- for (i = pos + 2; ref[i]; ++i)
- if (seq_nt16_table[(int)ref[i]] != c) break;
- l_run = i;
- for (i = pos; i >= 0; --i)
- if (seq_nt16_table[(int)ref[i]] != c) break;
- l_run -= i + 1;
- }
- }
- // construct the consensus sequence
+ ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right);
+
+ // The length of the homopolymer run around the current position
+ l_run = bcf_cgp_l_run(ref, pos);
+
+ // construct the consensus sequence (minus indels, which are added later)
max_ins = types[n_types - 1]; // max_ins is at least 0
if (max_ins > 0) {
- int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
- // count the number of occurrences of each base at each position for each type of insertion
- for (t = 0; t < n_types; ++t) {
- if (types[t] > 0) {
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- if (p->indel == types[t]) {
- uint8_t *seq = bam_get_seq(p->b);
- for (k = 1; k <= p->indel; ++k) {
- int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
- assert(c<5);
- ++inscns_aux[(t*max_ins+(k-1))*5 + c];
- }
- }
- }
- }
- }
- }
- // use the majority rule to construct the consensus
- inscns = (char*) calloc(n_types * max_ins, 1);
- for (t = 0; t < n_types; ++t) {
- for (j = 0; j < types[t]; ++j) {
- int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
- for (k = 0; k < 5; ++k)
- if (ia[k] > max)
- max = ia[k], max_k = k;
- inscns[t*max_ins + j] = max? max_k : 4;
- if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
- }
- }
- free(inscns_aux);
+ inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos,
+ types, n_types, max_ins, s);
+ if (!inscns)
+ return -1;
}
+
// compute the likelihood given each type of indel for each read
max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
ref2 = (char*) calloc(max_ref2, 1);
query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
- score1 = (int*) calloc(N * n_types, sizeof(int));
- score2 = (int*) calloc(N * n_types, sizeof(int));
+ score = (int*) calloc(N * n_types, sizeof(int));
bca->indelreg = 0;
+ double nqual_over_60 = bca->nqual / 60.0;
+
for (t = 0; t < n_types; ++t) {
int l, ir;
- probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
- apf1.bw = apf2.bw = abs(types[t]) + 3;
+
// compute indelreg
- if (types[t] == 0) ir = 0;
- else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
- else ir = est_indelreg(pos, ref, -types[t], 0);
- if (ir > bca->indelreg) bca->indelreg = ir;
-// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
- // realignment
+ if (types[t] == 0)
+ ir = 0;
+ else if (types[t] > 0)
+ ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+ else
+ ir = est_indelreg(pos, ref, -types[t], 0);
+
+ if (ir > bca->indelreg)
+ bca->indelreg = ir;
+
+ // Identify max deletion length
+ int max_deletion = 0;
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ if (max_deletion < -p->indel)
+ max_deletion = -p->indel;
+ }
+ }
+
+ // Realignment score, computed via BAQ
for (s = K = 0; s < n; ++s) {
- // write ref2
+ // Construct ref2 from ref_sample, inscns and indels.
+ // This is now the true sample consensus (possibly prepended
+ // and appended with reference if sample data doesn't span
+ // the full length).
for (k = 0, j = left; j <= pos; ++j)
ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
- if (types[t] <= 0) j += -types[t];
- else for (l = 0; l < types[t]; ++l)
- ref2[k++] = inscns[t*max_ins + l];
+
+ if (types[t] <= 0)
+ j += -types[t];
+ else
+ for (l = 0; l < types[t]; ++l)
+ ref2[k++] = inscns[t*max_ins + l];
+
for (; j < right && ref[j]; ++j)
ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
- for (; k < max_ref2; ++k) ref2[k] = 4;
- if (j < right) right = j;
+ for (; k < max_ref2; ++k)
+ ref2[k] = 4;
+
+ if (right > j)
+ right = j;
+
// align each read to ref2
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
- int qbeg, qend, tbeg, tend, sc, kk;
+
+ // Some basic ref vs alt stats.
+ int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+ imq *= nqual_over_60;
+
+ int sc_len, slen, epos, sc_end;
+
+ // Only need to gather stats on one type, as it's
+ // identical calculation for all the subsequent ones
+ // and we're sharing the same stats array
+ if (t == 0) {
+ // Gather stats for INFO field to aid filtering.
+ // mq and sc_len not very helpful for filtering, but could
+ // help in assigning a better QUAL value.
+ //
+ // Pos is slightly useful.
+ // Base qual can be useful, but need qual prior to BAQ?
+ // May need to cache orig quals in aux tag so we can fetch
+ // them even after mpileup step.
+ get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+ assert(imq >= 0 && imq < bca->nqual);
+ assert(epos >= 0 && epos < bca->npos);
+ assert(sc_len >= 0 && sc_len < 100);
+ if (p->indel) {
+ bca->ialt_mq[imq]++;
+ bca->ialt_scl[sc_len]++;
+ bca->ialt_pos[epos]++;
+ } else {
+ bca->iref_mq[imq]++;
+ bca->iref_scl[sc_len]++;
+ bca->iref_pos[epos]++;
+ }
+ }
+
+ int qbeg, qpos, qend, tbeg, tend, kk;
uint8_t *seq = bam_get_seq(p->b);
uint32_t *cigar = bam_get_cigar(p->b);
- if (p->b->core.flag&4) continue; // unmapped reads
- // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+ if (p->b->core.flag & BAM_FUNMAP) continue;
+
+ // FIXME: the following loop should be better moved outside;
+ // nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
- if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
- if (kk < p->b->core.n_cigar) continue;
- // FIXME: the following skips soft clips, but using them may be more sensitive.
+ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+ break;
+ if (kk < p->b->core.n_cigar)
+ continue;
+
// determine the start and end of sequences for alignment
- qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
- qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+ // FIXME: loops over CIGAR multiple times
+ int left2 = left, right2 = right;
+ if (p->b->core.l_qseq > 1000) {
+ // long read data needs less context. It also tends to
+ // have many more candidate indels to investigate so
+ // speed here matters more.
+ if (pos - left >= INDEL_WINDOW_SIZE)
+ left2 += INDEL_WINDOW_SIZE/2;
+ if (right-pos >= INDEL_WINDOW_SIZE)
+ right2 -= INDEL_WINDOW_SIZE/2;
+ }
+
+ int r_start = p->b->core.pos;
+ int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+ bam_get_cigar(p->b))
+ -1 + r_start;
+
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2,
+ 0, &tbeg);
+ qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+ 0, &tend) - qbeg;
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2,
+ 1, &tend);
+
if (types[t] < 0) {
int l = -types[t];
tbeg = tbeg - l > left? tbeg - l : left;
}
+
// write the query sequence
for (l = qbeg; l < qend; ++l)
query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
- { // do realignment; this is the bottleneck
- const uint8_t *qual = bam_get_qual(p->b), *bq;
- uint8_t *qq;
- qq = (uint8_t*) calloc(qend - qbeg, 1);
- bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
- if (bq) ++bq; // skip type
- for (l = qbeg; l < qend; ++l) {
- qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
- if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
- if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
- }
- sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
- if (l > 255) l = 255;
- score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
- if (sc > 5) {
- sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499);
- if (l > 255) l = 255;
- score2[K*n_types + t] = sc<<8 | l;
+
+ // A fudge for now. Consider checking SAM header for
+ // RG platform field.
+ int long_read = p->b->core.l_qseq > 1000;
+
+ // do realignment; this is the bottleneck
+ if (tend > tbeg) {
+ if (bcf_cgp_align_score(p, bca, types[t],
+ (uint8_t *)ref2 + left2-left,
+ (uint8_t *)query,
+ r_start, r_end, long_read,
+ tbeg, tend, left2, right2,
+ qbeg, qend, qpos, max_deletion,
+ &score[K*n_types + t]) < 0) {
+ score[K*n_types + t] = 0xffffff;
+ return -1;
}
- free(qq);
+ } else {
+ // place holder large cost for reads that cover the
+ // region entirely within a deletion (thus tend < tbeg).
+ score[K*n_types + t] = 0xffffff;
}
#if 0
for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
fputc('\n', stderr);
- for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
+ for (l = 0; l < qend - qbeg; ++l)
+ fputc("ACGTN"[(int)query[l]], stderr);
fputc('\n', stderr);
- fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+ fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s "
+ "qbeg=%d tbeg=%d score=%d\n",
+ pos, types[t], s, i, bam_get_qname(p->b),
+ qbeg, tbeg, sc);
#endif
}
}
}
- free(ref2); free(query);
- { // compute indelQ
- int sc_a[16], sumq_a[16];
- int tmp, *sc = sc_a, *sumq = sumq_a;
- if (n_types > 16) {
- sc = (int *)malloc(n_types * sizeof(int));
- sumq = (int *)malloc(n_types * sizeof(int));
- }
- memset(sumq, 0, n_types * sizeof(int));
- for (s = K = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i, ++K) {
- bam_pileup1_t *p = plp[s] + i;
- int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- /* errmod_cal() assumes that if the call is wrong, the
- * likelihoods of other events are equal. This is about
- * right for substitutions, but is not desired for
- * indels. To reuse errmod_cal(), I have to make
- * compromise for multi-allelic indels.
- */
- if ((sc[0]&0x3f) == ref_type) {
- indelQ1 = (sc[1]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ1 = (sc[t]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
- sct = &score2[K*n_types];
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- if ((sc[0]&0x3f) == ref_type) {
- indelQ2 = (sc[1]>>14) - (sc[0]>>14);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ2 = (sc[t]>>14) - (sc[0]>>14);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
- // pick the smaller between indelQ1 and indelQ2
- indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
- if (indelQ > 255) indelQ = 255;
- if (seqQ > 255) seqQ = 255;
- p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
- sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
- }
- }
- // determine bca->indel_types[] and bca->inscns
- bca->maxins = max_ins;
- bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
- for (t = 0; t < n_types; ++t)
- sumq[t] = sumq[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
- tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sumq[t]&0x3f) == ref_type) break;
- if (t) { // then move the reference type to the first
- tmp = sumq[t];
- for (; t > 0; --t) sumq[t] = sumq[t-1];
- sumq[0] = tmp;
- }
- for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
- for (t = 0; t < 4 && t < n_types; ++t) {
- bca->indel_types[t] = types[sumq[t]&0x3f];
- memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
- }
- // update p->aux
- for (s = n_alt = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- int x = types[p->aux>>16&0x3f];
- for (j = 0; j < 4; ++j)
- if (x == bca->indel_types[j]) break;
- p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
- if ((p->aux>>16&0x3f) > 0) ++n_alt;
- //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
- }
- }
- if (sc != sc_a) free(sc);
- if (sumq != sumq_a) free(sumq);
- }
- free(score1); free(score2);
+ // compute indelQ
+ n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+ ref_type, types, n_types, score);
+
// free
- for (i = 0; i < n; ++i) free(ref_sample[i]);
+ free(ref2);
+ free(query);
+ free(score);
+
+ for (i = 0; i < n; ++i)
+ free(ref_sample[i]);
+
free(ref_sample);
free(types); free(inscns);
+
return n_alt > 0? 0 : -1;
}
/* bam2bcf_indel.c -- indel caller.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012-2014,2016 Genome Research Ltd.
+ Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <math.h>
#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/khash_str2int.h>
#include "bam2bcf.h"
+#include "str_finder.h"
#include <htslib/ksort.h>
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 50
+#define INDEL_WINDOW_SIZE 110
+#define MAX_TYPES 64
+
+// Take a reference position tpos and convert to a query position (returned).
+// This uses the CIGAR string plus alignment c->pos to do the mapping.
+//
+// *_tpos is returned as tpos if query overlaps tpos, but for deletions
+// it'll be either the start (is_left) or end (!is_left) ref position.
static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
{
+ // x = pos in ref, y = pos in query seq
int k, x = c->pos, y = 0, last_y = 0;
*_tpos = c->pos;
for (k = 0; k < c->n_cigar; ++k) {
*_tpos = x;
return last_y;
}
+
// FIXME: check if the inserted sequence is consistent with the homopolymer run
// l is the relative gap length and l_run is the length of the homopolymer on the reference
static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
return max_i - pos;
}
+// Identify soft-clip length, position in seq, and clipped seq len.
+//
+// Walks the CIGAR of the read in pileup entry p and reports:
+//   *sc_len_r  soft-clip length scaled by 15/(distance to the clip + 1),
+//              capped at 99; 0 when the read has no soft clip
+//   *slen_r    query length with all soft-clipped bases removed
+//   *epos_r    position of the pileup column within the clipped read,
+//              rescaled to a bin index using bca->npos
+//   *end       -1 if no soft clip was seen, 0 if the reported clip is the
+//              leading one, 1 if it is the trailing one
+static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+                           int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+    bam1_t *rec = p->b;
+    uint32_t *cig = bam_get_cigar(rec);
+    int clip_len = 0, clip_dist = -1, leading = 1;
+    int ipos = p->qpos, unclipped = rec->core.l_qseq;
+    int c;
+
+    *end = -1;
+    for (c = 0; c < rec->core.n_cigar; c++) {
+        int op = bam_cigar_op(cig[c]);
+        int oplen = bam_cigar_oplen(cig[c]);
+
+        // Hard clips are ignored entirely; they do not end the left edge.
+        if (op == BAM_CHARD_CLIP)
+            continue;
+
+        // Any other non-soft-clip operation means we have left the
+        // leading edge of the read.
+        if (op != BAM_CSOFT_CLIP) {
+            leading = 0;
+            continue;
+        }
+
+        unclipped -= oplen;
+
+        if (leading) {
+            // Left end: don't count the soft clip in the sequence position.
+            clip_len += oplen;
+            ipos -= clip_len;
+            clip_dist = ipos;
+            *end = 0;
+            continue;
+        }
+
+        // Right end: only report it if it is closer than the left clip
+        // (or if there was no left clip at all).
+        // FIXME: compensate for indel length too?
+        int right_dist = rec->core.l_qseq - oplen - p->qpos;
+        if (clip_dist < 0 || clip_dist > right_dist) {
+            clip_dist = right_dist;
+            clip_len = oplen;
+            *end = 1;
+        }
+    }
+
+    // For an insertion nearer the end of the sequence than the start,
+    // measure from the end of the insertion instead.
+    if (p->indel > 0 && unclipped - (ipos + p->indel) < ipos)
+        ipos += p->indel - 1;
+
+    // unclipped is now the sequence length minus soft-clips and
+    // ipos is the position of the indel in seq minus the left-clip.
+    *epos_r = (double)ipos / (unclipped+1) * bca->npos;
+
+    if (clip_len) {
+        // Scale importance of the clip by distance to the closest end.
+        int scaled = 15.0*clip_len / (clip_dist+1);
+        *sc_len_r = scaled > 99 ? 99 : scaled;
+    } else {
+        *sc_len_r = 0;
+    }
+
+    *slen_r = unclipped;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present.
+//
+// n       number of samples
+// n_plp   number of pileup entries for each sample
+// plp     pileup entries for each sample
+// pos     reference position of the pileup column
+// bca     call parameters; max_support and max_frac are reset and updated
+// ref     reference sequence, scanned from pos for runs of 'N'
+//
+// Returns a calloc'ed array of the distinct indel sizes (0 for the
+// reference type, >0 insertions, <0 deletions) in the order produced by
+// sorting, and fills out n_types_r, max_rd_len_r, ref_type_r and N_r.
+// The caller is responsible for freeing the returned array.
+//
+// Returns NULL on allocation failure and also when the position should be
+// skipped: no indel type besides the reference, insufficient indel
+// support, MAX_TYPES or more distinct types, or more than half of the
+// inspected reference bases being 'N'.  The output arguments are not
+// written in that case.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+ int pos, bcf_callaux_t *bca, const char *ref,
+ int *max_rd_len_r, int *n_types_r,
+ int *ref_type_r, int *N_r) {
+ int i, j, t, s, N, m, max_rd_len, n_types;
+ int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+ uint32_t *aux;
+ int *types;
+
+ // N is the total number of reads
+ for (s = N = 0; s < n; ++s)
+ N += n_plp[s];
+
+ bca->max_support = bca->max_frac = 0;
+ // One slot per read plus one for the always-present reference type.
+ aux = (uint32_t*) calloc(N + 1, 4);
+ if (!aux)
+ return NULL;
+
+ m = max_rd_len = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+ // Fill out aux[] array with all the non-zero indel sizes.
+ // Sizes are biased by MINUS_CONST so that deletions (negative sizes)
+ // are stored as unsigned values that sort below the reference type.
+ // Also tally number with indels (n_alt) and total (n_tot).
+ for (s = 0; s < n; ++s) {
+ int na = 0, nt = 0;
+ for (i = 0; i < n_plp[s]; ++i) {
+ const bam_pileup1_t *p = plp[s] + i;
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
+ }
+
+ // FIXME: cache me in pileup struct.
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
+ }
+ // NOTE(review): for a sample with no reads nt is 0 and frac is 0/0
+ // (NaN); both comparisons below are then false.
+ double frac = (double)na/nt;
+ // Per-sample support test: one passing sample is enough.
+ if ( !indel_support_ok && na >= bca->min_support
+ && frac >= bca->min_frac )
+ indel_support_ok = 1;
+ // Remember the best-supported sample for later reporting via bca.
+ if ( na > bca->max_support && frac > 0 )
+ bca->max_support = na, bca->max_frac = frac;
+
+ n_alt += na;
+ n_tot += nt;
+ }
+
+ // Sort aux[] and dedup
+ ks_introsort(uint32_t, m, aux);
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+
+ // Taking totals makes it hard to call rare indels (IMF filter)
+ // Unless per-sample filtering is enabled, the per-sample decision made
+ // above is replaced by one based on the totals across all samples.
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+ || n_alt < bca->min_support )
+ ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux);
+ return NULL;
+ }
+
+ // Bail out if we have far too many types of indel
+ if (n_types >= MAX_TYPES) {
+ free(aux);
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
+ fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. "
+ "Skip the position.\n", __func__, pos + 1);
+ return NULL;
+ }
+
+ // To prevent long stretches of N's to be mistaken for indels
+ // (sometimes thousands of bases), check the number of N's in the
+ // sequence and skip places where more than half of the reference bases
+ // are Ns.  The scan covers at most min(2*INDEL_WINDOW_SIZE, max_rd_len)
+ // bases starting at pos and stops early at the end of ref.
+ int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
+ ?2*INDEL_WINDOW_SIZE : max_rd_len);
+ for (i=pos; i<i_end && ref[i]; i++)
+ nN += ref[i] == 'N';
+ if ( nN*2>(i-pos) ) {
+ free(aux);
+ return NULL;
+ }
+
+ // Finally fill out the types[] array detailing the size of insertion
+ // or deletion.
+ types = (int*)calloc(n_types, sizeof(int));
+ if (!types) {
+ free(aux);
+ return NULL;
+ }
+ // Copy each distinct value from the sorted aux[], removing the bias.
+ t = 0;
+ types[t++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i)
+ if (aux[i] != aux[i-1])
+ types[t++] = aux[i] - MINUS_CONST;
+ free(aux);
+
+ // Find the reference type, i.e. the index t with types[t] == 0
+ for (t = 0; t < n_types; ++t)
+ if (types[t] == 0) break;
+
+ *ref_type_r = t;
+ *n_types_r = n_types;
+ *max_rd_len_r = max_rd_len;
+ *N_r = N;
+
+ return types;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Construct per-sample consensus over the reference window [left, right).
+//
+// For every sample s, ref_sample[s] is a calloc'ed array of
+// L = right - left + 1 bytes holding 4-bit nucleotide codes
+// (seq_nt16_table values).  It starts as a copy of the reference and then
+// has at most two positions replaced by 15 ('N'): the two columns with the
+// highest mismatch counts, unless at least 70% of the bases observed there
+// agree with the reference.
+//
+// n          number of samples
+// n_plp/plp  per-sample pileup sizes and entries
+// pos, bca   currently unused by this function
+// ref        reference sequence (ASCII)
+// left/right window boundaries in reference coordinates
+//
+// Returns an array of n consensus seqs, which the caller must free
+// (each element and then the array itself), or NULL on failure.
+static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp,
+                                 int pos, bcf_callaux_t *bca, const char *ref,
+                                 int left, int right) {
+    int i, k, s, L = right - left + 1, max_i, max2_i;
+    char **ref_sample; // returned
+    uint32_t *cns = NULL, max, max2;
+    char *ref0 = NULL, *r;
+    ref_sample = (char**) calloc(n, sizeof(char*));
+    cns = (uint32_t*) calloc(L, 4);
+    ref0 = (char*) calloc(L, 1);
+    if (!ref_sample || !cns || !ref0) {
+        // No per-sample buffers exist yet, so the cleanup loop must not run.
+        n = 0;
+        goto err;
+    }
+
+    // Convert ref ASCII to 0-15.
+    for (i = 0; i < right - left; ++i)
+        ref0[i] = seq_nt16_table[(int)ref[i+left]];
+
+    // NB: one consensus per sample 'n', not per indel type.
+    // FIXME: consider fixing this.  We should compute alignments vs
+    // types, not vs samples?  Or types/sample combined?
+    for (s = 0; s < n; ++s) {
+        r = ref_sample[s] = (char*) calloc(L, 1);
+        if (!r) {
+            // Entries 0..s-1 were allocated successfully and must all be
+            // released; ref_sample[s] itself is NULL.
+            n = s;
+            goto err;
+        }
+
+        memset(cns, 0, sizeof(int) * L);
+
+        // Collect ref and non-ref counts in cns: the low 16 bits count
+        // bases matching the reference, the high 16 bits count mismatches.
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            bam1_t *b = p->b;
+            uint32_t *cigar = bam_get_cigar(b);
+            uint8_t *seq = bam_get_seq(b);
+            int x = b->core.pos, y = 0; // x: ref coord, y: query coord
+
+            // TODO: pileup exposes pileup_ind, but we also need e.g.
+            // pileup_len to know how much of the current CIGAR op-len
+            // we've used (or have remaining).  If we had that, we
+            // could start at p->qpos without having to scan through
+            // the entire CIGAR string until we find it.
+            //
+            // Without it about all we could do is have a side channel
+            // to cache the last known coords.  Messy, so punt for now.
+            // This is no longer the bottle neck until we get to 1000s of
+            // CIGAR ops.
+
+            for (k = 0; k < b->core.n_cigar; ++k) {
+                int op = cigar[k]&0xf;
+                int j, l = cigar[k]>>4;
+                if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                    if (x + l >= left) {
+                        // Clip the operation to the [left, right) window.
+                        j = left - x > 0 ? left - x : 0;
+                        int j_end = right - x < l ? right - x : l;
+                        for (; j < j_end; j++) {
+                            // Append to cns.  Note this is ref coords,
+                            // so insertions aren't in cns and deletions
+                            // will have lower coverage.
+
+                            // FIXME: want true consensus (with ins) per
+                            // type, so we can independently compare each
+                            // seq to each consensus and see which it
+                            // matches best, so we get proper GT analysis.
+                            cns[x+j-left] +=
+                                (bam_seqi(seq, y+j) == ref0[x+j-left])
+                                ? 1        // REF
+                                : (1<<16); // ALT
+                        }
+                    }
+                    x += l; y += l;
+                } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                    x += l;
+                } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                    y += l;
+                }
+
+                // Nothing further in this read can fall inside the window.
+                if (x > right)
+                    break;
+            }
+        }
+
+        // Determine a sample specific reference.
+        for (i = 0; i < right - left; ++i)
+            r[i] = ref0[i];
+
+        // Find deepest and 2nd deepest ALT region (max & max2).
+        max = max2 = 0; max_i = max2_i = -1;
+        for (i = 0; i < right - left; ++i) {
+            if (cns[i]>>16 >= max>>16)
+                max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+            else if (cns[i]>>16 >= max2>>16)
+                max2 = cns[i], max2_i = i;
+        }
+
+        // Masks mismatches present in at least 30% of the reads with 'N'.
+        // The test below is nREF/(nREF+nALT) >= 70% for the deepest
+        // region: when it holds the column is left as reference,
+        // otherwise it becomes "N" in the consensus so we don't penalise
+        // ALT or REF when aligning against it.  (A poor man's IUPAC code)
+        //
+        // NOTE(review): for a column with no coverage the ratio is 0/0
+        // (NaN), the comparison is false, and the column is still masked.
+        //
+        // Why is it only done in two loci at most?
+        if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7)
+            max_i = -1;
+        if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7)
+            max2_i = -1;
+        if (max_i >= 0) r[max_i] = 15;
+        if (max2_i >= 0) r[max2_i] = 15;
+
+        //for (i = 0; i < right - left; ++i)
+        //    fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr);
+        //fputc('\n', bcftools_stderr);
+    }
+
+    free(ref0);
+    free(cns);
+
+    return ref_sample;
+
+ err:
+    // n has been reduced to the number of per-sample buffers to release.
+    free(ref0);
+    free(cns);
+    if (ref_sample) {
+        for (s = 0; s < n; s++)
+            free(ref_sample[s]);
+        free(ref_sample);
+    }
+
+    return NULL;
+}
+
+// The length of the homopolymer run around the current position.
+//
+// The run is defined by the base at ref[pos + 1].  If that base maps to
+// code 15 in seq_nt16_table the run length is 1.  Otherwise the run is
+// extended rightwards from pos + 2 (stopping at the end of ref) and
+// leftwards from pos (stopping at the start of ref) for as long as the
+// bases map to the same code.
+static int bcf_cgp_l_run(const char *ref, int pos) {
+    int base = seq_nt16_table[(int)ref[pos + 1]];
+    int hi, lo;
+
+    if (base == 15)
+        return 1;
+
+    // hi ends up one past the last matching base to the right.
+    hi = pos + 2;
+    while (ref[hi] && seq_nt16_table[(int)ref[hi]] == base)
+        ++hi;
+
+    // lo ends up one before the first matching base to the left.
+    lo = pos;
+    while (lo >= 0 && seq_nt16_table[(int)ref[lo]] == base)
+        --lo;
+
+    return hi - (lo + 1);
+}
+
+
+// Part of bcf_call_gap_prep.
+//
+// Compute the consensus inserted sequence for every insertion type, by
+// majority vote over all reads (from all samples) carrying an insertion
+// of that length.
+//
+// n          number of samples
+// n_plp/plp  per-sample pileup sizes and entries
+// pos        currently unused by this function
+// types      indel sizes; entries whose consensus contains an N are
+//            reset to 0 (i.e. the insertion type is discarded)
+// n_types    number of entries in types
+// max_ins    length of the longest insertion; row stride of the result
+// s          the incoming value is ignored: the variable is reused as the
+//            sample loop index
+//
+// Returns a calloc'ed array of n_types * max_ins base codes (0-4, as given
+// by seq_nt16_int; 4 where no read contributed), with the consensus for
+// type t starting at offset t * max_ins.  The caller must free it.
+// Returns NULL on memory allocation failure.
+static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, int *types, int n_types,
+                               int max_ins, int s) {
+    int i, j, t, k;
+    // Five counters (one per base code 0-4) for every position of every
+    // type's insertion.
+    int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
+    if (!inscns_aux)
+        return NULL;
+
+    // Count the number of occurrences of each base at each position for
+    // each type of insertion.
+    for (t = 0; t < n_types; ++t) {
+        if (types[t] > 0) {
+            for (s = 0; s < n; ++s) {
+                for (i = 0; i < n_plp[s]; ++i) {
+                    bam_pileup1_t *p = plp[s] + i;
+                    if (p->indel == types[t]) {
+                        uint8_t *seq = bam_get_seq(p->b);
+                        // Inserted bases follow the base at p->qpos.
+                        for (k = 1; k <= p->indel; ++k) {
+                            int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+                            assert(c<5);
+                            ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Use the majority rule to construct the consensus
+    char *inscns = (char *)calloc(n_types * max_ins, 1);
+    if (!inscns) {
+        // Previously unchecked: the loop below would write through NULL.
+        free(inscns_aux);
+        return NULL;
+    }
+    for (t = 0; t < n_types; ++t) {
+        for (j = 0; j < types[t]; ++j) {
+            int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+            for (k = 0; k < 5; ++k)
+                if (ia[k] > max)
+                    max = ia[k], max_k = k;
+            inscns[t*max_ins + j] = max ? max_k : 4;
+            if (max_k == 4) {
+                // discard insertions which contain N's
+                types[t] = 0;
+                break;
+            }
+        }
+    }
+    free(inscns_aux);
+
+    return inscns;
+}
+
+#ifndef MIN
+# define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.
+//
+// p            pileup entry whose read is being scored
+// bca          call parameters; only indel_bias is read here
+// type         indel size being tested; only its absolute value is used
+// ref2         consensus to align against; the aligned segment starts at
+//              ref2 + tbeg - left and is tend - tbeg + |type| long
+// query        query bases, qend - qbeg of them
+// r_start      reference start of the read, compared with STR positions
+// r_end        reference end of the read, compared with STR positions
+// long_read    non-zero selects the long-read alignment parameters
+// qbeg, qend   range of the read's qualities to use
+// qpos         position tested for overlap with STRs found in the segment
+// right, max_deletion  not referenced in this function body
+//
+// Fills out score: bits 8 and above hold the raw alignment score and the
+// low 8 bits the adjusted, length-normalised score.  If the aligner
+// reports a negative score, *score is set to 0xffffff instead.
+// Returns 0 on success,
+// <0 on error (failure to allocate the quality buffer)
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+ int type, uint8_t *ref2, uint8_t *query,
+ int r_start, int r_end, int long_read,
+ int tbeg, int tend,
+ int left, int right,
+ int qbeg, int qend,
+ int qpos, int max_deletion,
+ int *score) {
+ // Illumina
+ probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+ // Parameters that work better on PacBio CCS 15k.
+ // We should consider querying the header and RG PU field.
+ // See also htslib/realn.c:sam_prob_realn()
+ if (long_read) {
+ apf.d = 1e-3;
+ apf.e = 1e-1;
+ }
+
+ // The alignment band width grows with the size of the indel tested.
+ type = abs(type);
+ apf.bw = type + 3;
+ int l, sc;
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
+ uint8_t *qq;
+
+ // Get segment of quality, either ZQ tag or if absent QUAL.
+ // NOTE(review): if qend == qbeg, calloc(0, 1) is permitted to return
+ // NULL, which would be reported as an error here, and the
+ // normalisation further down divides by (qend - qbeg) -- confirm
+ // callers never pass an empty query.
+ if (!(qq = (uint8_t*) calloc(qend - qbeg, 1)))
+ return -1;
+ bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+ if (bq) ++bq; // skip type
+ for (l = qbeg; l < qend; ++l) {
+ // When a ZQ tag is present its values are applied as an offset
+ // (relative to 64) to QUAL.  The result is clamped to [7, 30].
+ int qval = bq? qual[l] + (bq[l] - 64) : qual[l];
+ if (qval > 30)
+ qval = 30;
+ if (qval < 7)
+ qval = 7;
+ qq[l - qbeg] = qval;
+ }
+
+ // The bottom 8 bits are length-normalised score while
+ // the top bits are unnormalised.
+ sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type,
+ query, qend - qbeg, qq, &apf, 0, 0);
+ if (sc < 0) {
+ // Store the same large placeholder cost the caller uses for reads
+ // it cannot align; this is not treated as an error.
+ *score = 0xffffff;
+ free(qq);
+ return 0;
+ }
+
+ // used for adjusting indelQ below
+ l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias;
+ *score = sc<<8 | MIN(255, l);
+
+ rep_ele *reps, *elt, *tmp;
+ uint8_t *seg = ref2 + tbeg - left;
+ int seg_len = tend - tbeg + type;
+
+ // Note: although seg moves (tbeg varies), ref2 is reused many times
+ // so we could factor out some find_STR calls. However it's not the
+ // bottleneck for now.
+
+ // FIXME: need to make this work on IUPAC.
+ reps = find_STR((char *)seg, seg_len, 0);
+ int iscore = 0;
+
+ // Identify STRs in ref covering the indel up to
+ // (or close to) the end of the sequence.
+ // Those having an indel and right at the sequence
+ // end do not confirm the total length of indel
+ // size. Specifically a *lack* of indel at the
+ // end, where we know indels occur in other
+ // sequences, is a possible reference bias.
+ //
+ // This is emphasised further if the sequence ends with
+ // soft clipping.
+ //
+ // NOTE(review): elt->start/end presumably index into seg while qpos
+ // is supplied by the caller -- confirm both use the same coordinates.
+ // Each list element is unlinked and freed as it is visited.
+ DL_FOREACH_SAFE(reps, elt, tmp) {
+ if (elt->start <= qpos && elt->end >= qpos) {
+ iscore += (elt->end-elt->start) / elt->rep_len; // c
+ // Extra penalty when the repeat reaches either end of the read.
+ if (elt->start+tbeg <= r_start ||
+ elt->end+tbeg >= r_end)
+ iscore += 2*(elt->end-elt->start);
+ }
+
+ DL_DELETE(reps, elt);
+ free(elt);
+ }
+
+ // Apply STR score to existing indelQ
+ // Only the low 8 bits are rewritten; the raw score above is kept.
+ l = (*score&0xff)*.8 + iscore*2;
+ *score = (*score & ~0xff) | MIN(255, l);
+
+ free(qq);
+
+ return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Converts the per-read, per-type alignment scores into indel and
+// sequence qualities, stores them in each read's p->aux, and selects up
+// to four indel types (reference type first) into bca->indel_types and
+// bca->inscns.
+//
+// score holds n_types consecutive entries per read, reads being ordered
+// by sample and then by pileup index.
+//
+// Returns n_alt on success (the number of reads whose final call index
+//         in p->aux is non-zero)
+//         -1 on failure
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  int *score) {
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+
+    // sc[] and sumq[] are fixed-size; refuse anything that cannot fit.
+    if (n_types < 0 || n_types > MAX_TYPES)
+        return -1;
+
+    memset(sumq, 0, n_types * sizeof(int));
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            int *sct = &score[K*n_types], seqQ, indelQ;
+
+            // Pack the type index into the low 6 bits so that sorting by
+            // score keeps track of which type each score belongs to.
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            if ((sc[0]&0x3f) == ref_type) {
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+            } else {
+                for (t = 0; t < n_types; ++t) // look for the reference type
+                    if ((sc[t]&0x3f) == ref_type) break;
+                indelQ = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+            }
+            tmp = sc[0]>>6 & 0xff;
+            // reduce indelQ
+            indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // effect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (seqQ > 255) seqQ = 255;
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+            sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+        }
+    }
+
+    // determine bca->indel_types[] and bca->inscns
+    if (max_ins) {
+        // Resize through a temporary so that the existing buffer is not
+        // lost (leaked) if realloc() fails.
+        char *new_inscns = (char*) realloc(bca->inscns, max_ins * 4);
+        if (!new_inscns)
+            return -1;
+        bca->inscns = new_inscns;
+    } else {
+        // realloc(ptr, 0) is implementation-defined; release explicitly.
+        free(bca->inscns);
+        bca->inscns = NULL;
+    }
+    bca->maxins = max_ins;
+
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort, largest sum first
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+        if (bca->maxins)
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+
+    // update p->aux: replace the index into types[] with the index into
+    // bca->indel_types[] (4 when the type was not selected).
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with a high number of samples, do we handle IMF correctly?  Is it the
+fraction of indels across the entire data set, or just the fraction for this
+specific sample?  Need to check the bca->per_sample_flt (--per-sample-mF) option.
+ */
+
/*
notes:
- - n .. number of samples
- - the routine sets bam_pileup1_t.aux of each read as follows:
- - 6: unused
- - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
- - 8: estimated sequence quality .. (aux>>8)&0xff
- - 8: indel quality .. aux&0xff
+ - n .. number of samples
+ - the routine sets bam_pileup1_t.aux of each read as follows:
+ - 6: unused
+ - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
+ - 8: estimated sequence quality .. (aux>>8)&0xff
+ - 8: indel quality .. aux&0xff
*/
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+ bcf_callaux_t *bca, const char *ref)
{
- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+ if (ref == 0 || bca == 0) return -1;
+
+ int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins;
+ int *score, max_ref2;
int N, K, l_run, ref_type, n_alt;
char *inscns = 0, *ref2, *query, **ref_sample;
- if (ref == 0 || bca == 0) return -1;
// determine if there is a gap
for (s = N = 0; s < n; ++s) {
if (plp[s][i].indel != 0) break;
if (i < n_plp[s]) break;
}
- if (s == n) return -1; // there is no indel at this position.
- for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
- { // find out how many types of indels are present
- bca->max_support = bca->max_frac = 0;
- int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
- uint32_t *aux;
- aux = (uint32_t*) calloc(N + 1, 4);
- m = max_rd_len = 0;
- aux[m++] = MINUS_CONST; // zero indel is always a type
- for (s = 0; s < n; ++s) {
- int na = 0, nt = 0;
- for (i = 0; i < n_plp[s]; ++i) {
- const bam_pileup1_t *p = plp[s] + i;
- ++nt;
- if (p->indel != 0) {
- ++na;
- aux[m++] = MINUS_CONST + p->indel;
- }
- j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
- if (j > max_rd_len) max_rd_len = j;
- }
- double frac = (double)na/nt;
- if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
- indel_support_ok = 1;
- if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
- n_alt += na;
- n_tot += nt;
- }
- // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
- // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
- int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
- if ( nN*2>(i-pos) ) { free(aux); return -1; }
-
- ks_introsort(uint32_t, m, aux);
- // squeeze out identical types
- for (i = 1, n_types = 1; i < m; ++i)
- if (aux[i] != aux[i-1]) ++n_types;
- // Taking totals makes it hard to call rare indels
- if ( !bca->per_sample_flt )
- indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
- if ( n_types == 1 || !indel_support_ok ) { // then skip
- free(aux); return -1;
- }
- if (n_types >= 64) {
- free(aux);
- // TODO revisit how/whether to control printing this warning
- if (hts_verbose >= 2)
- fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
- return -1;
- }
- types = (int*)calloc(n_types, sizeof(int));
- t = 0;
- types[t++] = aux[0] - MINUS_CONST;
- for (i = 1; i < m; ++i)
- if (aux[i] != aux[i-1])
- types[t++] = aux[i] - MINUS_CONST;
- free(aux);
- for (t = 0; t < n_types; ++t)
- if (types[t] == 0) break;
- ref_type = t; // the index of the reference type (0)
- }
- { // calculate left and right boundary
- left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
- right = pos + INDEL_WINDOW_SIZE;
- if (types[0] < 0) right -= types[0];
- // in case the alignments stand out the reference
- for (i = pos; i < right; ++i)
- if (ref[i] == 0) break;
- right = i;
- }
- /* The following block fixes a long-existing flaw in the INDEL
+ if (s == n)
+ // there is no indel at this position.
+ return -1;
+
+ // find out how many types of indels are present
+ types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+ &max_rd_len, &n_types, &ref_type, &N);
+ if (!types)
+ return -1;
+
+
+ // calculate left and right boundary
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
+ if (types[0] < 0) right -= types[0];
+
+ // in case the alignments stand out the reference
+ for (i = pos; i < right; ++i)
+ if (ref[i] == 0) break;
+ right = i;
+
+
+ /* The following call fixes a long-existing flaw in the INDEL
* calling model: the interference of nearby SNPs. However, it also
* reduces the power because sometimes, substitutions caused by
* indels are not distinguishable from true mutations. Multiple
*
* Masks mismatches present in at least 70% of the reads with 'N'.
*/
- { // construct per-sample consensus
- int L = right - left + 1, max_i, max2_i;
- uint32_t *cns, max, max2;
- char *ref0, *r;
- ref_sample = (char**) calloc(n, sizeof(char*));
- cns = (uint32_t*) calloc(L, 4);
- ref0 = (char*) calloc(L, 1);
- for (i = 0; i < right - left; ++i)
- ref0[i] = seq_nt16_table[(int)ref[i+left]];
- for (s = 0; s < n; ++s) {
- r = ref_sample[s] = (char*) calloc(L, 1);
- memset(cns, 0, sizeof(int) * L);
- // collect ref and non-ref counts
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- bam1_t *b = p->b;
- uint32_t *cigar = bam_get_cigar(b);
- uint8_t *seq = bam_get_seq(b);
- int x = b->core.pos, y = 0;
- for (k = 0; k < b->core.n_cigar; ++k) {
- int op = cigar[k]&0xf;
- int j, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j)
- if (x + j >= left && x + j < right)
- cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- }
- }
- // determine the consensus
- for (i = 0; i < right - left; ++i) r[i] = ref0[i];
- max = max2 = 0; max_i = max2_i = -1;
- for (i = 0; i < right - left; ++i) {
- if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
- else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
- }
- if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
- if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
- if (max_i >= 0) r[max_i] = 15;
- if (max2_i >= 0) r[max2_i] = 15;
- //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr); fputc('\n', bcftools_stderr);
- }
- free(ref0); free(cns);
- }
- { // the length of the homopolymer run around the current position
- int c = seq_nt16_table[(int)ref[pos + 1]];
- if (c == 15) l_run = 1;
- else {
- for (i = pos + 2; ref[i]; ++i)
- if (seq_nt16_table[(int)ref[i]] != c) break;
- l_run = i;
- for (i = pos; i >= 0; --i)
- if (seq_nt16_table[(int)ref[i]] != c) break;
- l_run -= i + 1;
- }
- }
- // construct the consensus sequence
+ ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right);
+
+ // The length of the homopolymer run around the current position
+ l_run = bcf_cgp_l_run(ref, pos);
+
+ // construct the consensus sequence (minus indels, which are added later)
max_ins = types[n_types - 1]; // max_ins is at least 0
if (max_ins > 0) {
- int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
- // count the number of occurrences of each base at each position for each type of insertion
- for (t = 0; t < n_types; ++t) {
- if (types[t] > 0) {
- for (s = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- if (p->indel == types[t]) {
- uint8_t *seq = bam_get_seq(p->b);
- for (k = 1; k <= p->indel; ++k) {
- int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
- assert(c<5);
- ++inscns_aux[(t*max_ins+(k-1))*5 + c];
- }
- }
- }
- }
- }
- }
- // use the majority rule to construct the consensus
- inscns = (char*) calloc(n_types * max_ins, 1);
- for (t = 0; t < n_types; ++t) {
- for (j = 0; j < types[t]; ++j) {
- int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
- for (k = 0; k < 5; ++k)
- if (ia[k] > max)
- max = ia[k], max_k = k;
- inscns[t*max_ins + j] = max? max_k : 4;
- if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
- }
- }
- free(inscns_aux);
+ inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos,
+ types, n_types, max_ins, s);
+ if (!inscns)
+ return -1;
}
+
// compute the likelihood given each type of indel for each read
max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
ref2 = (char*) calloc(max_ref2, 1);
query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
- score1 = (int*) calloc(N * n_types, sizeof(int));
- score2 = (int*) calloc(N * n_types, sizeof(int));
+ score = (int*) calloc(N * n_types, sizeof(int));
bca->indelreg = 0;
+ double nqual_over_60 = bca->nqual / 60.0;
+
for (t = 0; t < n_types; ++t) {
int l, ir;
- probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
- apf1.bw = apf2.bw = abs(types[t]) + 3;
+
// compute indelreg
- if (types[t] == 0) ir = 0;
- else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
- else ir = est_indelreg(pos, ref, -types[t], 0);
- if (ir > bca->indelreg) bca->indelreg = ir;
-// fprintf(bcftools_stderr, "%d, %d, %d\n", pos, types[t], ir);
- // realignment
+ if (types[t] == 0)
+ ir = 0;
+ else if (types[t] > 0)
+ ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+ else
+ ir = est_indelreg(pos, ref, -types[t], 0);
+
+ if (ir > bca->indelreg)
+ bca->indelreg = ir;
+
+ // Identify max deletion length
+ int max_deletion = 0;
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ if (max_deletion < -p->indel)
+ max_deletion = -p->indel;
+ }
+ }
+
+ // Realignment score, computed via BAQ
for (s = K = 0; s < n; ++s) {
- // write ref2
+ // Construct ref2 from ref_sample, inscns and indels.
+ // This is now the true sample consensus (possibly prepended
+ // and appended with reference if sample data doesn't span
+ // the full length).
for (k = 0, j = left; j <= pos; ++j)
ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
- if (types[t] <= 0) j += -types[t];
- else for (l = 0; l < types[t]; ++l)
- ref2[k++] = inscns[t*max_ins + l];
+
+ if (types[t] <= 0)
+ j += -types[t];
+ else
+ for (l = 0; l < types[t]; ++l)
+ ref2[k++] = inscns[t*max_ins + l];
+
for (; j < right && ref[j]; ++j)
ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
- for (; k < max_ref2; ++k) ref2[k] = 4;
- if (j < right) right = j;
+ for (; k < max_ref2; ++k)
+ ref2[k] = 4;
+
+ if (right > j)
+ right = j;
+
// align each read to ref2
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
- int qbeg, qend, tbeg, tend, sc, kk;
+
+ // Some basic ref vs alt stats.
+ int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+ imq *= nqual_over_60;
+
+ int sc_len, slen, epos, sc_end;
+
+ // Only need to gather stats on one type, as it's
+ // identical calculation for all the subsequent ones
+ // and we're sharing the same stats array
+ if (t == 0) {
+ // Gather stats for INFO field to aid filtering.
+ // mq and sc_len not very helpful for filtering, but could
+ // help in assigning a better QUAL value.
+ //
+ // Pos is slightly useful.
+ // Base qual can be useful, but need qual prior to BAQ?
+ // May need to cache orig quals in aux tag so we can fetch
+ // them even after mpileup step.
+ get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+ assert(imq >= 0 && imq < bca->nqual);
+ assert(epos >= 0 && epos < bca->npos);
+ assert(sc_len >= 0 && sc_len < 100);
+ if (p->indel) {
+ bca->ialt_mq[imq]++;
+ bca->ialt_scl[sc_len]++;
+ bca->ialt_pos[epos]++;
+ } else {
+ bca->iref_mq[imq]++;
+ bca->iref_scl[sc_len]++;
+ bca->iref_pos[epos]++;
+ }
+ }
+
+ int qbeg, qpos, qend, tbeg, tend, kk;
uint8_t *seq = bam_get_seq(p->b);
uint32_t *cigar = bam_get_cigar(p->b);
- if (p->b->core.flag&4) continue; // unmapped reads
- // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+ if (p->b->core.flag & BAM_FUNMAP) continue;
+
+ // FIXME: the following loop should be better moved outside;
+ // nonetheless, realignment should be much slower anyway.
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
- if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
- if (kk < p->b->core.n_cigar) continue;
- // FIXME: the following skips soft clips, but using them may be more sensitive.
+ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+ break;
+ if (kk < p->b->core.n_cigar)
+ continue;
+
// determine the start and end of sequences for alignment
- qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
- qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+ // FIXME: loops over CIGAR multiple times
+ int left2 = left, right2 = right;
+ if (p->b->core.l_qseq > 1000) {
+ // long read data needs less context. It also tends to
+ // have many more candidate indels to investigate so
+ // speed here matters more.
+ if (pos - left >= INDEL_WINDOW_SIZE)
+ left2 += INDEL_WINDOW_SIZE/2;
+ if (right-pos >= INDEL_WINDOW_SIZE)
+ right2 -= INDEL_WINDOW_SIZE/2;
+ }
+
+ int r_start = p->b->core.pos;
+ int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+ bam_get_cigar(p->b))
+ -1 + r_start;
+
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2,
+ 0, &tbeg);
+ qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+ 0, &tend) - qbeg;
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2,
+ 1, &tend);
+
if (types[t] < 0) {
int l = -types[t];
tbeg = tbeg - l > left? tbeg - l : left;
}
+
// write the query sequence
for (l = qbeg; l < qend; ++l)
query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
- { // do realignment; this is the bottleneck
- const uint8_t *qual = bam_get_qual(p->b), *bq;
- uint8_t *qq;
- qq = (uint8_t*) calloc(qend - qbeg, 1);
- bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
- if (bq) ++bq; // skip type
- for (l = qbeg; l < qend; ++l) {
- qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
- if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
- if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
- }
- sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
- if (l > 255) l = 255;
- score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
- if (sc > 5) {
- sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
- l = (int)(100. * sc / (qend - qbeg) + .499);
- if (l > 255) l = 255;
- score2[K*n_types + t] = sc<<8 | l;
+
+ // A fudge for now. Consider checking SAM header for
+ // RG platform field.
+ int long_read = p->b->core.l_qseq > 1000;
+
+ // do realignment; this is the bottleneck
+ if (tend > tbeg) {
+ if (bcf_cgp_align_score(p, bca, types[t],
+ (uint8_t *)ref2 + left2-left,
+ (uint8_t *)query,
+ r_start, r_end, long_read,
+ tbeg, tend, left2, right2,
+ qbeg, qend, qpos, max_deletion,
+ &score[K*n_types + t]) < 0) {
+ score[K*n_types + t] = 0xffffff;
+ return -1;
}
- free(qq);
+ } else {
+ // place holder large cost for reads that cover the
+ // region entirely within a deletion (thus tend < tbeg).
+ score[K*n_types + t] = 0xffffff;
}
#if 0
for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
fputc("ACGTN"[(int)ref2[tbeg-left+l]], bcftools_stderr);
fputc('\n', bcftools_stderr);
- for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], bcftools_stderr);
+ for (l = 0; l < qend - qbeg; ++l)
+ fputc("ACGTN"[(int)query[l]], bcftools_stderr);
fputc('\n', bcftools_stderr);
- fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+ fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s "
+ "qbeg=%d tbeg=%d score=%d\n",
+ pos, types[t], s, i, bam_get_qname(p->b),
+ qbeg, tbeg, sc);
#endif
}
}
}
- free(ref2); free(query);
- { // compute indelQ
- int sc_a[16], sumq_a[16];
- int tmp, *sc = sc_a, *sumq = sumq_a;
- if (n_types > 16) {
- sc = (int *)malloc(n_types * sizeof(int));
- sumq = (int *)malloc(n_types * sizeof(int));
- }
- memset(sumq, 0, n_types * sizeof(int));
- for (s = K = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i, ++K) {
- bam_pileup1_t *p = plp[s] + i;
- int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- /* errmod_cal() assumes that if the call is wrong, the
- * likelihoods of other events are equal. This is about
- * right for substitutions, but is not desired for
- * indels. To reuse errmod_cal(), I have to make
- * compromise for multi-allelic indels.
- */
- if ((sc[0]&0x3f) == ref_type) {
- indelQ1 = (sc[1]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ1 = (sc[t]>>14) - (sc[0]>>14);
- seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
- sct = &score2[K*n_types];
- for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
- tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
- if ((sc[0]&0x3f) == ref_type) {
- indelQ2 = (sc[1]>>14) - (sc[0]>>14);
- } else {
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sc[t]&0x3f) == ref_type) break;
- indelQ2 = (sc[t]>>14) - (sc[0]>>14);
- }
- tmp = sc[0]>>6 & 0xff;
- indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
- // pick the smaller between indelQ1 and indelQ2
- indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
- if (indelQ > 255) indelQ = 255;
- if (seqQ > 255) seqQ = 255;
- p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
- sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-// fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
- }
- }
- // determine bca->indel_types[] and bca->inscns
- bca->maxins = max_ins;
- bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
- for (t = 0; t < n_types; ++t)
- sumq[t] = sumq[t]<<6 | t;
- for (t = 1; t < n_types; ++t) // insertion sort
- for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
- tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
- for (t = 0; t < n_types; ++t) // look for the reference type
- if ((sumq[t]&0x3f) == ref_type) break;
- if (t) { // then move the reference type to the first
- tmp = sumq[t];
- for (; t > 0; --t) sumq[t] = sumq[t-1];
- sumq[0] = tmp;
- }
- for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
- for (t = 0; t < 4 && t < n_types; ++t) {
- bca->indel_types[t] = types[sumq[t]&0x3f];
- memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
- }
- // update p->aux
- for (s = n_alt = 0; s < n; ++s) {
- for (i = 0; i < n_plp[s]; ++i) {
- bam_pileup1_t *p = plp[s] + i;
- int x = types[p->aux>>16&0x3f];
- for (j = 0; j < 4; ++j)
- if (x == bca->indel_types[j]) break;
- p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
- if ((p->aux>>16&0x3f) > 0) ++n_alt;
- //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
- }
- }
- if (sc != sc_a) free(sc);
- if (sumq != sumq_a) free(sumq);
- }
- free(score1); free(score2);
+ // compute indelQ
+ n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+ ref_type, types, n_types, score);
+
// free
- for (i = 0; i < n; ++i) free(ref_sample[i]);
+ free(ref2);
+ free(query);
+ free(score);
+
+ for (i = 0; i < n; ++i)
+ free(ref_sample[i]);
+
free(ref_sample);
free(types); free(inscns);
+
return n_alt > 0? 0 : -1;
}
/* bcftools.h -- utility function declarations.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
+const char *hts_bcf_wmode2(int file_type, char *fname);
+char *init_tmp_prefix(const char *prefix);
void *smalloc(size_t size); // safe malloc
-static inline char gt2iupac(char a, char b)
+// Map an IUPAC nucleotide code to a 4-bit mask in which A=1, C=2, G=4 and
+// T=8; ambiguity codes yield the OR of the bases they stand for.
+// Returns -1 for any character that is not one of the codes tested below.
+static inline int iupac2bitmask(char iupac)
 {
- static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
- if ( a>='a' ) a -= 'a' - 'A';
- if ( b>='a' ) b -= 'a' - 'A';
- if ( a=='A' ) a = 0;
- else if ( a=='C' ) a = 1;
- else if ( a=='G' ) a = 2;
- else if ( a=='T' ) a = 3;
- else return 'N';
- if ( b=='A' ) b = 0;
- else if ( b=='C' ) b = 1;
- else if ( b=='G' ) b = 2;
- else if ( b=='T' ) b = 3;
- else return 'N';
- return iupac[(int)a][(int)b];
+ const int A = 1;
+ const int C = 2;
+ const int G = 4;
+ const int T = 8;
+ // In ASCII 97 is 'a' and 32 is 'a' - 'A', so lower-case letters are
+ // folded to upper case before the comparisons below.
+ if ( iupac >= 97 ) iupac -= 32;
+ if ( iupac == 'A' ) return A;
+ if ( iupac == 'C' ) return C;
+ if ( iupac == 'G' ) return G;
+ if ( iupac == 'T' ) return T;
+ if ( iupac == 'M' ) return A|C;
+ if ( iupac == 'R' ) return A|G;
+ if ( iupac == 'W' ) return A|T;
+ if ( iupac == 'S' ) return C|G;
+ if ( iupac == 'Y' ) return C|T;
+ if ( iupac == 'K' ) return G|T;
+ if ( iupac == 'V' ) return A|C|G;
+ if ( iupac == 'H' ) return A|C|T;
+ if ( iupac == 'D' ) return A|G|T;
+ if ( iupac == 'B' ) return C|G|T;
+ if ( iupac == 'N' ) return A|C|G|T;
+ return -1;
+}
+// Translate a base bitmask in the range 1..15 (A=1, C=2, G=4, T=8, OR-ed
+// together) into the corresponding IUPAC letter.
+// Returns 0 for any mask outside that range.
+static inline char bitmask2iupac(int bitmask)
+{
+    // Slot 0 ('.') only pads the table; it is never handed back.
+    static const char codes[] = ".ACMGRSVTWYHKDBN";
+    return (bitmask >= 1 && bitmask <= 15) ? codes[bitmask] : 0;
 }
static inline int iupac_consistent(char iupac, char nt)
return prob>99 ? 99 : prob;
}
+static const uint64_t bcf_double_missing = 0x7ff0000000000001;
+static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
+// Write the raw 64-bit pattern `value` into the double pointed to by `ptr`.
+static inline void bcf_double_set(double *ptr, uint64_t value)
+{
+    // Go through a union so the bits are transferred verbatim rather
+    // than being converted numerically.
+    union { uint64_t bits; double num; } pun;
+    pun.bits = value;
+    *ptr = pun.num;
+}
+// Report whether the bit pattern of `d` is exactly `value`.
+// Returns 1 on a match, 0 otherwise.
+static inline int bcf_double_test(double d, uint64_t value)
+{
+    // Compare raw bits, not numeric values.
+    union { uint64_t bits; double num; } pun;
+    pun.num = d;
+    if (pun.bits == value)
+        return 1;
+    return 0;
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
+#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
+#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
+#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
+#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
+
#endif
#include <ctype.h>
#include <assert.h>
#include <unistd.h>
+#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
return putc('\n', bcftools_stdout);
}
+
+static jmp_buf bcftools_jmpbuf;
+static int bcftools_status = 0;
+
+// Run bcftools_main() with a recovery point in place, so that a call to
+// bcftools_exit() made while it runs comes back here instead of ending
+// the process.
+//
+// Returns the value returned by bcftools_main(), or the status recorded
+// by bcftools_exit() when control arrives via longjmp().
+int bcftools_dispatch(int argc, char *argv[])
+{
+    // setjmp() yields 0 when called directly and non-zero when resumed
+    // by the longjmp() in bcftools_exit().
+    if (setjmp(bcftools_jmpbuf) != 0)
+        return bcftools_status;
+
+    return bcftools_main(argc, argv);
+}
+
+// Record `status` in bcftools_status and jump back to the setjmp() point
+// established in bcftools_dispatch(), which then returns that status.
+// Control never returns to the caller of this function.
+void bcftools_exit(int status)
+{
+ bcftools_status = status;
+ // The value 1 makes the setjmp() in bcftools_dispatch() evaluate non-zero.
+ longjmp(bcftools_jmpbuf, 1);
+}
+
+
void bcftools_set_optind(int val)
{
// setting this in cython via
#include <stdio.h>
+#ifndef __has_attribute
+#define __has_attribute(attribute) 0
+#endif
+#ifndef PYSAM_NORETURN
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN
+#endif
+#endif
+
extern FILE * bcftools_stderr;
extern FILE * bcftools_stdout;
int bcftools_dispatch(int argc, char *argv[]);
+void PYSAM_NORETURN bcftools_exit(int status);
+
void bcftools_set_optind(int);
extern int bcftools_main(int argc, char *argv[]);
*/
#include <stdio.h>
+#include <assert.h>
#include "bcftools.h"
#include "bin.h"
*/
#include <stdio.h>
+#include <assert.h>
#include "bcftools.h"
#include "bin.h"
/* call.h -- variant calling declarations.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015, 2019-2020 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#define CALL_CONSTR_TRIO (1<<2)
#define CALL_CONSTR_ALLELES (1<<3)
//
-//
+#define CALL_FMT_PV4 (1<<5)
#define CALL_FMT_GQ (1<<6)
#define CALL_FMT_GP (1<<7)
// For the single-sample and grouped -G calling
typedef struct
{
+ double ref_lk, max_lk, lk_sum;
float *qsum; // QS(quality sum) values
- int nqsum, dp;
- double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc;
-}
-grp1_t;
-typedef struct
-{
- grp1_t *grp;
- int ngrp;
- int *smpl2grp;
+ int nqsum;
+ uint32_t *smpl, nsmpl;
+ uint32_t nals, als;
}
-grp_t;
+smpl_grp_t;
// For the `-C alleles -i` constrained calling
typedef struct
int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old)
char **als; // array to hold the trimmed set of alleles to appear on output
int nals; // size of the als array
+ int als_new, nals_new; // bitmask with final alleles and their number
family_t *fams; // list of families and samples for trio calling
int nfams, mfams;
int ntrio[5][5]; // possible trio genotype combinations and their counts; first idx:
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)
- tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES
- char *sample_groups; // for single-sample or grouped calling with -G
- grp_t smpl_grp;
- float *qsum;
- int nqsum;
+ tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES
+ char *sample_groups; // for single-sample or grouped calling with -G
+ char *sample_groups_tag; // for -G [AD|QS:]
+ smpl_grp_t *smpl_grp;
+ int nsmpl_grp;
// ccall only
double indel_frac, min_perm_p, min_lrt;
double prior_type, pref;
- double ref_lk, lk_sum;
int ngrp1_samples, n_perm;
- int nhets, ndiploid;
char *prior_file;
ccall_t *cdat;
void call_init_pl2p(call_t *call);
uint32_t *call_trio_prep(int is_x, int is_son);
-void init_allele_trimming_maps(call_t *call, int als, int nals);
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als);
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out);
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new);
#endif
THE SOFTWARE. */
#include <math.h>
+#include <assert.h>
#include <htslib/kfunc.h>
#include "call.h"
#include "kmin.h"
// trim Number=R tags
int out_als = 0;
for (i=0; i<nals; i++) out_als |= 1<<i;
- init_allele_trimming_maps(call, out_als, nals_ori);
- mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+ init_allele_trimming_maps(call, nals_ori, out_als);
+ mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
return is_var;
}
THE SOFTWARE. */
#include <math.h>
+#include <assert.h>
#include <htslib/kfunc.h>
#include "call.h"
#include "kmin.h"
// trim Number=R tags
int out_als = 0;
for (i=0; i<nals; i++) out_als |= 1<<i;
- init_allele_trimming_maps(call, out_als, nals_ori);
- mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+ init_allele_trimming_maps(call, nals_ori, out_als);
+ mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
return is_var;
}
/* The MIT License
- Copyright (c) 2014-2017 Genome Research Ltd.
+ Copyright (c) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#define PICK_SHORT 8
#define PICK_IUPAC 16
+#define TO_UPPER 0
+#define TO_LOWER 1
+
typedef struct
{
int num; // number of ungapped blocks in this chain
}
chain_t;
+#define MASK_LC 1   // mask_t.with value: change the masked regions to lowercase
+#define MASK_UC 2   // mask_t.with value: change the masked regions to uppercase
+#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)   // 1 when the mask replaces bases with a character (variants overlapping such a mask are skipped), 0 for the case-changing masks
+typedef struct   // one -m/--mask file together with its --mask-with action
+{
+ char *fname, with;   // mask file name; action: MASK_LC, MASK_UC, or the replacement character ('N' unless --mask-with is given)
+ regidx_t *idx;   // region index initialized from fname
+ regitr_t *itr;   // iterator initialized on idx, used when querying overlaps
+}
+mask_t;
typedef struct
{
int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+ int fa_frz_mod; // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
int fa_end_pos; // region's end position in the original sequence
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
- int fa_case; // output upper case or lower case?
+ int fa_case; // output upper case or lower case: TO_UPPER|TO_LOWER
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
int nvcf_buf, rid;
char *chr, *chr_prefix;
- regidx_t *mask;
- regitr_t *itr;
+ mask_t *mask;
+ int nmask;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, allele, isample, napplied;
- char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
+ uint8_t *iupac_bitmask;
+ int miupac_bitmask;
+ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+ char mark_del, mark_ins, mark_snv;
}
args_t;
// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
- if (ref_start <= chain->ref_last_block_ori) {
+ if (num && ref_start <= chain->ref_last_block_ori) {
// In case this variant is back-to-back with the previous one
chain->ref_last_block_ori = ref_start + ref_len;
chain->alt_last_block_ori = alt_start + alt_len;
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
}
- if ( args->mask_fname )
+ int i;
+ for (i=0; i<args->nmask; i++)
{
- args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
- if ( !args->mask ) error("Failed to initialize mask regions\n");
- args->itr = regitr_init(args->mask);
+ mask_t *mask = &args->mask[i];
+ mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
+ if ( !mask->idx ) error("Failed to initialize mask regions\n");
+ mask->itr = regitr_init(mask->idx);
}
// In case we want to store the chains
if ( args->chain_fname )
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
+ args->rid = -1;
+}
+// Append a new entry for the mask file `fname` (the -m/--mask option) to args->mask.
+// The entry defaults to replacing the masked regions with 'N'; a following
+// --mask-with option can change that. The region index and iterator are created
+// later from `fname`, until then they are kept NULL.
+static void add_mask(args_t *args, char *fname)
+{
+    // Grow through a temporary so that a failed realloc is reported rather than dereferenced
+    mask_t *tmp = (mask_t*)realloc(args->mask,(args->nmask+1)*sizeof(*args->mask));
+    if ( !tmp ) error("Failed to allocate memory for the mask %s\n", fname);
+    args->mask = tmp;
+
+    mask_t *mask = &args->mask[args->nmask++];
+    memset(mask,0,sizeof(*mask));   // realloc does not zero the new element: make idx and itr well-defined
+    mask->fname = fname;
+    mask->with = 'N';
+}
+// Set the action of the most recently added --mask file: "uc" or "lc" (matched
+// case-insensitively) request a case change, any other single character becomes
+// the replacement character. Anything else is rejected with an error.
+static void add_mask_with(args_t *args, char *with)
+{
+    if ( args->nmask==0 ) error("The --mask-with option must follow --mask\n");
+
+    mask_t *last = args->mask + (args->nmask - 1);
+    const int is_uc = strcasecmp(with,"uc")==0;
+    const int is_lc = strcasecmp(with,"lc")==0;
+
+    if ( is_uc )
+        last->with = MASK_UC;
+    else if ( is_lc )
+        last->with = MASK_LC;
+    else if ( strlen(with)==1 )
+        last->with = with[0];
+    else
+        error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");
}
-
static void destroy_data(args_t *args)
{
+ free(args->iupac_bitmask);
if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
- if ( args->mask ) regidx_destroy(args->mask);
- if ( args->itr ) regitr_destroy(args->itr);
+ for (i=0; i<args->nmask; i++)
+ {
+ mask_t *mask = &args->mask[i];
+ regidx_destroy(mask->idx);
+ regitr_destroy(mask->itr);
+ }
+ free(args->mask);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
args->fa_src_pos = from;
args->fa_mod_off = 0;
args->fa_frz_pos = -1;
+ args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;
bcf_sr_seek(args->files,line,args->fa_ori_pos);
static void flush_fa_buffer(args_t *args, int len)
{
if ( !args->fa_buf.l ) return;
-
int nwr = 0;
while ( nwr + 60 <= args->fa_buf.l )
{
if ( nwr )
args->fa_ori_pos += nwr;
+ args->fa_frz_mod -= nwr;
+
if ( len )
{
// not finished on this chr yet and the buffer cannot be emptied completely
args->fa_mod_off = 0;
args->fa_buf.l = 0;
}
+static void apply_absent(args_t *args, hts_pos_t pos)   // overwrite buffered bases from the frozen offset up to `pos` with args->absent_allele
+{
+ if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return;   // nothing buffered, or pos is not past both the frozen position and the buffer start
+
+ int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l;   // exclusive end: the buffer offset of pos when it lies inside the buffer, otherwise the buffer length
+ int ib = args->fa_frz_mod < 0 ? 0 : args->fa_frz_mod;   // start: the frozen buffer offset, or the buffer start when it is negative
+ int i;
+ for (i=ib; i<ie; i++)
+ args->fa_buf.s[i] = args->absent_allele;
+}
+// Advance the protected (frozen) position to the last reference base covered by
+// `rec`. The call is a no-op when that base is already frozen.
+static void freeze_ref(args_t *args, bcf1_t *rec)
+{
+    if ( args->fa_frz_pos < rec->pos + rec->rlen - 1 )
+    {
+        // last reference coordinate spanned by the record
+        args->fa_frz_pos = rec->pos + rec->rlen - 1;
+
+        // the corresponding fa_buf offset, adjusted by fa_mod_off
+        args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;
+    }
+}
+// Build the replacement string for a deletion that is to be marked rather than
+// removed. The result is a newly allocated string of exactly `rlen` characters:
+// the leading bases are copied from `alt`, or from `ref` when `alt` is NULL
+// (symbolic <DEL>), and the remainder is filled with the `mark` character.
+// The caller is responsible for freeing the returned string.
+static char *mark_del(char *ref, int rlen, char *alt, int mark)
+{
+    if ( rlen < 0 ) rlen = 0;
+    char *out = malloc(rlen+1);
+
+    // The copied prefix must never exceed rlen: the caller can shorten rlen at the
+    // end of the buffer while the allele string keeps its full length
+    const char *src = alt ? alt : ref;
+    int ncopy = strlen(src);
+    if ( ncopy > rlen ) ncopy = rlen;
+
+    memcpy(out, src, ncopy);
+    memset(out+ncopy, mark, rlen-ncopy);
+    out[rlen] = 0;
+    return out;
+}
+// Highlight the inserted part of `alt` in place: the bases beyond the length of
+// `ref` are converted to lowercase when `mark` is 'l' and to uppercase otherwise.
+// Nothing is changed when `alt` is not longer than `ref`.
+static void mark_ins(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    for (i=nref; i<nalt; i++)
+    {
+        // the ctype functions are only defined for values representable as unsigned char
+        unsigned char c = (unsigned char) alt[i];
+        alt[i] = mark=='l' ? tolower(c) : toupper(c);
+    }
+}
+// Highlight substitutions in `alt` in place: over the common length of `ref` and
+// `alt`, every base that differs from the reference (compared case-insensitively)
+// is converted to lowercase when `mark` is 'l' and to uppercase otherwise.
+// Matching bases are left as they are.
+static void mark_snv(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    int n = nref < nalt ? nref : nalt;
+    for (i=0; i<n; i++)
+    {
+        // the ctype functions are only defined for values representable as unsigned char
+        unsigned char r = (unsigned char) ref[i];
+        unsigned char a = (unsigned char) alt[i];
+        if ( tolower(r)==tolower(a) ) continue;
+        alt[i] = mark=='l' ? tolower(a) : toupper(a);
+    }
+}
static void apply_variant(args_t *args, bcf1_t *rec)
{
static int warned_haplotype = 0;
- if ( rec->n_allele==1 && !args->missing_allele ) return;
+ if ( args->absent_allele ) apply_absent(args, rec->pos);
+ if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
+ int i,j;
if ( args->mask )
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = rec->pos;
int end = rec->pos + rec->rlen - 1;
- if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+ for (i=0; i<args->nmask; i++)
+ {
+ mask_t *mask = &args->mask[i];
+ if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
+ }
}
- int i, ialt = 1; // the alternate allele
+ int ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
enum { use_hap, use_iupac, pick_one } action = use_hap;
if ( args->allele==PICK_IUPAC )
{
+ if ( !args->haplotype ) action = use_iupac;
if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
}
else if ( args->output_iupac ) action = use_iupac;
}
else if ( action==use_iupac )
{
- ialt = ptr[0];
- if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
+ ialt = -1;
+ int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
+ for (i=0; i<fmt->n; i++)
{
- if ( !args->missing_allele ) return;
- ialt = -1;
- }
- else
- ialt = bcf_gt_allele(ialt);
+ if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
+ if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
+ int jalt = bcf_gt_allele(ptr[i]);
+ if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ if ( fallback_alt <= 0 ) fallback_alt = jalt;
- int jalt;
- if ( fmt->n>1 )
- {
- jalt = ptr[1];
- if ( bcf_gt_is_missing(jalt) )
+ int l = strlen(rec->d.allele[jalt]);
+ for (j=0; j<l; j++)
+ if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
+ if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+ if ( l > mlen )
{
- if ( !args->missing_allele ) return;
- ialt = -1;
+ hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+ for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+ mlen = l;
}
- else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
- else
- jalt = bcf_gt_allele(jalt);
- }
- else jalt = ialt;
-
- if ( ialt>=0 )
- {
- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+ if ( jalt>0 && l>alen )
{
- char ial = rec->d.allele[ialt][0];
- char jal = rec->d.allele[jalt][0];
- if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
- rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+ alen = l;
+ ialt = jalt;
}
+ for (j=0; j<l; j++)
+ args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
}
+ if ( alen > 0 )
+ for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+ else if ( fallback_alt >= 0 )
+ ialt = fallback_alt;
+ else if ( is_missing && !args->missing_allele ) return;
}
else
{
}
}
}
- if ( !ialt ) return; // ref allele
+ if ( !ialt )
+ {
+ // ref allele
+ if ( args->absent_allele ) freeze_ref(args,rec);
+ return;
+ }
if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
}
- else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+ else if ( args->output_iupac && rec->n_allele>1 )
{
- char ial = rec->d.allele[0][0];
- char jal = rec->d.allele[1][0];
- rec->d.allele[1][0] = gt2iupac(ial,jal);
+ int ialt, alen = 0, mlen = 0;
+ for (i=0; i<rec->n_allele; i++)
+ {
+ int l = strlen(rec->d.allele[i]);
+ for (j=0; j<l; j++)
+ if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+ if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+ if ( l > mlen )
+ {
+ hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+ for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+ mlen = l;
+ }
+ if ( i>0 && l>alen )
+ {
+ alen = l;
+ ialt = i;
+ }
+ for (j=0; j<l; j++)
+ args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+ }
+ if ( alen > 0 )
+ for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+ else
+ ialt = 1;
}
- if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
+ if ( rec->n_allele==1 && ialt!=-1 )
+ {
+ // non-missing reference
+ if ( args->absent_allele ) freeze_ref(args,rec);
+ return;
+ }
if ( ialt==-1 )
{
char alleles[4];
ialt = 1;
}
+ // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
+ int trim_beg = 0;
+ int var_type = bcf_get_variant_type(rec,ialt);
+ int var_len = rec->d.var[ialt].n;
+ if ( var_type & VCF_INDEL )
+ {
+ // normally indel starts one base after, but not if the first base of the fa reference is deleted
+ if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
+ trim_beg = 1;
+ else
+ trim_beg = 0;
+ }
+ else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
+ {
+ trim_beg = 1;
+ var_len = 1 - rec->rlen;
+ }
+ else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
+
// Overlapping variant?
if ( rec->pos <= args->fa_frz_pos )
{
// Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
// This still may not be enough for more complicated cases with multiple duplicate positions
// and other types in between. In such case let the user normalize the VCF and remove duplicates.
+
int overlap = 0;
- if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1;
- else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1;
+ if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
if ( overlap )
{
}
+ char *alt_allele = rec->d.allele[ialt];
+ int rmme_alt = 0;
+
int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
if ( idx<0 )
if ( rec->rlen > args->fa_buf.l - idx )
{
rec->rlen = args->fa_buf.l - idx;
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
if ( alen > rec->rlen )
{
- rec->d.allele[ialt][rec->rlen] = 0;
+ alt_allele[rec->rlen] = 0;
fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
}
}
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- if ( rec->d.allele[ialt][0]=='<' )
+ if ( alt_allele[0]=='<' )
{
- if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
- error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1
- len_diff = 1-rec->rlen;
- rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event
- alen = strlen(rec->d.allele[ialt]);
+ // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
+
+ if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
+ error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
+ "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
+ alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ if ( !strcasecmp(alt_allele,"<DEL>") )
+ {
+ static int multibase_ref_del_warned = 0;
+ if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+ {
+ fprintf(stderr,
+ "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
+ " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ multibase_ref_del_warned = 1;
+ }
+ if ( args->mark_del ) // insert dashes instead of delete sequence
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
+ else
+ {
+ len_diff = 1-rec->rlen;
+ alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event
+ alen = 1;
+ }
+ }
+ else
+ {
+ // <*> or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
+ freeze_ref(args,rec);
+ return;
+ }
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
}
error(
"The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
- " .vcf: [%s] <- (REF)\n"
- " .vcf: [%s] <- (ALT)\n"
- " .fa: [%s]%c%s\n",
- bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+ " REF .vcf: [%s]\n"
+ " ALT .vcf: [%s]\n"
+ " REF .fa : [%s]%c%s\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
+
+ if ( args->mark_del && len_diff<0 )
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
}
else
{
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
+
+ if ( args->mark_del && len_diff<0 )
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
}
- if ( args->fa_case )
- for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+ args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+ if ( args->fa_case==TO_UPPER )
+ for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
else
- for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+ for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
+
+ if ( args->mark_ins && len_diff>0 )
+ mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+ if ( args->mark_snv )
+ mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
if ( len_diff <= 0 )
{
// deletion or same size event
- for (i=0; i<alen; i++)
- args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ assert( args->fa_buf.l >= idx+rec->rlen );
+ args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
+ args->prev_base_pos = rec->pos + rec->rlen - 1;
+ args->prev_is_insert = 0;
+ args->fa_frz_mod = idx + alen;
+
+ for (i=trim_beg; i<alen; i++)
+ args->fa_buf.s[idx+i] = alt_allele[i];
if ( len_diff )
memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
-
- args->prev_base = rec->d.allele[0][rec->rlen - 1];
- args->prev_base_pos = rec->pos + rec->rlen - 1;
- args->prev_is_insert = 0;
}
else
{
// 1 C T
// 1 C CAA
int ibeg = 0;
- while ( ibeg<alen && rec->d.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
+ while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
for (i=ibeg; i<alen; i++)
- args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ args->fa_buf.s[idx+i] = alt_allele[i];
+
+ args->fa_frz_mod = idx + alen - ibeg + 1;
}
if (args->chain && len_diff != 0)
{
// If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
- if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+ if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
{
// ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
args->fa_mod_off += len_diff;
args->fa_frz_pos = rec->pos + rec->rlen - 1;
args->napplied++;
+ if ( rmme_alt ) free(alt_allele);
}
{
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
+ int i;
- if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
-
- int idx_start, idx_end, i;
- while ( regitr_overlap(args->itr) )
+ for (i=0; i<args->nmask; i++)
{
- idx_start = args->itr->beg - start;
- idx_end = args->itr->end - start;
- if ( idx_start < 0 ) idx_start = 0;
- if ( idx_end >= len ) idx_end = len - 1;
- for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+ mask_t *mask = &args->mask[i];
+ if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
+
+ int idx_start, idx_end, j;
+ while ( regitr_overlap(mask->itr) )
+ {
+ idx_start = mask->itr->beg - start;
+ idx_end = mask->itr->end - start;
+ if ( idx_start < 0 ) idx_start = 0;
+ if ( idx_end >= len ) idx_end = len - 1;
+ if ( mask->with==MASK_UC )
+ for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
+ else if ( mask->with==MASK_LC )
+ for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
+ else
+ for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
+ }
}
}
print_chain(args);
destroy_chain(args);
}
- // apply all cached variants
- while ( args->vcf_rbuf.n )
+ // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
{
- bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+ bcf1_t *rec = *rec_ptr;
if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
- int i = rbuf_shift(&args->vcf_rbuf);
- apply_variant(args, args->vcf_buf[i]);
+ apply_variant(args, rec);
+ }
+ if ( args->absent_allele )
+ {
+ int pos = 0;
+ if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
+ pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
+ apply_absent(args, pos);
}
flush_fa_buffer(args, 0);
init_region(args, str.s+1);
}
apply_variant(args, rec);
}
- if ( !rec_ptr ) flush_fa_buffer(args, 60);
+ if ( !rec_ptr )
+ {
+ if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
+ flush_fa_buffer(args, 60);
+ }
}
bcf1_t **rec_ptr = NULL;
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
print_chain(args);
destroy_chain(args);
}
+ if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
flush_fa_buffer(args, 0);
bgzf_close(fasta);
free(str.s);
fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
- fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
+ fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
- fprintf(stderr, " the codes are case-insensitive:\n");
- fprintf(stderr, " 1: first allele from GT, regardless of phasing\n");
- fprintf(stderr, " 2: second allele from GT, regardless of phasing\n");
- fprintf(stderr, " R: REF allele in het genotypes\n");
- fprintf(stderr, " A: ALT allele\n");
- fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
- fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
- fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
- fprintf(stderr, " -m, --mask <file> replace regions with N\n");
- fprintf(stderr, " -M, --missing <char> output <char> instead of skipping the missing genotypes\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -p, --prefix <string> prefix to add to output sequence names\n");
- fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
+ fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n");
+ fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n");
+ fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n");
+ fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(stderr, " the codes are case-insensitive:\n");
+ fprintf(stderr, " 1: first allele from GT, regardless of phasing\n");
+ fprintf(stderr, " 2: second allele from GT, regardless of phasing\n");
+ fprintf(stderr, " R: REF allele in het genotypes\n");
+ fprintf(stderr, " A: ALT allele\n");
+ fprintf(stderr, " I: IUPAC code for all genotypes\n");
+ fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
+ fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
+ fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+ fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n");
+ fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n");
+ fprintf(stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+ fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n");
+ fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n");
+ fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(stderr, " # in the form \">chr:from-to\".\n");
static struct option loptions[] =
{
+ {"mark-del",required_argument,NULL,1},
+ {"mark-ins",required_argument,NULL,2},
+ {"mark-snv",required_argument,NULL,3},
+ {"mask-with",1,0,4},
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"missing",1,0,'M'},
+ {"absent",1,0,'a'},
{"chain",1,0,'c'},
{"prefix",required_argument,0,'p'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 1 : args->mark_del = optarg[0]; break;
+ case 2 :
+ if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+ else error("The argument is not recognised: --mark-ins %s\n",optarg);
+ break;
+ case 3 :
+ if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+ else error("The argument is not recognised: --mark-snv %s\n",optarg);
+ break;
case 'p': args->chr_prefix = optarg; break;
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
- case 'm': args->mask_fname = optarg; break;
+ case 'm': add_mask(args,optarg); break;
+ case 4 : add_mask_with(args,optarg); break;
+ case 'a':
+ args->absent_allele = optarg[0];
+ if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
+ break;
case 'M':
args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+ else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
else
/* The MIT License
- Copyright (c) 2014-2017 Genome Research Ltd.
+ Copyright (c) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#define PICK_SHORT 8
#define PICK_IUPAC 16
+#define TO_UPPER 0
+#define TO_LOWER 1
+
typedef struct
{
int num; // number of ungapped blocks in this chain
}
chain_t;
+#define MASK_LC 1   // mask_t.with value: change the masked regions to lowercase
+#define MASK_UC 2   // mask_t.with value: change the masked regions to uppercase
+#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)   // 1 when the mask replaces bases with a character, 0 for the case-changing masks
+typedef struct   // one -m/--mask file together with its --mask-with action
+{
+ char *fname, with;   // mask file name; action: MASK_LC, MASK_UC, or the replacement character ('N' unless --mask-with is given)
+ regidx_t *idx;   // region index initialized from fname
+ regitr_t *itr;   // iterator initialized on idx
+}
+mask_t;
typedef struct
{
int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+ int fa_frz_mod; // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
int fa_end_pos; // region's end position in the original sequence
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
- int fa_case; // output upper case or lower case?
+ int fa_case; // output upper case or lower case: TO_UPPER|TO_LOWER
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
int nvcf_buf, rid;
char *chr, *chr_prefix;
- regidx_t *mask;
- regitr_t *itr;
+ mask_t *mask;
+ int nmask;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, allele, isample, napplied;
- char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
+ uint8_t *iupac_bitmask;
+ int miupac_bitmask;
+ char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+ char mark_del, mark_ins, mark_snv;
}
args_t;
// fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
- if (ref_start <= chain->ref_last_block_ori) {
+ if (num && ref_start <= chain->ref_last_block_ori) {
// In case this variant is back-to-back with the previous one
chain->ref_last_block_ori = ref_start + ref_len;
chain->alt_last_block_ori = alt_start + alt_len;
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
}
- if ( args->mask_fname )
+ int i;
+ for (i=0; i<args->nmask; i++)
{
- args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
- if ( !args->mask ) error("Failed to initialize mask regions\n");
- args->itr = regitr_init(args->mask);
+ mask_t *mask = &args->mask[i];
+ mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
+ if ( !mask->idx ) error("Failed to initialize mask regions\n");
+ mask->itr = regitr_init(mask->idx);
}
// In case we want to store the chains
if ( args->chain_fname )
if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
+ args->rid = -1;
+}
+// Register one more --mask file. The replacement character defaults to 'N' and can be
+// changed by a following --mask-with option. The region index and iterator are not
+// created here, they are set to NULL until the mask file is loaded.
+static void add_mask(args_t *args, char *fname)
+{
+    // grow through a temporary so that a failed realloc is reported instead of dereferenced
+    mask_t *tmp = (mask_t*)realloc(args->mask,(args->nmask+1)*sizeof(*args->mask));
+    if ( !tmp ) error("Could not allocate memory for the mask file %s\n", fname);
+    args->mask = tmp;
+    mask_t *mask = &args->mask[args->nmask++];
+    mask->fname = fname;
+    mask->with = 'N';
+    mask->idx = NULL;
+    mask->itr = NULL;
+}
+// Set how the most recently added --mask is applied: "uc" or "lc" (case-insensitive)
+// switch the case of the masked sequence, any other single character overwrites it.
+static void add_mask_with(args_t *args, char *with)
+{
+    if ( args->nmask==0 ) error("The --mask-with option must follow --mask\n");
+    mask_t *last = args->mask + (args->nmask - 1);
+    if ( strcasecmp(with,"uc")==0 )
+        last->with = MASK_UC;
+    else if ( strcasecmp(with,"lc")==0 )
+        last->with = MASK_LC;
+    else if ( strlen(with)==1 )
+        last->with = with[0];
+    else
+        error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");
 }
-
static void destroy_data(args_t *args)
{
+ free(args->iupac_bitmask);
if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
- if ( args->mask ) regidx_destroy(args->mask);
- if ( args->itr ) regitr_destroy(args->itr);
+ for (i=0; i<args->nmask; i++)
+ {
+ mask_t *mask = &args->mask[i];
+ regidx_destroy(mask->idx);
+ regitr_destroy(mask->itr);
+ }
+ free(args->mask);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
args->fa_src_pos = from;
args->fa_mod_off = 0;
args->fa_frz_pos = -1;
+ args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;
bcf_sr_seek(args->files,line,args->fa_ori_pos);
static void flush_fa_buffer(args_t *args, int len)
{
if ( !args->fa_buf.l ) return;
-
int nwr = 0;
while ( nwr + 60 <= args->fa_buf.l )
{
if ( nwr )
args->fa_ori_pos += nwr;
+ args->fa_frz_mod -= nwr;
+
if ( len )
{
// not finished on this chr yet and the buffer cannot be emptied completely
args->fa_mod_off = 0;
args->fa_buf.l = 0;
}
+// Overwrite the part of the fasta buffer that follows the frozen offset with the --absent
+// character, stopping before the buffer offset that corresponds to the position `pos`.
+static void apply_absent(args_t *args, hts_pos_t pos)
+{
+    // nothing to do when the buffer is empty, when pos does not lie beyond the base that
+    // follows the frozen position, or when pos is not past the start of the buffer
+    if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return;
+
+    // exclusive end offset: pos translated into the buffer via fa_ori_pos and fa_mod_off if it
+    // is non-zero and falls inside the buffer, otherwise the whole buffer length
+    int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l;
+    // start at the frozen offset, or at the beginning of the buffer when fa_frz_mod is negative
+    int ib = args->fa_frz_mod < 0 ? 0 : args->fa_frz_mod;
+    int i;
+    for (i=ib; i<ie; i++)
+        args->fa_buf.s[i] = args->absent_allele;
+}
+// Protect the reference span of `rec`: move fa_frz_pos to the last base of the record and
+// fa_frz_mod to the fa_buf offset just past it. The frozen position never moves backwards.
+static void freeze_ref(args_t *args, bcf1_t *rec)
+{
+    hts_pos_t last = rec->pos + rec->rlen - 1;
+    if ( last <= args->fa_frz_pos ) return;
+    args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;
+    args->fa_frz_pos = last;
+}
+// Build a newly allocated string of exactly rlen characters for a deletion that is marked
+// with `mark` rather than removed. The leading characters are copied from alt, or from ref
+// when alt is NULL (symbolic <DEL>); the remainder is filled with `mark`.
+// The caller is responsible for freeing the returned string.
+static char *mark_del(char *ref, int rlen, char *alt, int mark)
+{
+    char *out = malloc(rlen+1);
+    if ( !out ) error("Could not allocate %d bytes\n", rlen+1);
+    char *src = alt ? alt : ref;
+    int i, nsrc = strlen(src);
+    // never copy more than rlen characters, rlen can be shorter than the allele when
+    // the caller trimmed the record to the end of the fasta buffer
+    if ( nsrc > rlen ) nsrc = rlen;
+    for (i=0; i<nsrc; i++) out[i] = src[i];
+    for (; i<rlen; i++) out[i] = mark;
+    out[rlen] = 0;
+    return out;
+}
+// Highlight the inserted part of alt, i.e. the characters beyond the length of ref, by
+// changing their case in place: lowercase when mark is 'l', uppercase otherwise.
+static void mark_ins(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    for (i=nref; i<nalt; i++)
+    {
+        // go through unsigned char, passing a negative char to tolower/toupper is undefined
+        unsigned char c = (unsigned char) alt[i];
+        alt[i] = mark=='l' ? tolower(c) : toupper(c);
+    }
+}
+// Highlight substitutions: over the common length of ref and alt, every alt character that
+// differs from ref (ignoring case) is changed in place to lowercase when mark is 'l' and to
+// uppercase otherwise. Matching characters are left as they are.
+static void mark_snv(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    int n = nref < nalt ? nref : nalt;
+    for (i=0; i<n; i++)
+    {
+        // go through unsigned char, passing a negative char to tolower/toupper is undefined
+        unsigned char r = (unsigned char) ref[i];
+        unsigned char a = (unsigned char) alt[i];
+        if ( tolower(r)==tolower(a) ) continue;
+        alt[i] = mark=='l' ? tolower(a) : toupper(a);
+    }
+}
static void apply_variant(args_t *args, bcf1_t *rec)
{
static int warned_haplotype = 0;
- if ( rec->n_allele==1 && !args->missing_allele ) return;
+ if ( args->absent_allele ) apply_absent(args, rec->pos);
+ if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
+ int i,j;
if ( args->mask )
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = rec->pos;
int end = rec->pos + rec->rlen - 1;
- if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+ for (i=0; i<args->nmask; i++)
+ {
+ mask_t *mask = &args->mask[i];
+ if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
+ }
}
- int i, ialt = 1; // the alternate allele
+ int ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
enum { use_hap, use_iupac, pick_one } action = use_hap;
if ( args->allele==PICK_IUPAC )
{
+ if ( !args->haplotype ) action = use_iupac;
if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
}
else if ( args->output_iupac ) action = use_iupac;
}
else if ( action==use_iupac )
{
- ialt = ptr[0];
- if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
+ ialt = -1;
+ int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
+ for (i=0; i<fmt->n; i++)
{
- if ( !args->missing_allele ) return;
- ialt = -1;
- }
- else
- ialt = bcf_gt_allele(ialt);
+ if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
+ if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
+ int jalt = bcf_gt_allele(ptr[i]);
+ if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ if ( fallback_alt <= 0 ) fallback_alt = jalt;
- int jalt;
- if ( fmt->n>1 )
- {
- jalt = ptr[1];
- if ( bcf_gt_is_missing(jalt) )
+ int l = strlen(rec->d.allele[jalt]);
+ for (j=0; j<l; j++)
+ if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
+ if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+ if ( l > mlen )
{
- if ( !args->missing_allele ) return;
- ialt = -1;
+ hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+ for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+ mlen = l;
}
- else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
- else
- jalt = bcf_gt_allele(jalt);
- }
- else jalt = ialt;
-
- if ( ialt>=0 )
- {
- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+ if ( jalt>0 && l>alen )
{
- char ial = rec->d.allele[ialt][0];
- char jal = rec->d.allele[jalt][0];
- if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
- rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+ alen = l;
+ ialt = jalt;
}
+ for (j=0; j<l; j++)
+ args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
}
+ if ( alen > 0 )
+ for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+ else if ( fallback_alt >= 0 )
+ ialt = fallback_alt;
+ else if ( is_missing && !args->missing_allele ) return;
}
else
{
}
}
}
- if ( !ialt ) return; // ref allele
+ if ( !ialt )
+ {
+ // ref allele
+ if ( args->absent_allele ) freeze_ref(args,rec);
+ return;
+ }
if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
}
- else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+ else if ( args->output_iupac && rec->n_allele>1 )
{
- char ial = rec->d.allele[0][0];
- char jal = rec->d.allele[1][0];
- rec->d.allele[1][0] = gt2iupac(ial,jal);
+ int ialt, alen = 0, mlen = 0;
+ for (i=0; i<rec->n_allele; i++)
+ {
+ int l = strlen(rec->d.allele[i]);
+ for (j=0; j<l; j++)
+ if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+ if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+ if ( l > mlen )
+ {
+ hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+ for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+ mlen = l;
+ }
+ if ( i>0 && l>alen )
+ {
+ alen = l;
+ ialt = i;
+ }
+ for (j=0; j<l; j++)
+ args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+ }
+ if ( alen > 0 )
+ for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+ else
+ ialt = 1;
}
- if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
+ if ( rec->n_allele==1 && ialt!=-1 )
+ {
+ // non-missing reference
+ if ( args->absent_allele ) freeze_ref(args,rec);
+ return;
+ }
if ( ialt==-1 )
{
char alleles[4];
ialt = 1;
}
+ // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
+ int trim_beg = 0;
+ int var_type = bcf_get_variant_type(rec,ialt);
+ int var_len = rec->d.var[ialt].n;
+ if ( var_type & VCF_INDEL )
+ {
+ // normally indel starts one base after, but not if the first base of the fa reference is deleted
+ if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
+ trim_beg = 1;
+ else
+ trim_beg = 0;
+ }
+ else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
+ {
+ trim_beg = 1;
+ var_len = 1 - rec->rlen;
+ }
+ else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
+
// Overlapping variant?
if ( rec->pos <= args->fa_frz_pos )
{
// Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
// This still may not be enough for more complicated cases with multiple duplicate positions
// and other types in between. In such case let the user normalize the VCF and remove duplicates.
+
int overlap = 0;
- if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1;
- else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1;
+ if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
if ( overlap )
{
}
+ char *alt_allele = rec->d.allele[ialt];
+ int rmme_alt = 0;
+
int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
if ( idx<0 )
if ( rec->rlen > args->fa_buf.l - idx )
{
rec->rlen = args->fa_buf.l - idx;
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
if ( alen > rec->rlen )
{
- rec->d.allele[ialt][rec->rlen] = 0;
+ alt_allele[rec->rlen] = 0;
fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
}
}
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- if ( rec->d.allele[ialt][0]=='<' )
+ if ( alt_allele[0]=='<' )
{
- if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
- error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1
- len_diff = 1-rec->rlen;
- rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event
- alen = strlen(rec->d.allele[ialt]);
+ // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
+
+ if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
+ error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
+ "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
+ alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ if ( !strcasecmp(alt_allele,"<DEL>") )
+ {
+ static int multibase_ref_del_warned = 0;
+ if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+ {
+ fprintf(bcftools_stderr,
+ "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
+ " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ multibase_ref_del_warned = 1;
+ }
+ if ( args->mark_del ) // insert dashes instead of delete sequence
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
+ else
+ {
+ len_diff = 1-rec->rlen;
+ alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event
+ alen = 1;
+ }
+ }
+ else
+ {
+ // <*> or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
+ freeze_ref(args,rec);
+ return;
+ }
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
}
error(
"The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
- " .vcf: [%s] <- (REF)\n"
- " .vcf: [%s] <- (ALT)\n"
- " .fa: [%s]%c%s\n",
- bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+ " REF .vcf: [%s]\n"
+ " ALT .vcf: [%s]\n"
+ " REF .fa : [%s]%c%s\n",
+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
+
+ if ( args->mark_del && len_diff<0 )
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
}
else
{
- alen = strlen(rec->d.allele[ialt]);
+ alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
+
+ if ( args->mark_del && len_diff<0 )
+ {
+ alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+ alen = rec->rlen;
+ len_diff = 0;
+ rmme_alt = 1;
+ }
}
- if ( args->fa_case )
- for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+ args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+ if ( args->fa_case==TO_UPPER )
+ for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
else
- for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+ for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
+
+ if ( args->mark_ins && len_diff>0 )
+ mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+ if ( args->mark_snv )
+ mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
if ( len_diff <= 0 )
{
// deletion or same size event
- for (i=0; i<alen; i++)
- args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ assert( args->fa_buf.l >= idx+rec->rlen );
+ args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
+ args->prev_base_pos = rec->pos + rec->rlen - 1;
+ args->prev_is_insert = 0;
+ args->fa_frz_mod = idx + alen;
+
+ for (i=trim_beg; i<alen; i++)
+ args->fa_buf.s[idx+i] = alt_allele[i];
if ( len_diff )
memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
-
- args->prev_base = rec->d.allele[0][rec->rlen - 1];
- args->prev_base_pos = rec->pos + rec->rlen - 1;
- args->prev_is_insert = 0;
}
else
{
// 1 C T
// 1 C CAA
int ibeg = 0;
- while ( ibeg<alen && rec->d.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
+ while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
for (i=ibeg; i<alen; i++)
- args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+ args->fa_buf.s[idx+i] = alt_allele[i];
+
+ args->fa_frz_mod = idx + alen - ibeg + 1;
}
if (args->chain && len_diff != 0)
{
// If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
- if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+ if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
{
// ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
args->fa_mod_off += len_diff;
args->fa_frz_pos = rec->pos + rec->rlen - 1;
args->napplied++;
+ if ( rmme_alt ) free(alt_allele);
}
{
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
+ int i;
- if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
-
- int idx_start, idx_end, i;
- while ( regitr_overlap(args->itr) )
+ for (i=0; i<args->nmask; i++)
{
- idx_start = args->itr->beg - start;
- idx_end = args->itr->end - start;
- if ( idx_start < 0 ) idx_start = 0;
- if ( idx_end >= len ) idx_end = len - 1;
- for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+ mask_t *mask = &args->mask[i];
+ if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
+
+ int idx_start, idx_end, j;
+ while ( regitr_overlap(mask->itr) )
+ {
+ idx_start = mask->itr->beg - start;
+ idx_end = mask->itr->end - start;
+ if ( idx_start < 0 ) idx_start = 0;
+ if ( idx_end >= len ) idx_end = len - 1;
+ if ( mask->with==MASK_UC )
+ for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
+ else if ( mask->with==MASK_LC )
+ for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
+ else
+ for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
+ }
}
}
print_chain(args);
destroy_chain(args);
}
- // apply all cached variants
- while ( args->vcf_rbuf.n )
+ // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
{
- bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+ bcf1_t *rec = *rec_ptr;
if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
- int i = rbuf_shift(&args->vcf_rbuf);
- apply_variant(args, args->vcf_buf[i]);
+ apply_variant(args, rec);
+ }
+ if ( args->absent_allele )
+ {
+ int pos = 0;
+ if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
+ pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
+ apply_absent(args, pos);
}
flush_fa_buffer(args, 0);
init_region(args, str.s+1);
}
apply_variant(args, rec);
}
- if ( !rec_ptr ) flush_fa_buffer(args, 60);
+ if ( !rec_ptr )
+ {
+ if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
+ flush_fa_buffer(args, 60);
+ }
}
bcf1_t **rec_ptr = NULL;
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
print_chain(args);
destroy_chain(args);
}
+ if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
flush_fa_buffer(args, 0);
bgzf_close(fasta);
free(str.s);
fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
- fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
+ fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --chain <file> write a chain file for liftover\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(bcftools_stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
- fprintf(bcftools_stderr, " the codes are case-insensitive:\n");
- fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n");
- fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n");
- fprintf(bcftools_stderr, " R: REF allele in het genotypes\n");
- fprintf(bcftools_stderr, " A: ALT allele\n");
- fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
- fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
- fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
- fprintf(bcftools_stderr, " -m, --mask <file> replace regions with N\n");
- fprintf(bcftools_stderr, " -M, --missing <char> output <char> instead of skipping the missing genotypes\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -p, --prefix <string> prefix to add to output sequence names\n");
- fprintf(bcftools_stderr, " -s, --sample <name> apply variants of the given sample\n");
+ fprintf(bcftools_stderr, " -c, --chain FILE write a chain file for liftover\n");
+ fprintf(bcftools_stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n");
+ fprintf(bcftools_stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(bcftools_stderr, " the codes are case-insensitive:\n");
+ fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n");
+ fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n");
+ fprintf(bcftools_stderr, " R: REF allele in het genotypes\n");
+ fprintf(bcftools_stderr, " A: ALT allele\n");
+ fprintf(bcftools_stderr, " I: IUPAC code for all genotypes\n");
+ fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
+ fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
+ fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
+ fprintf(bcftools_stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n");
+ fprintf(bcftools_stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(bcftools_stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(bcftools_stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n");
+ fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+ fprintf(bcftools_stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n");
+ fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -p, --prefix STRING prefix to add to output sequence names\n");
+ fprintf(bcftools_stderr, " -s, --sample NAME apply variants of the given sample\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n");
fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_consensus(int argc, char *argv[])
static struct option loptions[] =
{
+ {"mark-del",required_argument,NULL,1},
+ {"mark-ins",required_argument,NULL,2},
+ {"mark-snv",required_argument,NULL,3},
+ {"mask-with",1,0,4},
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"missing",1,0,'M'},
+ {"absent",1,0,'a'},
{"chain",1,0,'c'},
{"prefix",required_argument,0,'p'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
switch (c)
{
+ case 1 : args->mark_del = optarg[0]; break;
+ case 2 :
+ if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+ else error("The argument is not recognised: --mark-ins %s\n",optarg);
+ break;
+ case 3 :
+ if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
+ else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+ else error("The argument is not recognised: --mark-snv %s\n",optarg);
+ break;
case 'p': args->chr_prefix = optarg; break;
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
- case 'm': args->mask_fname = optarg; break;
+ case 'm': add_mask(args,optarg); break;
+ case 4 : add_mask_with(args,optarg); break;
+ case 'a':
+ args->absent_allele = optarg[0];
+ if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
+ break;
case 'M':
args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+ else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
else
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include "bcftools.h"
#include "variantkey.h"
#include "convert.h"
+#include "filter.h"
#define T_CHROM 1
#define T_POS 2
#define T_RSX 30 // RSID HEX
#define T_VKX 31 // VARIANTKEY HEX
#define T_PBINOM 32
+#define T_NPASS 33
typedef struct _fmt_t
{
type_t val = x[j]; \
if ( !val ) continue; \
for (i=0; i<nbits; i+=2) \
- if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
} \
} \
if ( fmt->subscript<0 || fmt->subscript==2 ) \
type_t val = x[j]; \
if ( !val ) continue; \
for (i=1; i<nbits; i+=2) \
- if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
} \
} \
}
{
case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
case BCF_BT_INT16: BRANCH(uint16_t,16); break;
- case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,30); break; // 2 bits unused to account for the reserved BCF values
default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
}
#undef BRANCH
int j;
for (j=0; j<n; j++)
{
- if ( ptr[j]==bcf_int32_vector_end ) break;
- if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; }
+ if ( bcf_float_is_vector_end(ptr[j]) ) break;
+ if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
sum+=ptr[j];
}
ksprintf(str, "%016" PRIx64 "", vk);
}
+// Handler of the N_PASS(expr) tag: appends to str the number of samples that pass the
+// filtering expression stored in fmt->usr.
+static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+    filter_t *flt = (filter_t*) fmt->usr;
+    const uint8_t *smpl = NULL;
+    int i, nsmpl = 0;
+    int pass = filter_test(flt,line,&smpl);
+    if ( smpl )
+    {
+        for (i=0; i<convert->nsamples; i++)
+            if ( smpl[i] ) nsmpl++;
+    }
+    else if ( pass )
+    {
+        // NOTE(review): filter_test presumably leaves the per-sample array unset when the
+        // expression queries no FORMAT fields; the site-level result is then applied to all
+        // samples instead of dereferencing a NULL pointer - confirm against filter.h
+        nsmpl = convert->nsamples;
+    }
+    kputd(nsmpl, str);
+}
+// Destructor of the N_PASS tag: releases the filter kept in the tag's user data.
+static void destroy_npass(void *usr)
+{
+    filter_t *flt = (filter_t*) usr;
+    filter_destroy(flt);
+}
+
static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
int i;
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
}
- if ( fmt->type==T_PBINOM )
+ else if ( fmt->type==T_PBINOM )
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
}
+ else if ( fmt->type==T_NPASS )
+ {
+ filter_t *flt = filter_init(convert->header,key);
+ convert->max_unpack |= filter_max_unpack(flt);
+ fmt->usr = (void*) flt;
+ }
}
switch (fmt->type)
case T_RSX: fmt->handler = &process_rsid_hex; break;
case T_VKX: fmt->handler = &process_variantkey_hex; break;
case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
if ( key && fmt->type==T_INFO )
register_tag(convert, T_PBINOM, str.s, is_gtf);
q++;
}
+ else if ( !strcmp(str.s,"N_PASS") )
+ error("N_PASS() must be placed outside the square brackets\n");
else
{
fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
- else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n");
+ else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
else if ( !strcmp(str.s, "INFO") )
{
if ( *q=='/' )
}
else if ( !strcmp(str.s, "FORMAT") )
register_tag(convert, T_FORMAT, NULL, 0);
+ else if ( !strcmp(str.s,"N_PASS") )
+ {
+ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ int nopen = 1;
+ while ( *q && nopen )
+ {
+ if ( *q=='(' ) nopen++;
+ else if ( *q==')' ) nopen--;
+ q++;
+ }
+ if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p-1, &str);
+ register_tag(convert, T_NPASS, str.s, is_gtf);
+ }
else
{
fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
- if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+ int ks = convert->samples[js];
+ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// brackets here. This may be changed in future, time will show...
size_t l_start = str->l;
- int ks = convert->samples[js];
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_MASK )
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include "bcftools.h"
#include "variantkey.h"
#include "convert.h"
+#include "filter.h"
#define T_CHROM 1
#define T_POS 2
#define T_RSX 30 // RSID HEX
#define T_VKX 31 // VARIANTKEY HEX
#define T_PBINOM 32
+#define T_NPASS 33
typedef struct _fmt_t
{
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
- default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
}
}
else if ( fmt->subscript >=0 )
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
- default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
}
#undef BRANCH
}
type_t val = x[j]; \
if ( !val ) continue; \
for (i=0; i<nbits; i+=2) \
- if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
} \
} \
if ( fmt->subscript<0 || fmt->subscript==2 ) \
type_t val = x[j]; \
if ( !val ) continue; \
for (i=1; i<nbits; i+=2) \
- if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
} \
} \
}
{
case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
case BCF_BT_INT16: BRANCH(uint16_t,16); break;
- case BCF_BT_INT32: BRANCH(uint32_t,32); break;
- default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,30); break; // 2 bits unused to account for the reserved BCF values
+ default: error("Unexpected type: %d\n", fmt->fmt->type); bcftools_exit(1); break;
}
#undef BRANCH
int j;
for (j=0; j<n; j++)
{
- if ( ptr[j]==bcf_int32_vector_end ) break;
- if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; }
+ if ( bcf_float_is_vector_end(ptr[j]) ) break;
+ if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
sum+=ptr[j];
}
ksprintf(str, "%016" PRIx64 "", vk);
}
+/*
+ *  Handler for N_PASS(expr): evaluates the filtering expression stored in
+ *  fmt->usr on the record and appends to str the number of samples which
+ *  the filter flagged as passing. The isample argument is not used.
+ *
+ *  NOTE(review): the per-sample array filled in by filter_test() is read
+ *  without a NULL check and is indexed 0..convert->nsamples-1. Presumably the
+ *  filter always provides it for sample-level expressions and the indexes
+ *  agree with the filter's sample order -- confirm against filter.h.
+ */
+static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+    const uint8_t *pass_smpl;
+    int ismpl, npass = 0;
+
+    filter_test((filter_t*) fmt->usr, line, &pass_smpl);
+
+    for (ismpl=0; ismpl<convert->nsamples; ismpl++)
+    {
+        if ( !pass_smpl[ismpl] ) continue;
+        npass++;
+    }
+    kputd(npass, str);
+}
+/*
+ *  Destructor paired with process_npass: usr is the filter_t* created for the
+ *  N_PASS() expression and stored in fmt->usr.
+ */
+static void destroy_npass(void *usr)
+{
+    filter_t *flt = (filter_t*) usr;
+    filter_destroy(flt);
+}
+
static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
int i;
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
}
- if ( fmt->type==T_PBINOM )
+ else if ( fmt->type==T_PBINOM )
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
}
+ else if ( fmt->type==T_NPASS )
+ {
+ filter_t *flt = filter_init(convert->header,key);
+ convert->max_unpack |= filter_max_unpack(flt);
+ fmt->usr = (void*) flt;
+ }
}
switch (fmt->type)
case T_RSX: fmt->handler = &process_rsid_hex; break;
case T_VKX: fmt->handler = &process_variantkey_hex; break;
case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
if ( key && fmt->type==T_INFO )
register_tag(convert, T_PBINOM, str.s, is_gtf);
q++;
}
+ else if ( !strcmp(str.s,"N_PASS") )
+ error("N_PASS() must be placed outside the square brackets\n");
else
{
fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
- else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n");
+ else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
else if ( !strcmp(str.s, "INFO") )
{
if ( *q=='/' )
}
else if ( !strcmp(str.s, "FORMAT") )
register_tag(convert, T_FORMAT, NULL, 0);
+ else if ( !strcmp(str.s,"N_PASS") )
+ {
+ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
+ p = ++q;
+ str.l = 0;
+ int nopen = 1;
+ while ( *q && nopen )
+ {
+ if ( *q=='(' ) nopen++;
+ else if ( *q==')' ) nopen--;
+ q++;
+ }
+ if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
+ kputsn(p, q-p-1, &str);
+ register_tag(convert, T_NPASS, str.s, is_gtf);
+ }
else
{
fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
- if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+ int ks = convert->samples[js];
+ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// brackets here. This may be changed in future, time will show...
size_t l_start = str->l;
- int ks = convert->samples[js];
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_MASK )
-//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
-
-
/* The MIT License
- Copyright (c) 2016-2018 Genome Research Ltd.
+ Copyright (c) 2016-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <stdlib.h>
+#include <assert.h>
#include <getopt.h>
#include <math.h>
#include <inttypes.h>
char *bcsq_tag;
int argc, output_type;
int phase, verbosity, local_csq, record_cmd_line;
- int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
- int ncsq_small_warned;
+ int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
+ int ncsq2_small_warned;
int brief_predictions;
int rid; // current chromosome
int iseq;
if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
{
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = strdup(chr_beg);
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
+        // check for possible mismatch in chromosome naming convention such as chrX vs X
+        char *new_chr = NULL;
+        if ( faidx_has_seq(args->fai,chr_beg) )
+            new_chr = strdup(chr_beg);      // valid chr name, the same in gff and faidx
+        else
+        {
+            int len = strlen(chr_beg);
+            if ( !strncmp("chr",chr_beg,3) && len>3 )
+                new_chr = strdup(chr_beg+3);    // gff has the prefix, faidx does not
+            else
+            {
+                // gff does not have the prefix, faidx has. The buffer must hold
+                // 3 bytes for "chr", len bytes for the name and 1 byte for the
+                // terminating NUL written at index len+3.
+                new_chr = malloc(len+4);
+                memcpy(new_chr,"chr",3);
+                memcpy(new_chr+3,chr_beg,len);
+                new_chr[len+3] = 0;
+            }
+            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
+            {
+                // the warning is printed at most once per run
+                static int unkwn_chr_warned = 0;
+                if ( !unkwn_chr_warned && args->verbosity>0 )
+                    fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
+                unkwn_chr_warned = 1;
+                free(new_chr);
+                new_chr = strdup(chr_beg);  // use the original sequence name
+            }
+        }
+        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
+        {
+            // name not registered yet: aux->seq keeps the new_chr pointer, so it must not be freed here
+            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+            aux->seq[aux->nseq] = new_chr;
+            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+            aux->nseq++;
+            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        }
+        else
+            free(new_chr);  // name already registered, the temporary copy is not needed
}
chr_end[1] = c;
return iseq;
tscript_ok = 0;
break;
}
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
tscript_ok = 0;
break;
}
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
gf_cds_t *a = tr->cds[i-1];
gf_cds_t *b = tr->cds[i];
if ( a->beg + a->len - 1 >= b->beg )
- error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
- kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ {
+ if ( args->force )
+ {
+ fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
+ args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ else
+ error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
+ " Use the --force option to override (at your own risk).\n",
+ args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
}
if ( len%3 != 0 )
{
khash_str2int_destroy_free(aux->ignored_biotypes);
}
+/*
+ *  Number of integer values needed to hold ncsq2 consequence bits when each
+ *  value carries 30 of them; equals ceil(ncsq2/30) for ncsq2>=1.
+ */
+static inline int ncsq2_to_nfmt(int ncsq2)
+{
+    const int bits_per_value = 30;
+    int nfull = (ncsq2 - 1) / bits_per_value;
+    return nfull + 1;
+}
+/*
+ *  Translate the consequence bit index icsq2 into the index of the integer
+ *  value (*ival) and the bit position within that value (*ibit), with 30
+ *  bits used per value.
+ */
+static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
+{
+    const int bits_per_value = 30;
+    int quot = icsq2 / bits_per_value;
+    int rem  = icsq2 - quot*bits_per_value;
+    *ival = quot;
+    *ibit = rem;
+}
+
void init_data(args_t *args)
{
- args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+ args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
init_gff(args);
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- args->fai = fai_load(args->fa_fname);
- if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
-
args->pos2vbuf = kh_init(pos2vbuf);
args->active_tr = khp_init(trhp);
args->hap = (hap_t*) calloc(1,sizeof(hap_t));
}
else
{
- args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
void destroy_data(args_t *args)
{
+ if ( args->ncsq2_small_warned )
+ fprintf(stderr,
+ "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
+ " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
+
regidx_destroy(args->idx_cds);
regidx_destroy(args->idx_utr);
regidx_destroy(args->idx_exon);
void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
{
- if ( !args->brief_predictions )
+ if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
kputs(aa->s, str);
else
{
- int len = aa->l;
+ int i, len = aa->l;
if ( aa->s[len-1]=='*' ) len--;
- kputc(aa->s[0], str);
+ for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
kputs("..", str);
kputw(beg+len, str);
}
{
csq_t *csq = node->csq_list + i;
vrec_t *vrec = csq->vrec;
- int icsq = 2*csq->idx + ihap;
- if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ int icsq2 = 2*csq->idx + ihap;
+ if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
{
- if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+ if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
{
fprintf(stderr,
"Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
- if ( !args->ncsq_small_warned )
+ if ( !args->ncsq2_small_warned )
fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
- args->ncsq_small_warned = 1;
}
+ if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
break;
}
- if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
- vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ int ival, ibit;
+ icsq2_to_bit(icsq2, &ival,&ibit);
+ if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+ vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
{
if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
- int icsq = 2*csq->idx + j;
- if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ int icsq2 = 2*csq->idx + j;
+ if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
{
int ismpl = args->smpl->idx[i];
- if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+ if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
{
fprintf(stderr,
"Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
- args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1);
- if ( !args->ncsq_small_warned )
+ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
+ if ( !args->ncsq2_small_warned )
fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
- args->ncsq_small_warned = 1;
+ args->ncsq2_small_warned = 1;
}
+ if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
+ break;
}
- if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
- vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ int ival, ibit;
+ icsq2_to_bit(icsq2, &ival,&ibit);
+ if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+ vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
}
return
"\n"
"About: Haplotype-aware consequence caller.\n"
- "Usage: bcftools csq [options] in.vcf\n"
+ "Usage: bcftools csq [OPTIONS] in.vcf\n"
"\n"
"Required options:\n"
- " -f, --fasta-ref <file> reference file in fasta format\n"
- " -g, --gff-annot <file> gff3 annotation file\n"
+ " -f, --fasta-ref FILE reference file in fasta format\n"
+ " -g, --gff-annot FILE gff3 annotation file\n"
"\n"
"CSQ options:\n"
- " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n"
- " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -c, --custom-tag STRING use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
- " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
- " -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
+ " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n"
+ " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased hets\n"
"Options:\n"
- " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -e, --exclude EXPR exclude sites for which the expression is true\n"
" --force run even if some sanity checks fail\n"
- " -i, --include <expr> select sites for which the expression is true\n"
+ " -i, --include EXPR select sites for which the expression is true\n"
" --no-version do not append version and command line to the header\n"
- " -o, --output <file> write output to a file [standard output]\n"
- " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " -o, --output FILE write output to a file [standard output]\n"
+ " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
" v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
- " -r, --regions <region> restrict to comma-separated list of regions\n"
- " -R, --regions-file <file> restrict to regions listed in a file\n"
- " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
- " -S, --samples-file <file> samples to include\n"
- " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
- " --threads <int> use multithreading with <int> worker threads [0]\n"
- " -v, --verbose <int> verbosity level 0-2 [1]\n"
+ " -r, --regions REGION restrict to comma-separated list of regions\n"
+ " -R, --regions-file FILE restrict to regions listed in a file\n"
+ " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file FILE samples to include\n"
+ " -t, --targets REGION similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+ " --threads INT use multithreading with <int> worker threads [0]\n"
+ " -v, --verbose INT verbosity level 0-2 [1]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->argc = argc; args->argv = argv;
args->output_type = FT_VCF;
args->bcsq_tag = "BCSQ";
- args->ncsq_max = 2*16;
+ args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values
args->verbosity = 1;
args->record_cmd_line = 1;
{"threads",required_argument,NULL,2},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
- {"brief-predictions",0,0,'b'},
+ {"brief-predictions",no_argument,0,'b'},
+ {"trim-protein-seq",required_argument,0,'B'},
{"custom-tag",1,0,'c'},
{"local-csq",0,0,'l'},
{"gff-annot",1,0,'g'},
};
int c, targets_is_file = 0, regions_is_file = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
switch (c)
{
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
break;
case 3 : args->record_cmd_line = 0; break;
- case 'b': args->brief_predictions = 1; break;
+ case 'b':
+ args->brief_predictions = 1;
+ fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+ break;
+ case 'B':
+ args->brief_predictions = strtol(optarg,&tmp,10);
+ if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
+ break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
case 'f': args->fa_fname = optarg; break;
case 'g': args->gff_fname = optarg; break;
case 'n':
- args->ncsq_max = 2 * atoi(optarg);
- if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ args->ncsq2_max = 2 * atoi(optarg);
+ if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
break;
case 'o': args->output_fname = optarg; break;
case 'O':
default: error("The output type \"%s\" not recognised\n", optarg);
}
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': regions_list = optarg; break;
case 'R': regions_list = optarg; regions_is_file = 1; break;
case 's': args->sample_list = optarg; break;
#include "bcftools.pysam.h"
-//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
-
-
/* The MIT License
- Copyright (c) 2016-2018 Genome Research Ltd.
+ Copyright (c) 2016-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <stdlib.h>
+#include <assert.h>
#include <getopt.h>
#include <math.h>
#include <inttypes.h>
char *bcsq_tag;
int argc, output_type;
int phase, verbosity, local_csq, record_cmd_line;
- int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
- int ncsq_small_warned;
+ int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
+ int ncsq2_small_warned;
int brief_predictions;
int rid; // current chromosome
int iseq;
if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
{
- hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
- aux->seq[aux->nseq] = strdup(chr_beg);
- iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
- aux->nseq++;
- assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
+        // check for possible mismatch in chromosome naming convention such as chrX vs X
+        char *new_chr = NULL;
+        if ( faidx_has_seq(args->fai,chr_beg) )
+            new_chr = strdup(chr_beg);      // valid chr name, the same in gff and faidx
+        else
+        {
+            int len = strlen(chr_beg);
+            if ( !strncmp("chr",chr_beg,3) && len>3 )
+                new_chr = strdup(chr_beg+3);    // gff has the prefix, faidx does not
+            else
+            {
+                // gff does not have the prefix, faidx has. The buffer must hold
+                // 3 bytes for "chr", len bytes for the name and 1 byte for the
+                // terminating NUL written at index len+3.
+                new_chr = malloc(len+4);
+                memcpy(new_chr,"chr",3);
+                memcpy(new_chr+3,chr_beg,len);
+                new_chr[len+3] = 0;
+            }
+            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
+            {
+                // the warning is printed at most once per run
+                static int unkwn_chr_warned = 0;
+                if ( !unkwn_chr_warned && args->verbosity>0 )
+                    fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
+                unkwn_chr_warned = 1;
+                free(new_chr);
+                new_chr = strdup(chr_beg);  // use the original sequence name
+            }
+        }
+        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
+        {
+            // name not registered yet: aux->seq keeps the new_chr pointer, so it must not be freed here
+            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+            aux->seq[aux->nseq] = new_chr;
+            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+            aux->nseq++;
+            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        }
+        else
+            free(new_chr);  // name already registered, the temporary copy is not needed
}
chr_end[1] = c;
return iseq;
tscript_ok = 0;
break;
}
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
tscript_ok = 0;
break;
}
- error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+ error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+ args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
gf_cds_t *a = tr->cds[i-1];
gf_cds_t *b = tr->cds[i];
if ( a->beg + a->len - 1 >= b->beg )
- error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
- kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ {
+ if ( args->force )
+ {
+ fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
+ args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ else
+ error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
+ " Use the --force option to override (at your own risk).\n",
+ args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
}
if ( len%3 != 0 )
{
khash_str2int_destroy_free(aux->ignored_biotypes);
}
+/*
+ *  Number of integer values needed to hold ncsq2 consequence bits when each
+ *  value carries 30 of them; equals ceil(ncsq2/30) for ncsq2>=1.
+ */
+static inline int ncsq2_to_nfmt(int ncsq2)
+{
+    const int bits_per_value = 30;
+    int nfull = (ncsq2 - 1) / bits_per_value;
+    return nfull + 1;
+}
+/*
+ *  Translate the consequence bit index icsq2 into the index of the integer
+ *  value (*ival) and the bit position within that value (*ibit), with 30
+ *  bits used per value.
+ */
+static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
+{
+    const int bits_per_value = 30;
+    int quot = icsq2 / bits_per_value;
+    int rem  = icsq2 - quot*bits_per_value;
+    *ival = quot;
+    *ibit = rem;
+}
+
void init_data(args_t *args)
{
- args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+ args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname);
init_gff(args);
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- args->fai = fai_load(args->fa_fname);
- if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
-
args->pos2vbuf = kh_init(pos2vbuf);
args->active_tr = khp_init(trhp);
args->hap = (hap_t*) calloc(1,sizeof(hap_t));
}
else
{
- args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
void destroy_data(args_t *args)
{
+ if ( args->ncsq2_small_warned )
+ fprintf(bcftools_stderr,
+ "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
+ " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2);
+
regidx_destroy(args->idx_cds);
regidx_destroy(args->idx_utr);
regidx_destroy(args->idx_exon);
void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
{
- if ( !args->brief_predictions )
+ if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
kputs(aa->s, str);
else
{
- int len = aa->l;
+ int i, len = aa->l;
if ( aa->s[len-1]=='*' ) len--;
- kputc(aa->s[0], str);
+ for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
kputs("..", str);
kputw(beg+len, str);
}
{
csq_t *csq = node->csq_list + i;
vrec_t *vrec = csq->vrec;
- int icsq = 2*csq->idx + ihap;
- if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ int icsq2 = 2*csq->idx + ihap;
+ if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
{
- if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+ if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
{
fprintf(bcftools_stderr,
"Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
- if ( !args->ncsq_small_warned )
+ if ( !args->ncsq2_small_warned )
fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
- args->ncsq_small_warned = 1;
}
+ if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
break;
}
- if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
- vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ int ival, ibit;
+ icsq2_to_bit(icsq2, &ival,&ibit);
+ if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+ vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
{
if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
- int icsq = 2*csq->idx + j;
- if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ int icsq2 = 2*csq->idx + j;
+ if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
{
int ismpl = args->smpl->idx[i];
- if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+ if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
{
fprintf(bcftools_stderr,
"Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
- args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1);
- if ( !args->ncsq_small_warned )
+ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
+ if ( !args->ncsq2_small_warned )
fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
- args->ncsq_small_warned = 1;
+ args->ncsq2_small_warned = 1;
}
+ if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
+ break;
}
- if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
- vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ int ival, ibit;
+ icsq2_to_bit(icsq2, &ival,&ibit);
+ if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+ vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
}
return
"\n"
"About: Haplotype-aware consequence caller.\n"
- "Usage: bcftools csq [options] in.vcf\n"
+ "Usage: bcftools csq [OPTIONS] in.vcf\n"
"\n"
"Required options:\n"
- " -f, --fasta-ref <file> reference file in fasta format\n"
- " -g, --gff-annot <file> gff3 annotation file\n"
+ " -f, --fasta-ref FILE reference file in fasta format\n"
+ " -g, --gff-annot FILE gff3 annotation file\n"
"\n"
"CSQ options:\n"
- " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n"
- " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -c, --custom-tag STRING use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
- " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
- " -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
+ " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n"
+ " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased hets\n"
"Options:\n"
- " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -e, --exclude EXPR exclude sites for which the expression is true\n"
" --force run even if some sanity checks fail\n"
- " -i, --include <expr> select sites for which the expression is true\n"
+ " -i, --include EXPR select sites for which the expression is true\n"
" --no-version do not append version and command line to the header\n"
- " -o, --output <file> write output to a file [standard output]\n"
- " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " -o, --output FILE write output to a file [standard output]\n"
+ " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
" v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
- " -r, --regions <region> restrict to comma-separated list of regions\n"
- " -R, --regions-file <file> restrict to regions listed in a file\n"
- " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
- " -S, --samples-file <file> samples to include\n"
- " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
- " --threads <int> use multithreading with <int> worker threads [0]\n"
- " -v, --verbose <int> verbosity level 0-2 [1]\n"
+ " -r, --regions REGION restrict to comma-separated list of regions\n"
+ " -R, --regions-file FILE restrict to regions listed in a file\n"
+ " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file FILE samples to include\n"
+ " -t, --targets REGION similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+ " --threads INT use multithreading with <int> worker threads [0]\n"
+ " -v, --verbose INT verbosity level 0-2 [1]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->argc = argc; args->argv = argv;
args->output_type = FT_VCF;
args->bcsq_tag = "BCSQ";
- args->ncsq_max = 2*16;
+ args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values
args->verbosity = 1;
args->record_cmd_line = 1;
{"threads",required_argument,NULL,2},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
- {"brief-predictions",0,0,'b'},
+ {"brief-predictions",no_argument,0,'b'},
+ {"trim-protein-seq",required_argument,0,'B'},
{"custom-tag",1,0,'c'},
{"local-csq",0,0,'l'},
{"gff-annot",1,0,'g'},
};
int c, targets_is_file = 0, regions_is_file = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
switch (c)
{
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
break;
case 3 : args->record_cmd_line = 0; break;
- case 'b': args->brief_predictions = 1; break;
+ case 'b':
+ args->brief_predictions = 1;
+ fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+ break;
+ case 'B':
+ args->brief_predictions = strtol(optarg,&tmp,10);
+ if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
+ break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
case 'f': args->fa_fname = optarg; break;
case 'g': args->gff_fname = optarg; break;
case 'n':
- args->ncsq_max = 2 * atoi(optarg);
- if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ args->ncsq2_max = 2 * atoi(optarg);
+ if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
break;
case 'o': args->output_fname = optarg; break;
case 'O':
default: error("The output type \"%s\" not recognised\n", optarg);
}
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': regions_list = optarg; break;
case 'R': regions_list = optarg; regions_is_file = 1; break;
case 's': args->sample_list = optarg; break;
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016-2020 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "dist.h"
+
+extern void error(const char *format, ...);
+
+struct _dist_t
+{
+ uint64_t *bins, nvalues; // bins: per-bin counters, grown on demand by dist_insert(); nvalues: running total of inserted values
+ int nbins; // current length of the bins array
+ int npow; // the number of orders of magnitude to represent exactly
+ int nexact; // pow(10,npow); values up to nexact go to the bin with the same index
+ int nlevel; // nexact - pow(10,npow-1); number of bins given to each order of magnitude above nexact
+};
+
+/*
+ *  dist_init() - allocate an empty distribution
+ *  @npow: the number of orders of magnitude to bin exactly
+ *
+ *  The structure is zero-initialised, no bins are allocated until the first
+ *  insert. Release it with dist_destroy().
+ */
+dist_t *dist_init(int npow)
+{
+    dist_t *self = (dist_t*) calloc(1, sizeof(*self));
+    self->npow   = npow;
+    self->nexact = pow(10, npow);
+    // number of bins per order of magnitude above the exact range
+    self->nlevel = self->nexact - pow(10, npow - 1);
+    return self;
+}
+
+/*
+ *  dist_destroy() - free the bin array and the structure itself.
+ *  Passing NULL is a no-op.
+ */
+void dist_destroy(dist_t *dist)
+{
+    if ( dist )
+    {
+        free(dist->bins);
+        free(dist);
+    }
+}
+
+// Number of bins currently allocated, i.e. one more than the highest bin index used so far
+int dist_nbins(dist_t *dist)
+{
+    const int nbins = dist->nbins;
+    return nbins;
+}
+
+// Total count of inserted values; the 64-bit counter is narrowed to int on return
+int dist_nvalues(dist_t *dist)
+{
+    return (int) dist->nvalues;
+}
+
+/*
+ *  dist_insert() - count one occurrence of value
+ *
+ *  Values up to nexact are stored in the bin with the same index. Larger
+ *  values go to bins whose width is a power of ten that grows with every
+ *  order of magnitude, nlevel bins per order. The bin array is enlarged
+ *  and the new part zero-filled when the target bin does not exist yet.
+ *
+ *  Returns the index of the bin that was incremented.
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value)
+{
+    int ibin;
+
+    if ( value <= dist->nexact )
+        ibin = value;
+    else
+    {
+        int npow  = (int) log10(value);     // order of magnitude of the value
+        int level = npow - dist->npow + 1;  // 1-based number of orders above the exact range
+        uint32_t step = pow(10, level);     // bin width at this level
+        ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step;
+    }
+
+    if ( ibin >= dist->nbins )
+    {
+        // go through a temporary so that the old array is not lost if realloc fails
+        uint64_t *bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1));
+        if ( !bins ) error("Error: could not allocate %d distribution bins\n", ibin+1);
+        dist->bins = bins;
+        memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins));
+        dist->nbins = ibin+1;
+    }
+    dist->bins[ibin]++;
+    dist->nvalues++;
+    return ibin;
+}
+/*
+ *  dist_insert_n() - count cnt occurrences of value
+ *
+ *  Returns the index of the bin the value falls into, or 0 when cnt is zero
+ *  and nothing is inserted.
+ */
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt)
+{
+    if ( !cnt ) return 0;
+    int ibin = dist_insert(dist, value);
+    // dist_insert() has already accounted for one occurrence in both counters
+    dist->bins[ibin] += cnt - 1;
+    dist->nvalues    += cnt - 1;
+    return ibin;
+}
+
+/*
+ *  dist_get() - retrieve the count stored in a bin together with its range
+ *  @idx:     0-based bin index
+ *  @beg,end: if not NULL, filled with the half-open interval [beg,end)
+ *            of values covered by the bin
+ *
+ *  Returns the count of the bin. A bin beyond the allocated ones holds no
+ *  values, 0 is returned for it instead of reading past the array.
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end)
+{
+    if ( idx < dist->nexact )
+    {
+        // exact range: one value per bin
+        if ( beg ) *beg = idx;
+        if ( end ) *end = idx + 1;
+    }
+    else
+    {
+        int level = (idx - dist->nexact) / dist->nlevel + 1;    // order of magnitude above the exact range
+        int bin   = idx - dist->nexact - dist->nlevel*(level-1);   // position within the level
+
+        uint32_t step  = pow(10, level);
+        uint32_t value = pow(10, level + dist->npow - 1) + step*bin;
+
+        if ( beg ) *beg = value;
+        if ( end ) *end = value + step;
+    }
+    if ( idx >= (uint32_t) dist->nbins ) return 0;
+    return dist->bins[idx];
+}
+
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016-2020 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "dist.h"
+
+extern void error(const char *format, ...);
+
+struct _dist_t
+{
+ uint64_t *bins, nvalues; // bins: per-bin counters, grown on demand by dist_insert(); nvalues: running total of inserted values
+ int nbins; // current length of the bins array
+ int npow; // the number of orders of magnitude to represent exactly
+ int nexact; // pow(10,npow); values up to nexact go to the bin with the same index
+ int nlevel; // nexact - pow(10,npow-1); number of bins given to each order of magnitude above nexact
+};
+
+/*
+ *  dist_init() - allocate an empty distribution
+ *  @npow: the number of orders of magnitude to bin exactly
+ *
+ *  The structure is zero-initialised, no bins are allocated until the first
+ *  insert. Release it with dist_destroy().
+ */
+dist_t *dist_init(int npow)
+{
+    dist_t *self = (dist_t*) calloc(1, sizeof(*self));
+    self->npow   = npow;
+    self->nexact = pow(10, npow);
+    // number of bins per order of magnitude above the exact range
+    self->nlevel = self->nexact - pow(10, npow - 1);
+    return self;
+}
+
+/*
+ *  dist_destroy() - free the bin array and the structure itself.
+ *  Passing NULL is a no-op.
+ */
+void dist_destroy(dist_t *dist)
+{
+    if ( dist )
+    {
+        free(dist->bins);
+        free(dist);
+    }
+}
+
+// Number of bins currently allocated, i.e. one more than the highest bin index used so far
+int dist_nbins(dist_t *dist)
+{
+    const int nbins = dist->nbins;
+    return nbins;
+}
+
+// Total count of inserted values; the 64-bit counter is narrowed to int on return
+int dist_nvalues(dist_t *dist)
+{
+    return (int) dist->nvalues;
+}
+
+/*
+ *  dist_insert() - count one occurrence of value
+ *
+ *  Values up to nexact are stored in the bin with the same index. Larger
+ *  values go to bins whose width is a power of ten that grows with every
+ *  order of magnitude, nlevel bins per order. The bin array is enlarged
+ *  and the new part zero-filled when the target bin does not exist yet.
+ *
+ *  Returns the index of the bin that was incremented.
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value)
+{
+    int ibin;
+
+    if ( value <= dist->nexact )
+        ibin = value;
+    else
+    {
+        int npow  = (int) log10(value);     // order of magnitude of the value
+        int level = npow - dist->npow + 1;  // 1-based number of orders above the exact range
+        uint32_t step = pow(10, level);     // bin width at this level
+        ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step;
+    }
+
+    if ( ibin >= dist->nbins )
+    {
+        // go through a temporary so that the old array is not lost if realloc fails
+        uint64_t *bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1));
+        if ( !bins ) error("Error: could not allocate %d distribution bins\n", ibin+1);
+        dist->bins = bins;
+        memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins));
+        dist->nbins = ibin+1;
+    }
+    dist->bins[ibin]++;
+    dist->nvalues++;
+    return ibin;
+}
+/*
+ *  dist_insert_n() - count cnt occurrences of value
+ *
+ *  Returns the index of the bin the value falls into, or 0 when cnt is zero
+ *  and nothing is inserted.
+ */
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt)
+{
+    if ( !cnt ) return 0;
+    int ibin = dist_insert(dist, value);
+    // dist_insert() has already accounted for one occurrence in both counters
+    dist->bins[ibin] += cnt - 1;
+    dist->nvalues    += cnt - 1;
+    return ibin;
+}
+
+/*
+ *  dist_get() - retrieve the count stored in a bin together with its range
+ *  @idx:     0-based bin index
+ *  @beg,end: if not NULL, filled with the half-open interval [beg,end)
+ *            of values covered by the bin
+ *
+ *  Returns the count of the bin. A bin beyond the allocated ones holds no
+ *  values, 0 is returned for it instead of reading past the array.
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end)
+{
+    if ( idx < dist->nexact )
+    {
+        // exact range: one value per bin
+        if ( beg ) *beg = idx;
+        if ( end ) *end = idx + 1;
+    }
+    else
+    {
+        int level = (idx - dist->nexact) / dist->nlevel + 1;    // order of magnitude above the exact range
+        int bin   = idx - dist->nexact - dist->nlevel*(level-1);   // position within the level
+
+        uint32_t step  = pow(10, level);
+        uint32_t value = pow(10, level + dist->npow - 1) + step*bin;
+
+        if ( beg ) *beg = value;
+        if ( end ) *end = value + step;
+    }
+    if ( idx >= (uint32_t) dist->nbins ) return 0;
+    return dist->bins[idx];
+}
+
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016-2020 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Logarithmic binning
+
+ Example of usage:
+
+ // Initialize, make the binning exact up to 10^4, then add a log-step
+ dist_t *dist = dist_init(4);
+
+ // Insert values
+ int i;
+ for (i=0; i<1e6; i++)
+ dist_insert(dist, i);
+
+ // Number of bins used
+ int n = dist_nbins(dist);
+
+ // Now print the distribution
+ uint32_t beg, end;
+ for (i=0; i<n; i++)
+ {
+ // Raw count in the bin. The boundaries beg,end are optional,
+ // and can be used to plot correctly the density
+ uint64_t cnt = dist_get(dist, i, &beg, &end);
+ if ( !cnt ) continue;
+
+ // Print the interval, count and density
+ printf("%u\t%u\t%"PRIu64"\t%f\n", beg, end, cnt, (double)cnt/(end-beg));
+ }
+
+ // Clean up
+ dist_destroy(dist);
+ */
+
+#ifndef __DIST_H__
+#define __DIST_H__
+
+#include <stdio.h>
+#include <inttypes.h>
+
+typedef struct _dist_t dist_t;
+
+/*
+ * dist_init() - init bins
+ * @npow: the number of orders of magnitude to represent exactly
+ * dist_destroy() - free the structure, NULL is accepted
+ */
+dist_t *dist_init(int npow);
+void dist_destroy(dist_t *dist);
+
+/*
+ dist_nbins() - get the number of bins
+ */
+int dist_nbins(dist_t *dist);
+
+/*
+ dist_nvalues() - get the total number of values inserted
+ */
+int dist_nvalues(dist_t *dist);
+
+/*
+ dist_insert() - insert new value
+ dist_insert_n() - insert new value n times
+ Both return the index of the bin the value falls into; dist_insert_n()
+ returns 0 when n is 0 and nothing is inserted
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value);
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt);
+
+/*
+ dist_get() - get the count stored in a bin
+ @idx: from the interval [0,dist_nbins-1]
+ @beg,end: optional (can be NULL), the bin covers the values [beg,end)
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end);
+
+#endif
+
/* em.c -- mathematical functions.
Copyright (C) 2010, 2011 Broad Institute.
- Portions copyright (C) 2013 Genome Research Ltd.
+ Portions copyright (C) 2013-2014 Genome Research Ltd.
Author: Heng Li <lh3@live.co.uk>
/* em.c -- mathematical functions.
Copyright (C) 2010, 2011 Broad Institute.
- Portions copyright (C) 2013 Genome Research Ltd.
+ Portions copyright (C) 2013-2014 Genome Research Ltd.
Author: Heng Li <lh3@live.co.uk>
--- /dev/null
+/* extsort.c -- sort on disk
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h> // for unlink()
+#include <sys/stat.h> // for chmod()
+#include <assert.h>
+#include <fcntl.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+#include "bcftools.h"
+#include "extsort.h"
+#include "kheap.h"
+
+typedef struct
+{
+ extsort_t *es; // this is to get access to extsort_cmp_f from kheap
+ int fd; // descriptor of the temporary file backing this block; -1 once the file has been read to the end and closed
+ char *fname; // name of the temporary file
+ void *dat; // buffer of es->dat_size bytes that the block's records are read into, one at a time
+}
+blk_t;
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr);
+KHEAP_INIT(blk, blk_t*, blk_is_smaller) /* defines khp_blk_t */
+
+struct _extsort_t
+{
+ size_t dat_size, mem, max_mem; // record size in bytes; memory accounted for the buffered records; limit above which the buffer is flushed to disk
+ char *tmp_prefix; // temporary file name template obtained from init_tmp_prefix()
+ extsort_cmp_f cmp; // record comparison function, also passed to qsort()
+
+ size_t nbuf, mbuf, nblk; // nbuf,mbuf: handed to hts_expand() as the used and allocated size of buf; nblk: number of blocks written to disk
+ blk_t **blk; // the blocks written to disk, nblk of them
+ void **buf, *tmp_dat; // buf: records waiting to be flushed; tmp_dat: dat_size bytes, the record returned by extsort_shift()
+ khp_blk_t *bhp; // heap of blocks, created by extsort_sort()
+};
+
+// Heap ordering callback: returns 1 when the current record of block *aptr sorts
+// strictly before the current record of *bptr according to the user's comparison
+// function, 0 otherwise
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+    blk_t *lhs = *aptr, *rhs = *bptr;
+    // the comparison function is given the addresses of the record pointers
+    return lhs->es->cmp(&lhs->dat, &rhs->dat) < 0 ? 1 : 0;
+}
+
+size_t parse_mem_string(const char *str);
+
+/*
+ *  extsort_set() - set one option
+ *  @key:   the option to set
+ *  @value: address of a variable holding the new value: a size_t for DAT_SIZE,
+ *          a const char* for MAX_MEM and TMP_PREFIX, an extsort_cmp_f for FUNC_CMP
+ *
+ *  Unknown keys are ignored.
+ */
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value)
+{
+    switch (key)
+    {
+        case DAT_SIZE:
+            es->dat_size = *((size_t*)value);
+            break;
+
+        case MAX_MEM:
+            es->max_mem = parse_mem_string(*((const char**)value));
+            if ( es->max_mem <=0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value));
+            break;
+
+        case TMP_PREFIX:
+        {
+            // release the prefix set by an earlier call, it would leak otherwise
+            char *prefix = init_tmp_prefix(*((const char**)value));
+            free(es->tmp_prefix);
+            es->tmp_prefix = prefix;
+            break;
+        }
+
+        case FUNC_CMP:
+            es->cmp = *((extsort_cmp_f*)value);
+            break;
+
+        default:
+            break;
+    }
+}
+
+// Create a sorter with all fields zeroed except the memory limit, which defaults to 100e6 bytes
+extsort_t *extsort_alloc(void)
+{
+    extsort_t *sorter = (extsort_t*) calloc(1, sizeof(extsort_t));
+    sorter->max_mem = 100e6;
+    return sorter;
+}
+// Finish the setup once the options have been set: the comparison function and
+// the record size are mandatory, the temporary file prefix falls back to
+// init_tmp_prefix(NULL)
+void extsort_init(extsort_t *es)
+{
+    assert( es->cmp );
+    assert( es->dat_size );
+    if ( es->tmp_prefix==NULL )
+        es->tmp_prefix = init_tmp_prefix(NULL);
+    es->tmp_dat = malloc(es->dat_size);     // the record handed out by extsort_shift()
+}
+
+// Close the temporary files that are still open and free all memory held by the sorter
+void extsort_destroy(extsort_t *es)
+{
+    int iblk = 0;
+    while ( iblk < es->nblk )
+    {
+        blk_t *cur = es->blk[iblk++];
+        if ( cur->fd!=-1 )
+        {
+#ifdef _WIN32
+            _close(cur->fd);
+#else
+            close(cur->fd);
+#endif
+        }
+        free(cur->fname);
+        free(cur->dat);
+        free(cur);
+    }
+    free(es->tmp_dat);
+    free(es->tmp_prefix);
+    free(es->blk);
+    khp_destroy(blk, es->bhp);
+    free(es);
+}
+
+/*
+ *  _buf_flush() - sort the records buffered in memory and write them out as a new block
+ *
+ *  Creates a temporary file from es->tmp_prefix, writes the sorted records to
+ *  it, frees them and rewinds the file so that it can be read back later.
+ *  Does nothing when the buffer is empty.
+ */
+static void _buf_flush(extsort_t *es)
+{
+    int i;
+    if ( !es->nbuf ) return;
+
+    // buf holds pointers to the records, the comparison function receives pointers to these pointers
+    qsort(es->buf, es->nbuf, sizeof(void*), es->cmp);
+
+    es->nblk++;
+    es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk);
+    es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t));
+    blk_t *blk = es->blk[es->nblk-1];
+    blk->es    = es;
+    blk->dat   = malloc(es->dat_size);
+    blk->fname = strdup(es->tmp_prefix);
+    #ifdef _WIN32
+        for (i=0; i<100000; i++)
+        {
+            // restore the template before each attempt, mktemp() modifies it in place
+            memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix));
+            mktemp(blk->fname);
+            blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600);
+            if ( blk->fd==-1 )
+            {
+                if ( errno==EEXIST ) continue;  // the name is taken, try another one
+                error("Error: failed to open a temporary file %s\n",blk->fname);
+            }
+            break;
+        }
+        // when every attempt collided the descriptor is left at -1, not 0
+        if ( blk->fd==-1 ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix);
+        if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+    #else
+        if ( (blk->fd = mkstemp(blk->fname))==-1 )
+            error("Error: failed to open a temporary file %s\n",blk->fname);
+        if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+        unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open
+    #endif
+
+    // write the records in sorted order; a short write is treated as an error
+    for (i=0; i<es->nbuf; i++)
+    {
+        #ifdef _WIN32
+            if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #else
+            if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #endif
+        free(es->buf[i]);
+    }
+#ifdef _WIN32
+    if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+    if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+
+    // the buffer is empty again
+    es->nbuf = 0;
+    es->mem  = 0;
+}
+
+/*
+ *  extsort_push() - add one record
+ *  @dat: the record; only the pointer is stored and the record is freed after
+ *        it has been written to a temporary file
+ *
+ *  The records buffered so far are flushed to disk first if adding this one
+ *  would exceed the memory limit.
+ */
+void extsort_push(extsort_t *es, void *dat)
+{
+    // size_t rather than int so that the size_t record size is not truncated
+    size_t delta = sizeof(void*) + es->dat_size;
+    if ( es->nbuf && es->mem + delta > es->max_mem ) _buf_flush(es);
+    es->nbuf++;
+    es->mem += delta;
+    hts_expand(void*, es->nbuf, es->mbuf, es->buf);
+    es->buf[es->nbuf-1] = dat;
+}
+
+// Read the next record of the block into blk->dat. Returns the number of bytes
+// read, or 0 when the block has no more records; on reaching the end the file
+// is closed and blk->fd set to -1. Failed and incomplete reads are reported
+// through error().
+static ssize_t _blk_read(extsort_t *es, blk_t *blk)
+{
+    if ( blk->fd==-1 ) return 0;
+
+    ssize_t nread;
+#ifdef _WIN32
+    nread = _read(blk->fd, blk->dat, es->dat_size);
+#else
+    nread = read(blk->fd, blk->dat, es->dat_size);
+#endif
+    if ( nread < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname);
+    if ( nread != 0 )
+    {
+        if ( nread < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname);
+        return nread;
+    }
+
+    // end of file, the block is exhausted
+#ifdef _WIN32
+    if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#else
+    if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#endif
+    blk->fd = -1;
+    return nread;
+}
+
+// Write out the records that are still buffered and prepare for reading them
+// back in order: each block is rewound, its first record is read, and blocks
+// that yielded a record are placed on the heap
+void extsort_sort(extsort_t *es)
+{
+    _buf_flush(es);
+    free(es->buf);
+    es->buf = NULL;
+    es->bhp = khp_init(blk);
+
+    int iblk;
+    for (iblk=0; iblk<es->nblk; iblk++)
+    {
+        blk_t *cur = es->blk[iblk];
+#ifdef _WIN32
+        if ( _lseek(cur->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", cur->fname);
+#else
+        if ( lseek(cur->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", cur->fname);
+#endif
+        int nread = _blk_read(es, cur);
+        if ( nread!=0 ) khp_insert(blk, es->bhp, &cur);
+    }
+}
+
+// Return the next record in sorted order, or NULL when the heap is empty.
+// The returned buffer belongs to the sorter and is reused by later calls.
+void *extsort_shift(extsort_t *es)
+{
+    if ( es->bhp->ndat==0 ) return NULL;
+    blk_t *top = es->bhp->dat[0];
+
+    // exchange the block's buffer with tmp_dat so that the read below does not
+    // overwrite the record that is about to be returned
+    void *record = top->dat;
+    top->dat     = es->tmp_dat;
+    es->tmp_dat  = record;
+    khp_delete(blk, es->bhp);
+
+    // put the block back on the heap if it has another record
+    int nread = _blk_read(es, top);
+    if ( nread!=0 ) khp_insert(blk, es->bhp, &top);
+
+    return es->tmp_dat;
+}
+
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* extsort.c -- sort on disk
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h> // for unlink()
+#include <sys/stat.h> // for chmod()
+#include <assert.h>
+#include <fcntl.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+#include "bcftools.h"
+#include "extsort.h"
+#include "kheap.h"
+
+typedef struct
+{
+ extsort_t *es; // this is to get access to extsort_cmp_f from kheap
+ int fd; // descriptor of the temporary file backing this block; -1 once the file has been read to the end and closed
+ char *fname; // name of the temporary file
+ void *dat; // buffer of es->dat_size bytes that the block's records are read into, one at a time
+}
+blk_t;
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr);
+KHEAP_INIT(blk, blk_t*, blk_is_smaller) /* defines khp_blk_t */
+
+struct _extsort_t
+{
+ size_t dat_size, mem, max_mem; // record size in bytes; memory accounted for the buffered records; limit above which the buffer is flushed to disk
+ char *tmp_prefix; // temporary file name template obtained from init_tmp_prefix()
+ extsort_cmp_f cmp; // record comparison function, also passed to qsort()
+
+ size_t nbuf, mbuf, nblk; // nbuf,mbuf: handed to hts_expand() as the used and allocated size of buf; nblk: number of blocks written to disk
+ blk_t **blk; // the blocks written to disk, nblk of them
+ void **buf, *tmp_dat; // buf: records waiting to be flushed; tmp_dat: dat_size bytes, the record returned by extsort_shift()
+ khp_blk_t *bhp; // heap of blocks, created by extsort_sort()
+};
+
+// Heap ordering callback: returns 1 when the current record of block *aptr sorts
+// strictly before the current record of *bptr according to the user's comparison
+// function, 0 otherwise
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+    blk_t *lhs = *aptr, *rhs = *bptr;
+    // the comparison function is given the addresses of the record pointers
+    return lhs->es->cmp(&lhs->dat, &rhs->dat) < 0 ? 1 : 0;
+}
+
+size_t parse_mem_string(const char *str);
+
+/*
+ *  extsort_set() - set one option
+ *  @key:   the option to set
+ *  @value: address of a variable holding the new value: a size_t for DAT_SIZE,
+ *          a const char* for MAX_MEM and TMP_PREFIX, an extsort_cmp_f for FUNC_CMP
+ *
+ *  Unknown keys are ignored.
+ */
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value)
+{
+    switch (key)
+    {
+        case DAT_SIZE:
+            es->dat_size = *((size_t*)value);
+            break;
+
+        case MAX_MEM:
+            es->max_mem = parse_mem_string(*((const char**)value));
+            if ( es->max_mem <=0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value));
+            break;
+
+        case TMP_PREFIX:
+        {
+            // release the prefix set by an earlier call, it would leak otherwise
+            char *prefix = init_tmp_prefix(*((const char**)value));
+            free(es->tmp_prefix);
+            es->tmp_prefix = prefix;
+            break;
+        }
+
+        case FUNC_CMP:
+            es->cmp = *((extsort_cmp_f*)value);
+            break;
+
+        default:
+            break;
+    }
+}
+
+// Create a sorter with all fields zeroed except the memory limit, which defaults to 100e6 bytes
+extsort_t *extsort_alloc(void)
+{
+    extsort_t *sorter = (extsort_t*) calloc(1, sizeof(extsort_t));
+    sorter->max_mem = 100e6;
+    return sorter;
+}
+// Finish the setup once the options have been set: the comparison function and
+// the record size are mandatory, the temporary file prefix falls back to
+// init_tmp_prefix(NULL)
+void extsort_init(extsort_t *es)
+{
+    assert( es->cmp );
+    assert( es->dat_size );
+    if ( es->tmp_prefix==NULL )
+        es->tmp_prefix = init_tmp_prefix(NULL);
+    es->tmp_dat = malloc(es->dat_size);     // the record handed out by extsort_shift()
+}
+
+// Close the temporary files that are still open and free all memory held by the sorter
+void extsort_destroy(extsort_t *es)
+{
+    int iblk = 0;
+    while ( iblk < es->nblk )
+    {
+        blk_t *cur = es->blk[iblk++];
+        if ( cur->fd!=-1 )
+        {
+#ifdef _WIN32
+            _close(cur->fd);
+#else
+            close(cur->fd);
+#endif
+        }
+        free(cur->fname);
+        free(cur->dat);
+        free(cur);
+    }
+    free(es->tmp_dat);
+    free(es->tmp_prefix);
+    free(es->blk);
+    khp_destroy(blk, es->bhp);
+    free(es);
+}
+
+/*
+ *  _buf_flush() - sort the records buffered in memory and write them out as a new block
+ *
+ *  Creates a temporary file from es->tmp_prefix, writes the sorted records to
+ *  it, frees them and rewinds the file so that it can be read back later.
+ *  Does nothing when the buffer is empty.
+ */
+static void _buf_flush(extsort_t *es)
+{
+    int i;
+    if ( !es->nbuf ) return;
+
+    // buf holds pointers to the records, the comparison function receives pointers to these pointers
+    qsort(es->buf, es->nbuf, sizeof(void*), es->cmp);
+
+    es->nblk++;
+    es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk);
+    es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t));
+    blk_t *blk = es->blk[es->nblk-1];
+    blk->es    = es;
+    blk->dat   = malloc(es->dat_size);
+    blk->fname = strdup(es->tmp_prefix);
+    #ifdef _WIN32
+        for (i=0; i<100000; i++)
+        {
+            // restore the template before each attempt, mktemp() modifies it in place
+            memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix));
+            mktemp(blk->fname);
+            blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600);
+            if ( blk->fd==-1 )
+            {
+                if ( errno==EEXIST ) continue;  // the name is taken, try another one
+                error("Error: failed to open a temporary file %s\n",blk->fname);
+            }
+            break;
+        }
+        // when every attempt collided the descriptor is left at -1, not 0
+        if ( blk->fd==-1 ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix);
+        if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+    #else
+        if ( (blk->fd = mkstemp(blk->fname))==-1 )
+            error("Error: failed to open a temporary file %s\n",blk->fname);
+        if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+        unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open
+    #endif
+
+    // write the records in sorted order; a short write is treated as an error
+    for (i=0; i<es->nbuf; i++)
+    {
+        #ifdef _WIN32
+            if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #else
+            if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #endif
+        free(es->buf[i]);
+    }
+#ifdef _WIN32
+    if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+    if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+
+    // the buffer is empty again
+    es->nbuf = 0;
+    es->mem  = 0;
+}
+
+/*
+ *  extsort_push() - add one record
+ *  @dat: the record; only the pointer is stored and the record is freed after
+ *        it has been written to a temporary file
+ *
+ *  The records buffered so far are flushed to disk first if adding this one
+ *  would exceed the memory limit.
+ */
+void extsort_push(extsort_t *es, void *dat)
+{
+    // size_t rather than int so that the size_t record size is not truncated
+    size_t delta = sizeof(void*) + es->dat_size;
+    if ( es->nbuf && es->mem + delta > es->max_mem ) _buf_flush(es);
+    es->nbuf++;
+    es->mem += delta;
+    hts_expand(void*, es->nbuf, es->mbuf, es->buf);
+    es->buf[es->nbuf-1] = dat;
+}
+
+// Read the next record of the block into blk->dat. Returns the number of bytes
+// read, or 0 when the block has no more records; on reaching the end the file
+// is closed and blk->fd set to -1. Failed and incomplete reads are reported
+// through error().
+static ssize_t _blk_read(extsort_t *es, blk_t *blk)
+{
+    if ( blk->fd==-1 ) return 0;
+
+    ssize_t nread;
+#ifdef _WIN32
+    nread = _read(blk->fd, blk->dat, es->dat_size);
+#else
+    nread = read(blk->fd, blk->dat, es->dat_size);
+#endif
+    if ( nread < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname);
+    if ( nread != 0 )
+    {
+        if ( nread < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname);
+        return nread;
+    }
+
+    // end of file, the block is exhausted
+#ifdef _WIN32
+    if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#else
+    if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#endif
+    blk->fd = -1;
+    return nread;
+}
+
+// Write out the records that are still buffered and prepare for reading them
+// back in order: each block is rewound, its first record is read, and blocks
+// that yielded a record are placed on the heap
+void extsort_sort(extsort_t *es)
+{
+    _buf_flush(es);
+    free(es->buf);
+    es->buf = NULL;
+    es->bhp = khp_init(blk);
+
+    int iblk;
+    for (iblk=0; iblk<es->nblk; iblk++)
+    {
+        blk_t *cur = es->blk[iblk];
+#ifdef _WIN32
+        if ( _lseek(cur->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", cur->fname);
+#else
+        if ( lseek(cur->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", cur->fname);
+#endif
+        int nread = _blk_read(es, cur);
+        if ( nread!=0 ) khp_insert(blk, es->bhp, &cur);
+    }
+}
+
+// Return the next record in sorted order, or NULL when the heap is empty.
+// The returned buffer belongs to the sorter and is reused by later calls.
+void *extsort_shift(extsort_t *es)
+{
+    if ( es->bhp->ndat==0 ) return NULL;
+    blk_t *top = es->bhp->dat[0];
+
+    // exchange the block's buffer with tmp_dat so that the read below does not
+    // overwrite the record that is about to be returned
+    void *record = top->dat;
+    top->dat     = es->tmp_dat;
+    es->tmp_dat  = record;
+    khp_delete(blk, es->bhp);
+
+    // put the block back on the heap if it has another record
+    int nread = _blk_read(es, top);
+    if ( nread!=0 ) khp_insert(blk, es->bhp, &top);
+
+    return es->tmp_dat;
+}
+
--- /dev/null
+/* extsort.h -- sort on disk
+
+ Copyright (C) 2020 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#ifndef __EXTSORT_H__
+#define __EXTSORT_H__
+
+//todo: return status to all functions
+
+typedef struct _extsort_t extsort_t;
+
+typedef int (*extsort_cmp_f) (const void *aptr, const void *bptr); // qsort()-style comparison; NOTE(review): the sorter appears to pass pointers to the record pointers, not the records themselves - confirm against the implementation
+
+// Options accepted by extsort_set(); each comment gives the type of the value and its meaning
+typedef enum
+{
+ DAT_SIZE, // size_t .. assuming constant size records for now
+ TMP_PREFIX, // const char* .. prefix of temporary files, XXXXXX will be appended
+ MAX_MEM, // const char* .. maximum memory to use, e.g. 100MB
+ FUNC_CMP, // extsort_cmp_f .. sort function
+}
+extsort_opt_t;
+
+#define extsort_set_opt(es,type,key,value) { type tmp = value; extsort_set(es, key, (void*)&tmp); } // stores value in a temporary of the given type and passes its address to extsort_set()
+
+extsort_t *extsort_alloc(void); // create a new sorter
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value); // value points to a variable of the type listed with the key in extsort_opt_t
+void extsort_init(extsort_t *es); // call after the options have been set
+void extsort_push(extsort_t *es, void *dat); // dat will be freed by extsort later
+void extsort_sort(extsort_t *es); // call after the last extsort_push()
+void *extsort_shift(extsort_t *es); // get the next record; NOTE(review): presumably returns NULL when no records are left - confirm
+void extsort_destroy(extsort_t *es); // release the sorter
+
+#endif
/* filter.c -- filter expressions.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <ctype.h>
#include <stdlib.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <math.h>
#include <sys/types.h>
# define __FUNCTION__ __func__
#endif
-static const uint64_t bcf_double_missing = 0x7ff0000000000001;
-static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
-static inline void bcf_double_set(double *ptr, uint64_t value)
-{
- union { uint64_t i; double d; } u;
- u.i = value;
- *ptr = u.d;
-}
-static inline int bcf_double_test(double d, uint64_t value)
-{
- union { uint64_t i; double d; } u;
- u.d = d;
- return u.i==value ? 1 : 0;
-}
-#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
-#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
-#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
-#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
-#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
-
-
typedef struct _token_t
{
// read-only values, same for all VCF lines
int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
- int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
- uint8_t *usmpl; // bitmask of used samples as set by idx
+ uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
int nsamples; // number of samples for format fields, 0 for info and other fields
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
#define TOK_PHRED 29
#define TOK_MEDIAN 30
#define TOK_STDEV 31
-
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"
+#define TOK_sMAX 32
+#define TOK_sMIN 33
+#define TOK_sAVG 34
+#define TOK_sMEDIAN 35
+#define TOK_sSTDEV 36
+#define TOK_sSUM 37
+#define TOK_IN 38 // contains, e.g. FILTER~"A"
+#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently
// Return negative values if it is a function with variable number of arguments
static int filters_next_token(char **str, int *len)
tmp = *str;
}
+ if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
+ if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
+ if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
+ if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
+ if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
+ if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
+ if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
+ if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
+ if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
+ if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int i;
- if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match
+ if ( rtok->tok_type==TOK_NOT_IN )
{
if ( !line->d.n_flt )
{
rtok->pass_site = 1;
return;
}
- else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
+ else if ( rtok->tok_type==TOK_IN )
{
if ( !line->d.n_flt )
{
if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
return;
}
+ else if ( rtok->tok_type==TOK_NE ) // exact match
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return; // missing value
+ rtok->pass_site = 1;
+ return; // no filter present, eval to true
+ }
+ if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present
+ rtok->pass_site = 1;
+ return;
+ }
+ else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+ return; // no filter present, eval to false
+ }
+ if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+ return;
+ }
else
- error("Only == and != operators are supported for FILTER\n");
+ error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
return;
}
static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
tok->nvalues = 0;
return;
}
- if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
-
+
int j,nmissing = 0;
- for (i=0; i<line->n_sample; i++)
- {
- int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
- for (j=0; j<fmt->n; j++)
- {
- if ( ptr[j]==bcf_int8_vector_end ) break;
- if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
- }
+ #define BRANCH(type_t, is_vector_end) { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( ptr[j]==is_vector_end ) break; \
+ if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+ } \
+ } \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break;
}
+ #undef BRANCH
tok->nvalues = 1;
tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
}
static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- if ( nstack==0 ) error("Error parsing the expresion\n");
+ if ( nstack==0 ) error("Error parsing the expression\n");
token_t *tok = stack[nstack - 1];
if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
-
- rtok->nsamples = tok->nsamples;
- memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
-
assert(tok->usmpl);
- if ( !rtok->usmpl )
- {
- rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
- memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
- }
int i, npass = 0;
- for (i=0; i<rtok->nsamples; i++)
+ for (i=0; i<tok->nsamples; i++)
{
- if ( !rtok->usmpl[i] ) continue;
- if ( rtok->pass_samples[i] ) npass++;
+ if ( !tok->usmpl[i] ) continue;
+ if ( tok->pass_samples[i] ) npass++;
}
-
- hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values);
- double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
- rtok->nval1 = 1;
- rtok->nvalues = rtok->nsamples;
-
- // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats
- // consider only the passing site AND samples. The values for failed samples is set to -1 so
- // that it can never conflict with valid expressions.
- for (i=0; i<rtok->nsamples; i++)
- rtok->values[i] = rtok->pass_samples[i] ? value : -1;
+ hts_expand(double,1,rtok->mvalues,rtok->values);
+ rtok->nsamples = 0;
+ rtok->nvalues = 1;
+ rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
return 1;
}
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = -HUGE_VAL;
- int i, has_value = 0;
- for (i=0; i<tok->nvalues; i++)
+ double *ptr, val = -HUGE_VAL;
+ int i,j, has_value = 0;
+ if ( tok->nsamples )
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- has_value = 1;
- if ( val < tok->values[i] ) val = tok->values[i];
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ has_value = 1;
+ if ( val < ptr[j] ) val = ptr[j];
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val < tok->values[i] ) val = tok->values[i];
+ }
}
if ( has_value )
{
}
return 1;
}
+// Per-sample maximum: collapses each sample's vector of tok->nval1 values into
+// a single value (the largest non-missing one). Samples with no usable value
+// are set to missing; samples not flagged in usmpl are left untouched.
+// Tokens without per-sample data are delegated to func_max().
+static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( !src->nsamples ) return func_max(flt,line,rtok,stack,nstack);
+
+    // the result carries exactly one value per sample
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues = src->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the mask of used samples from the input token
+    assert(src->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl, ival;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals = src->values + ismpl*src->nval1;
+        double best = -HUGE_VAL;
+        int found = 0;
+        for (ival=0; ival<src->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            found = 1;
+            if ( best < vals[ival] ) best = vals[ival];
+        }
+        if ( !found ) bcf_double_set_missing(rtok->values[ismpl]);
+        else rtok->values[ismpl] = best;
+    }
+    return 1;
+}
static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = HUGE_VAL;
- int i, has_value = 0;
- for (i=0; i<tok->nvalues; i++)
+ double *ptr, val = HUGE_VAL;
+ int i,j, has_value = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ has_value = 1;
+ if ( val > ptr[j] ) val = ptr[j];
+ }
+ }
+ }
+ else
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- has_value = 1;
- if ( val > tok->values[i] ) val = tok->values[i];
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val > tok->values[i] ) val = tok->values[i];
+ }
}
if ( has_value )
{
}
return 1;
}
+// Per-sample minimum: collapses each sample's vector of tok->nval1 values into
+// a single value (the smallest non-missing one). Samples with no usable value
+// are set to missing; samples not flagged in usmpl are left untouched.
+// Tokens without per-sample data are delegated to func_min().
+static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( !src->nsamples ) return func_min(flt,line,rtok,stack,nstack);
+
+    // the result carries exactly one value per sample
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues = src->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the mask of used samples from the input token
+    assert(src->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl, ival;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals = src->values + ismpl*src->nval1;
+        double best = HUGE_VAL;
+        int found = 0;
+        for (ival=0; ival<src->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            found = 1;
+            if ( best > vals[ival] ) best = vals[ival];
+        }
+        if ( !found ) bcf_double_set_missing(rtok->values[ismpl]);
+        else rtok->values[ismpl] = best;
+    }
+    return 1;
+}
static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = 0;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ double *ptr, val = 0;
+ int i,j, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ val += ptr[j];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ }
if ( n )
{
rtok->values[0] = val / n;
}
return 1;
}
+// Per-sample mean: collapses each sample's vector of tok->nval1 values into
+// the arithmetic mean of its non-missing entries. Samples with no usable value
+// are set to missing; samples not flagged in usmpl are left untouched.
+// Tokens without per-sample data are delegated to func_avg().
+static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( !src->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
+
+    // the result carries exactly one value per sample
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues = src->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the mask of used samples from the input token
+    assert(src->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl, ival;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals = src->values + ismpl*src->nval1;
+        double total = 0;
+        int cnt = 0;
+        for (ival=0; ival<src->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            total += vals[ival];
+            cnt++;
+        }
+        if ( !cnt ) bcf_double_set_missing(rtok->values[ismpl]);
+        else rtok->values[ismpl] = total / cnt;
+    }
+    return 1;
+}
static int compare_doubles(const void *lhs, const void *rhs)
{
double arg1 = *(const double*) lhs;
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
+ // sweep through all tok->values and while excluding all missing values reuse the very same array
+ int i,j,k = 0, n = 0;
+ if ( tok->nsamples )
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- if ( n < i ) tok->values[n] = tok->values[i];
- n++;
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+ if ( n < k ) tok->values[n] = tok->values[k];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ if ( n < i ) tok->values[n] = tok->values[i];
+ n++;
+ }
}
if ( !n ) return 1;
if ( n==1 ) rtok->values[0] = tok->values[0];
rtok->nvalues = 1;
return 1;
}
+// Per-sample median: collapses each sample's vector of tok->nval1 values into
+// the median of its non-missing entries. Samples with no usable value are set
+// to missing; samples not flagged in usmpl are left untouched.
+// Note that the input token's values are compacted and sorted in place.
+static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the site-level median, not the mean
+    if ( !tok->nsamples ) return func_median(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // move the non-missing values to the front of this sample's slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = ptr[0];
+        else
+        {
+            qsort(ptr, n, sizeof(double), compare_doubles);
+            // odd count: middle element; even count: mean of the two middle elements
+            rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
+        }
+    }
+    return 1;
+}
static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
+ // sweep through all tok->values and while excluding all missing values reuse the very same array
+ int i,j,k = 0, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+ if ( n < k ) tok->values[n] = tok->values[k];
+ n++;
+ }
+ }
+ }
+ else
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- if ( n < i ) tok->values[n] = tok->values[i];
- n++;
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ if ( n < i ) tok->values[n] = tok->values[i];
+ n++;
+ }
}
if ( !n ) return 1;
if ( n==1 ) rtok->values[0] = 0;
else
{
double sdev = 0, avg = 0;
- for (i=0; i<n; i++) avg += tok->values[n];
+ for (i=0; i<n; i++) avg += tok->values[i];
avg /= n;
- for (i=0; i<n; i++) sdev += (tok->values[n] - avg) * (tok->values[n] - avg);
+ for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
rtok->values[0] = sqrt(sdev/n);
}
rtok->nvalues = 1;
return 1;
}
+// Per-sample standard deviation: collapses each sample's vector of tok->nval1
+// values into the population standard deviation (divides by n) of its
+// non-missing entries. A sample with a single usable value yields 0, a sample
+// with none is set to missing; samples not flagged in usmpl are left untouched.
+// Note that the input token's values are compacted in place.
+static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the site-level standard deviation, not the mean
+    if ( !tok->nsamples ) return func_stddev(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // move the non-missing values to the front of this sample's slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = 0;
+        else
+        {
+            double sdev = 0, avg = 0;
+            for (j=0; j<n; j++) avg += ptr[j];
+            avg /= n;
+            for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
+            rtok->values[i] = sqrt(sdev/n);
+        }
+    }
+    return 1;
+}
static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
rtok->nvalues = 0;
token_t *tok = stack[nstack - 1];
if ( !tok->nvalues ) return 1;
- double val = 0;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ double *ptr, val = 0;
+ int i,j, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ val += ptr[j];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ }
if ( n )
{
rtok->values[0] = val;
}
return 1;
}
+// Per-sample sum: collapses each sample's vector of tok->nval1 values into the
+// sum of its non-missing entries. Samples with no usable value are set to
+// missing; samples not flagged in usmpl are left untouched.
+static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the site-level sum, not the mean
+    if ( !tok->nsamples ) return func_sum(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, has_value;
+    double val, *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        val = 0;
+        has_value = 0;
+        ptr = tok->values + i*tok->nval1;
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            has_value = 1;
+            val += ptr[j];
+        }
+        // distinguish "no data" (missing) from a genuine sum of zero
+        if ( has_value ) rtok->values[i] = val;
+        else bcf_double_set_missing(rtok->values[i]);
+    }
+    return 1;
+}
static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
-
+ rtok->nsamples = tok->nsamples;
rtok->nvalues = tok->nvalues;
+ rtok->nval1 = tok->nval1;
+ hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+ if ( tok->usmpl )
+ {
+ if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+ }
if ( !tok->nvalues ) return 1;
hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
- int i;
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
- else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+ int i,j,k = 0;
+ if ( tok->usmpl )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+ else rtok->values[k] = fabs(tok->values[k]);
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( tok->usmpl && !tok->usmpl[i] ) continue;
+ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+ }
+ }
return 1;
}
static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
- int i, cnt = 0;
- if ( !tok->nsamples )
+ int i,j, cnt = 0;
+ if ( tok->tag && tok->nsamples )
{
- if ( tok->is_str )
+ // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
+ if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
+ for (i=0; i<tok->nsamples; i++)
{
- if ( tok->str_value.l ) cnt = 1;
- for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+ if ( !tok->usmpl[i] ) continue;
+ double *ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
}
- else
- cnt = tok->nvalues;
}
- else
+ else if ( tok->nsamples )
{
+ // number of samples that pass a processed FMT tag
for (i=0; i<tok->nsamples; i++)
if ( tok->pass_samples[i] ) cnt++;
}
+ else if ( tok->is_str )
+ {
+ if ( tok->str_value.l ) cnt = 1;
+ for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+ }
+ else
+ cnt = tok->nvalues;
rtok->nvalues = 1;
rtok->values[0] = cnt;
if ( !tok->nvalues ) return 1;
hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
- int i;
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
- else rtok->values[i] = -4.34294481903*log(tok->values[i]);
-
+ int i,j,k = 0;
+ if ( tok->usmpl )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+ else rtok->values[k] = -4.34294481903*log(tok->values[k]);
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else rtok->values[i] = -4.34294481903*log(tok->values[i]);
+ }
+ }
return 1;
}
inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
}
- memset(rtok->pass_samples, 0, rtok->nsamples);
+ if (rtok->nsamples)
+ memset(rtok->pass_samples, 0, rtok->nsamples);
}
#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
rtok->values[i] = atok->values[i] AOP btok->values[i]; \
} \
} \
+ else if ( atok->nsamples ) \
+ { \
+ assert( btok->nvalues==1 ); \
+ if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
+ { \
+ for (i=0; i<atok->nvalues; i++) \
+ { \
+ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
+ { \
+ bcf_double_set_missing(rtok->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+ } \
+ } \
+ } \
else \
{ \
- token_t *xtok = atok->nsamples ? atok : btok; \
- token_t *ytok = atok->nsamples ? btok : atok; \
- assert( ytok->nvalues==1 ); \
- if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \
+ assert( atok->nvalues==1 ); \
+ if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
{ \
- for (i=0; i<xtok->nvalues; i++) \
+ for (i=0; i<btok->nvalues; i++) \
{ \
- if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \
+ if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
{ \
bcf_double_set_missing(rtok->values[i]); \
continue; \
} \
has_values = 1; \
- rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
+ rtok->values[i] = atok->values[0] AOP btok->values[i]; \
} \
} \
} \
return 2;
}
-#define CMP_MISSING(atok,btok,CMP_OP,ret) \
-{ \
- if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
- token_t *tok = (atok)->is_missing ? (btok) : (atok); \
- (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
- tok->nvalues = 1; \
-}
-
#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
{ \
token_t *rtok = _rtok; \
} \
} \
} \
- else \
+ else if ( atok->nsamples )\
+ { \
+ for (i=0; i<atok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *aptr = atok->values + i*atok->nval1; \
+ double *bptr = btok->values + i*btok->nval1; \
+ for (j=0; j<atok->nval1; j++) \
+ { \
+ int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
+ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+ for (k=0; k<btok->nvalues; k++) \
+ { \
+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
+ { \
+ if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ } \
+ } \
+ } \
+ else /* btok->nsamples */ \
{ \
- token_t *xtok = atok->nsamples ? atok : btok; \
- token_t *ytok = atok->nsamples ? btok : atok; \
- for (i=0; i<xtok->nsamples; i++) \
+ for (i=0; i<btok->nsamples; i++) \
{ \
if ( !rtok->usmpl[i] ) continue; \
- double *xptr = xtok->values + i*xtok->nval1; \
- double *yptr = ytok->values + i*ytok->nval1; \
- for (j=0; j<xtok->nval1; j++) \
+ double *aptr = atok->values + i*atok->nval1; \
+ double *bptr = btok->values + i*btok->nval1; \
+ for (j=0; j<btok->nval1; j++) \
{ \
- int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \
+ int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
- for (k=0; k<ytok->nvalues; k++) \
+ for (k=0; k<atok->nvalues; k++) \
{ \
- int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \
+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
if ( nmiss ) \
{ \
- if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
- else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \
+ else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
{ \
- if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
- else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
} \
} \
{
int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
- if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s);
+ if ( is_info && is_fmt )
+ error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
}
if ( is_fmt==-1 ) is_fmt = 0;
}
// Additionally, treat "." as missing value rather than a string in numeric equalities; that
// @file is only used with ID; etc.
// This code is fragile: improve me.
+ static int comma_separator_warned = 0;
int i;
for (i=0; i<nout; i++)
{
if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
+ if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
+ {
+ int print_note = 0;
+ if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
+ else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
+ if ( print_note && !comma_separator_warned )
+ {
+ comma_separator_warned = 1;
+ fprintf(stderr,
+ "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
+ " (Search the manual for \"Comma in strings\" to learn more.)\n");
+ }
+ }
if ( out[i].tok_type!=TOK_VAL ) continue;
if ( !out[i].tag ) continue;
if ( out[i].setter==filters_set_type )
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
int itok = i, ival;
if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
- else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
- else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
- else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
- else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
{
return flt->max_unpack;
}
+
+// Returns the values array of the token stored at filter->flt_stack[0] and reports its
+// dimensions through the output arguments:
+//   nval  .. total number of values
+//   nval1 .. the token's nval1 (values per sample)
+// When the token carries no values, its pass_site flag is stored in values[0] and
+// returned as a single value instead.
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
+{
+    token_t *top = filter->flt_stack[0];
+    if ( !top->nvalues )
+    {
+        // nothing to hand out but the site-level pass/fail status; the array must already exist
+        if ( !top->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
+        *nval  = 1;
+        *nval1 = 1;
+        top->values[0] = top->pass_site;
+        return top->values;
+    }
+    *nval  = top->nvalues;
+    *nval1 = top->nval1;
+    return top->values;
+}
+
+// Overwrites the used-sample mask (usmpl) of every filter token that has samples with
+// the caller-supplied array; samples[j] is copied to usmpl[j] for j < nsamples.
+// Tokens with nsamples==0 are left untouched.
+void filter_set_samples(filter_t *filter, const uint8_t *samples)
+{
+    int itok;
+    for (itok=0; itok<filter->nfilters; itok++)
+    {
+        int ismpl = 0;
+        while ( ismpl < filter->filters[itok].nsamples )
+        {
+            filter->filters[itok].usmpl[ismpl] = samples[ismpl];
+            ismpl++;
+        }
+    }
+}
+
/* filter.c -- filter expressions.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <ctype.h>
#include <stdlib.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <math.h>
#include <sys/types.h>
# define __FUNCTION__ __func__
#endif
-static const uint64_t bcf_double_missing = 0x7ff0000000000001;
-static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
-static inline void bcf_double_set(double *ptr, uint64_t value)
-{
- union { uint64_t i; double d; } u;
- u.i = value;
- *ptr = u.d;
-}
-static inline int bcf_double_test(double d, uint64_t value)
-{
- union { uint64_t i; double d; } u;
- u.d = d;
- return u.i==value ? 1 : 0;
-}
-#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
-#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
-#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
-#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
-#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
-
-
typedef struct _token_t
{
// read-only values, same for all VCF lines
int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
- int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
- uint8_t *usmpl; // bitmask of used samples as set by idx
+ uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
int nsamples; // number of samples for format fields, 0 for info and other fields
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
#define TOK_PHRED 29
#define TOK_MEDIAN 30
#define TOK_STDEV 31
-
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"
+#define TOK_sMAX 32
+#define TOK_sMIN 33
+#define TOK_sAVG 34
+#define TOK_sMEDIAN 35
+#define TOK_sSTDEV 36
+#define TOK_sSUM 37
+#define TOK_IN 38 // contains, e.g. FILTER~"A"
+#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently
// Return negative values if it is a function with variable number of arguments
static int filters_next_token(char **str, int *len)
tmp = *str;
}
+ if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
+ if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
+ if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
+ if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
+ if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
+ if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
+ if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
+ if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
+ if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
+ if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
+ if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int i;
- if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match
+ if ( rtok->tok_type==TOK_NOT_IN )
{
if ( !line->d.n_flt )
{
rtok->pass_site = 1;
return;
}
- else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
+ else if ( rtok->tok_type==TOK_IN )
{
if ( !line->d.n_flt )
{
if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
return;
}
+ else if ( rtok->tok_type==TOK_NE ) // exact match
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) return; // missing value
+ rtok->pass_site = 1;
+ return; // no filter present, eval to true
+ }
+ if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present
+ rtok->pass_site = 1;
+ return;
+ }
+ else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present
+ {
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+ return; // no filter present, eval to false
+ }
+ if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+ return;
+ }
else
- error("Only == and != operators are supported for FILTER\n");
+ error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
return;
}
static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
- default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+ default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
}
#undef BRANCH
return -1; // this shouldn't happen
tok->nvalues = 0;
return;
}
- if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
-
+
int j,nmissing = 0;
- for (i=0; i<line->n_sample; i++)
- {
- int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
- for (j=0; j<fmt->n; j++)
- {
- if ( ptr[j]==bcf_int8_vector_end ) break;
- if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
- }
+ #define BRANCH(type_t, is_vector_end) { \
+ for (i=0; i<line->n_sample; i++) \
+ { \
+ type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+ for (j=0; j<fmt->n; j++) \
+ { \
+ if ( ptr[j]==is_vector_end ) break; \
+ if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+ } \
+ } \
+ }
+ switch (fmt->type) {
+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+ default: fprintf(bcftools_stderr,"todo: type %d\n", fmt->type); bcftools_exit(1); break;
}
+ #undef BRANCH
tok->nvalues = 1;
tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
}
static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- if ( nstack==0 ) error("Error parsing the expresion\n");
+ if ( nstack==0 ) error("Error parsing the expression\n");
token_t *tok = stack[nstack - 1];
if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
-
- rtok->nsamples = tok->nsamples;
- memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
-
assert(tok->usmpl);
- if ( !rtok->usmpl )
- {
- rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
- memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
- }
int i, npass = 0;
- for (i=0; i<rtok->nsamples; i++)
+ for (i=0; i<tok->nsamples; i++) // count directly on the argument token, no copy into rtok is made
{
- if ( !rtok->usmpl[i] ) continue;
- if ( rtok->pass_samples[i] ) npass++;
+ if ( !tok->usmpl[i] ) continue; // skip samples not flagged in the argument's usmpl mask
+ if ( tok->pass_samples[i] ) npass++;
}
-
- hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values);
- double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
- rtok->nval1 = 1;
- rtok->nvalues = rtok->nsamples;
-
- // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats
- // consider only the passing site AND samples. The values for failed samples is set to -1 so
- // that it can never conflict with valid expressions.
- for (i=0; i<rtok->nsamples; i++)
- rtok->values[i] = rtok->pass_samples[i] ? value : -1;
+ hts_expand(double,1,rtok->mvalues,rtok->values); // room for exactly one output value
+ rtok->nsamples = 0; // the result is a single value rather than a per-sample vector
+ rtok->nvalues = 1;
+ rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); // tag starting with 'N': the count; otherwise npass divided by line->n_sample (0 when there are no samples)
return 1;
}
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = -HUGE_VAL;
- int i, has_value = 0;
- for (i=0; i<tok->nvalues; i++)
+ double *ptr, val = -HUGE_VAL;
+ int i,j, has_value = 0;
+ if ( tok->nsamples )
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- has_value = 1;
- if ( val < tok->values[i] ) val = tok->values[i];
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ has_value = 1;
+ if ( val < ptr[j] ) val = ptr[j];
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val < tok->values[i] ) val = tok->values[i];
+ }
}
if ( has_value )
{
}
return 1;
}
+// Per-sample maximum. For a token with samples the result holds one value per sample
+// (nval1=1): the largest non-missing value of that sample, or missing if the sample has
+// none. Samples not flagged in usmpl are skipped and their output slot is not written.
+// A token without samples is handed over to func_max().
+static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack);
+
+    // shape the result as nsamples x 1 and inherit the argument's sample mask
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals = tok->values + ismpl*tok->nval1;
+        double best  = -HUGE_VAL;
+        int ival, nfound = 0;
+        for (ival=0; ival<tok->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            nfound++;
+            if ( best < vals[ival] ) best = vals[ival];
+        }
+        if ( nfound ) rtok->values[ismpl] = best;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = HUGE_VAL;
- int i, has_value = 0;
- for (i=0; i<tok->nvalues; i++)
+ double *ptr, val = HUGE_VAL;
+ int i,j, has_value = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ has_value = 1;
+ if ( val > ptr[j] ) val = ptr[j];
+ }
+ }
+ }
+ else
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- has_value = 1;
- if ( val > tok->values[i] ) val = tok->values[i];
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val > tok->values[i] ) val = tok->values[i];
+ }
}
if ( has_value )
{
}
return 1;
}
+// Per-sample minimum. For a token with samples the result holds one value per sample
+// (nval1=1): the smallest non-missing value of that sample, or missing if the sample has
+// none. Samples not flagged in usmpl are skipped and their output slot is not written.
+// A token without samples is handed over to func_min().
+static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack);
+
+    // shape the result as nsamples x 1 and inherit the argument's sample mask
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals  = tok->values + ismpl*tok->nval1;
+        double lowest = HUGE_VAL;
+        int ival, nfound = 0;
+        for (ival=0; ival<tok->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            nfound++;
+            if ( lowest > vals[ival] ) lowest = vals[ival];
+        }
+        if ( nfound ) rtok->values[ismpl] = lowest;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- double val = 0;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ double *ptr, val = 0;
+ int i,j, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ val += ptr[j];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ }
if ( n )
{
rtok->values[0] = val / n;
}
return 1;
}
+// Per-sample arithmetic mean. For a token with samples the result holds one value per
+// sample (nval1=1): the mean of that sample's non-missing values, or missing if the
+// sample has none. Samples not flagged in usmpl are skipped and their output slot is
+// not written. A token without samples is handed over to func_avg().
+static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
+
+    // shape the result as nsamples x 1 and inherit the argument's sample mask
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+        double *vals = tok->values + ismpl*tok->nval1;
+        double total = 0;
+        int ival, nused = 0;
+        for (ival=0; ival<tok->nval1; ival++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+            total += vals[ival];
+            nused++;
+        }
+        if ( nused ) rtok->values[ismpl] = total / nused;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
static int compare_doubles(const void *lhs, const void *rhs)
{
double arg1 = *(const double*) lhs;
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
+ // sweep through all tok->values and while excluding all missing values reuse the very same array
+ int i,j,k = 0, n = 0;
+ if ( tok->nsamples )
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- if ( n < i ) tok->values[n] = tok->values[i];
- n++;
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+ if ( n < k ) tok->values[n] = tok->values[k];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ if ( n < i ) tok->values[n] = tok->values[i];
+ n++;
+ }
}
if ( !n ) return 1;
if ( n==1 ) rtok->values[0] = tok->values[0];
rtok->nvalues = 1;
return 1;
}
+// Per-sample median. For a token with samples the result holds one value per sample
+// (nval1=1): the median of that sample's non-missing values, or missing if the sample
+// has none. Samples not flagged in usmpl are skipped and their output slot is not
+// written. Returns 1 on the per-sample path.
+//
+// A token without samples is handed over to func_median(). (This fallback previously
+// called func_avg(), which made sMEDIAN() on non-FORMAT values return the mean.)
+//
+// Side effect: each sample's slice of tok->values is compacted and sorted in place.
+static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_median(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // move the non-missing values of this sample to the front of its slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = ptr[0];
+        else
+        {
+            // odd count: the middle element; even count: mean of the two middle elements
+            qsort(ptr, n, sizeof(double), compare_doubles);
+            rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
+        }
+    }
+    return 1;
+}
static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
rtok->nvalues = 0;
if ( !tok->nvalues ) return 1;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
+ // sweep through all tok->values and while excluding all missing values reuse the very same array
+ int i,j,k = 0, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+ if ( n < k ) tok->values[n] = tok->values[k];
+ n++;
+ }
+ }
+ }
+ else
{
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
- if ( n < i ) tok->values[n] = tok->values[i];
- n++;
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+ if ( n < i ) tok->values[n] = tok->values[i];
+ n++;
+ }
}
if ( !n ) return 1;
if ( n==1 ) rtok->values[0] = 0;
else
{
double sdev = 0, avg = 0;
- for (i=0; i<n; i++) avg += tok->values[n];
+ for (i=0; i<n; i++) avg += tok->values[i];
avg /= n;
- for (i=0; i<n; i++) sdev += (tok->values[n] - avg) * (tok->values[n] - avg);
+ for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
rtok->values[0] = sqrt(sdev/n);
}
rtok->nvalues = 1;
return 1;
}
+// Per-sample standard deviation. For a token with samples the result holds one value
+// per sample (nval1=1): sqrt(sum((x-mean)^2)/n) over that sample's non-missing values,
+// 0 when there is a single value, or missing if the sample has none. Samples not
+// flagged in usmpl are skipped and their output slot is not written. Returns 1 on the
+// per-sample path.
+//
+// A token without samples is handed over to func_stddev(). (This fallback previously
+// called func_avg(), which made sSTDEV() on non-FORMAT values return the mean.)
+//
+// Side effect: each sample's slice of tok->values is compacted in place.
+static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_stddev(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // move the non-missing values of this sample to the front of its slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = 0;
+        else
+        {
+            double sdev = 0, avg = 0;
+            for (j=0; j<n; j++) avg += ptr[j];
+            avg /= n;
+            for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
+            rtok->values[i] = sqrt(sdev/n);     // divides by n, not n-1
+        }
+    }
+    return 1;
+}
static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
rtok->nvalues = 0;
token_t *tok = stack[nstack - 1];
if ( !tok->nvalues ) return 1;
- double val = 0;
- int i, n = 0;
- for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ double *ptr, val = 0;
+ int i,j, n = 0;
+ if ( tok->nsamples )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+ val += ptr[j];
+ n++;
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+ }
if ( n )
{
rtok->values[0] = val;
}
return 1;
}
+// Per-sample sum. For a token with samples the result holds one value per sample
+// (nval1=1): the sum of that sample's non-missing values, or missing if the sample has
+// none. Samples not flagged in usmpl are skipped and their output slot is not written.
+// Returns 1 on the per-sample path.
+//
+// A token without samples is handed over to func_sum(). (This fallback previously
+// called func_avg(), which made sSUM() on non-FORMAT values return the mean.)
+static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_sum(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, has_value;
+    double val, *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        val = 0;
+        has_value = 0;
+        ptr = tok->values + i*tok->nval1;
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            has_value = 1;
+            val += ptr[j];
+        }
+        // a sample with only missing values yields missing rather than 0
+        if ( has_value ) rtok->values[i] = val;
+        else bcf_double_set_missing(rtok->values[i]);
+    }
+    return 1;
+}
static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
-
+ rtok->nsamples = tok->nsamples; // the result takes over the argument's nsamples/nvalues/nval1 layout
rtok->nvalues = tok->nvalues;
+ rtok->nval1 = tok->nval1;
+ hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+ if ( tok->usmpl )
+ {
+ if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples); // allocated once, reused on later calls
+ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples); // inherit the argument's used-sample mask
+ }
if ( !tok->nvalues ) return 1;
hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
- int i;
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
- else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+ int i,j,k = 0; // k is the running index into the flat values arrays
+ if ( tok->usmpl )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; } // unused sample: step k over its nval1 slots, output left unwritten
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]); // missing and vector-end inputs both become missing
+ else rtok->values[k] = fabs(tok->values[k]);
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( tok->usmpl && !tok->usmpl[i] ) continue; // never true here: this branch is reached only when tok->usmpl is NULL
+ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); // vector-end inputs leave rtok->values[i] unwritten
+ }
+ }
return 1;
}
static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
token_t *tok = stack[nstack - 1];
- int i, cnt = 0;
- if ( !tok->nsamples )
+ int i,j, cnt = 0;
+ if ( tok->tag && tok->nsamples )
{
- if ( tok->is_str )
+ // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
+ if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
+ for (i=0; i<tok->nsamples; i++)
{
- if ( tok->str_value.l ) cnt = 1;
- for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+ if ( !tok->usmpl[i] ) continue;
+ double *ptr = tok->values + i*tok->nval1;
+ for (j=0; j<tok->nval1; j++)
+ if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
}
- else
- cnt = tok->nvalues;
}
- else
+ else if ( tok->nsamples )
{
+ // number of samples that pass a processed FMT tag
for (i=0; i<tok->nsamples; i++)
if ( tok->pass_samples[i] ) cnt++;
}
+ else if ( tok->is_str )
+ {
+ if ( tok->str_value.l ) cnt = 1;
+ for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+ }
+ else
+ cnt = tok->nvalues;
rtok->nvalues = 1;
rtok->values[0] = cnt;
if ( !tok->nvalues ) return 1;
hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
- int i;
- for (i=0; i<tok->nvalues; i++)
- if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
- else rtok->values[i] = -4.34294481903*log(tok->values[i]);
-
+ int i,j,k = 0;
+ if ( tok->usmpl )
+ {
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+ for (j=0; j<tok->nval1; k++,j++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+ else rtok->values[k] = -4.34294481903*log(tok->values[k]);
+ }
+ }
+ }
+ else
+ {
+ for (i=0; i<tok->nvalues; i++)
+ {
+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else rtok->values[i] = -4.34294481903*log(tok->values[i]);
+ }
+ }
return 1;
}
inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
}
- memset(rtok->pass_samples, 0, rtok->nsamples);
+ if (rtok->nsamples)
+ memset(rtok->pass_samples, 0, rtok->nsamples);
}
#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
rtok->values[i] = atok->values[i] AOP btok->values[i]; \
} \
} \
+ else if ( atok->nsamples ) \
+ { \
+ assert( btok->nvalues==1 ); \
+ if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
+ { \
+ for (i=0; i<atok->nvalues; i++) \
+ { \
+ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
+ { \
+ bcf_double_set_missing(rtok->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+ } \
+ } \
+ } \
else \
{ \
- token_t *xtok = atok->nsamples ? atok : btok; \
- token_t *ytok = atok->nsamples ? btok : atok; \
- assert( ytok->nvalues==1 ); \
- if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \
+ assert( atok->nvalues==1 ); \
+ if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
{ \
- for (i=0; i<xtok->nvalues; i++) \
+ for (i=0; i<btok->nvalues; i++) \
{ \
- if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \
+ if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
{ \
bcf_double_set_missing(rtok->values[i]); \
continue; \
} \
has_values = 1; \
- rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
+ rtok->values[i] = atok->values[0] AOP btok->values[i]; \
} \
} \
} \
return 2;
}
-#define CMP_MISSING(atok,btok,CMP_OP,ret) \
-{ \
- if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
- token_t *tok = (atok)->is_missing ? (btok) : (atok); \
- (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
- tok->nvalues = 1; \
-}
-
#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
{ \
token_t *rtok = _rtok; \
} \
} \
} \
- else \
+ else if ( atok->nsamples )\
+ { \
+ for (i=0; i<atok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *aptr = atok->values + i*atok->nval1; \
+ double *bptr = btok->values + i*btok->nval1; \
+ for (j=0; j<atok->nval1; j++) \
+ { \
+ int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
+ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+ for (k=0; k<btok->nvalues; k++) \
+ { \
+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
+ { \
+ if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+ } \
+ } \
+ } \
+ } \
+ else /* btok->nsamples */ \
{ \
- token_t *xtok = atok->nsamples ? atok : btok; \
- token_t *ytok = atok->nsamples ? btok : atok; \
- for (i=0; i<xtok->nsamples; i++) \
+ for (i=0; i<btok->nsamples; i++) \
{ \
if ( !rtok->usmpl[i] ) continue; \
- double *xptr = xtok->values + i*xtok->nval1; \
- double *yptr = ytok->values + i*ytok->nval1; \
- for (j=0; j<xtok->nval1; j++) \
+ double *aptr = atok->values + i*atok->nval1; \
+ double *bptr = btok->values + i*btok->nval1; \
+ for (j=0; j<btok->nval1; j++) \
{ \
- int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \
+ int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
- for (k=0; k<ytok->nvalues; k++) \
+ for (k=0; k<atok->nvalues; k++) \
{ \
- int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \
+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
if ( nmiss ) \
{ \
- if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
- else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \
+ else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
{ \
- if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
- else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
} \
} \
} \
{
int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
- if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s);
+ if ( is_info && is_fmt )
+ error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
}
if ( is_fmt==-1 ) is_fmt = 0;
}
// Additionally, treat "." as missing value rather than a string in numeric equalities; that
// @file is only used with ID; etc.
// This code is fragile: improve me.
+ static int comma_separator_warned = 0;
int i;
for (i=0; i<nout; i++)
{
if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
+ if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
+ {
+ int print_note = 0;
+ if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
+ else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
+ if ( print_note && !comma_separator_warned )
+ {
+ comma_separator_warned = 1;
+ fprintf(bcftools_stderr,
+ "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
+ " (Search the manual for \"Comma in strings\" to learn more.)\n");
+ }
+ }
if ( out[i].tok_type!=TOK_VAL ) continue;
if ( !out[i].tag ) continue;
if ( out[i].setter==filters_set_type )
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
int itok = i, ival;
if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
- else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
- else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
- else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
- else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
{
return flt->max_unpack;
}
+
+/*
+ * filter_get_doubles() - hand out the values array of the first token on
+ * the filter stack (filter->flt_stack[0]).
+ * @filter: filter to read from; flt_stack[0] is dereferenced without a check
+ * @nval: set to tok->nvalues, or to 1 when tok->nvalues is zero
+ * @nval1: set to tok->nval1, or to 1 when tok->nvalues is zero
+ *
+ * Returns tok->values. When the token carries no values, values[0] is
+ * overwritten with the token's pass_site flag so that a single number
+ * is still reported.
+ * NOTE(review): the returned pointer refers to the token's own buffer;
+ * presumably it must not be freed by the caller -- confirm.
+ */
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
+{
+ token_t *tok = filter->flt_stack[0];
+ if ( tok->nvalues )
+ {
+ // The token holds values: report their count and the nval1 field as they are
+ *nval = tok->nvalues;
+ *nval1 = tok->nval1;
+ }
+ else
+ {
+ // No values: the buffer must already exist because values[0] is written below
+ if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
+ *nval = 1;
+ *nval1 = 1;
+ // filter->flt_stack[0] is the same token as tok
+ tok->values[0] = filter->flt_stack[0]->pass_site;
+ }
+ return tok->values;
+}
+
+/*
+ * filter_set_samples() - copy a per-sample mask into every token that
+ * has per-sample data.
+ * @filter: filter whose filters[0..nfilters-1] entries are updated
+ * @samples: array indexed by sample; samples[j] is copied verbatim into usmpl[j]
+ *
+ * Entries with nsamples==0 are left untouched. For all others the first
+ * nsamples elements of @samples are read, so the array must be at least
+ * that long.
+ */
+void filter_set_samples(filter_t *filter, const uint8_t *samples)
+{
+ int i,j;
+ for (i=0; i<filter->nfilters; i++)
+ {
+ // Skip tokens without per-sample values
+ if ( !filter->filters[i].nsamples ) continue;
+ for (j=0; j<filter->filters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j];
+ }
+}
+
/* filter.h -- filter expressions.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
*/
int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples);
+/**
+ * filter_set_samples() - restrict filtering expression to samples.
+ * Call after filter_init().
+ * @samples: use samples set to 1, ignore samples set to 0
+ */
+void filter_set_samples(filter_t *filter, const uint8_t *samples);
+
+/**
+ * filter_get_doubles() - return a pointer to values from the last filter_test() evaluation
+ */
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1);
+
void filter_expression_info(FILE *fp);
int filter_max_unpack(filter_t *filter);
#include <htslib/hts.h>
#include <htslib/kstring.h>
#include <stdlib.h>
+#include <assert.h>
#include "bcftools.h"
#include "hclust.h"
#include <htslib/hts.h>
#include <htslib/kstring.h>
#include <stdlib.h>
+#include <assert.h>
#include "bcftools.h"
#include "hclust.h"
+++ /dev/null
-[Files in this distribution outwith the cram/ subdirectory are distributed
-according to the terms of the following MIT/Expat license.]
-
-The MIT/Expat License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
-
-[Files within the cram/ subdirectory in this distribution are distributed
-according to the terms of the following Modified 3-Clause BSD license.]
-
-The Modified-BSD License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
- nor the names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-[The use of a range of years within a copyright notice in this distribution
-should be interpreted as being equivalent to a list of years including the
-first and last year specified and all consecutive years between them.
-
-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
-2011-2012" should be interpreted as being identical to a notice that reads
-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
-that reads "Copyright (C) 2005-2012" should be interpreted as being identical
-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
-2011, 2012".]
+++ /dev/null
-HTSlib is an implementation of a unified C library for accessing common file
-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
-data. It is the core library used by samtools and bcftools.
-
-See INSTALL for building and installation instructions.
/* main.c -- main bcftools command front-end.
- Copyright (C) 2012-2018 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#endif
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
+int main_mpileup(int argc, char *argv[]);
int main_sort(int argc, char *argv[]);
typedef struct
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
- { .func = bam_mpileup,
+ { .func = main_mpileup,
.alias = "mpileup",
.help = "multi-way pileup producing genotype likelihoods"
},
if (argc < 2) { usage(stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
/* main.c -- main bcftools command front-end.
- Copyright (C) 2012-2018 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#endif
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
+int main_mpileup(int argc, char *argv[]);
int main_sort(int argc, char *argv[]);
typedef struct
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
- { .func = bam_mpileup,
+ { .func = main_mpileup,
.alias = "mpileup",
.help = "multi-way pileup producing genotype likelihoods"
},
if (argc < 2) { usage(bcftools_stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2016 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
+#include <assert.h>
#include <math.h>
#include <inttypes.h>
+#include <ctype.h>
#include <htslib/kfunc.h>
#include <htslib/khash_str2int.h>
#include "call.h"
+#include "prob1.h"
// Using priors for GTs does not seem to be mathematically justified. Although
// it seems effective in removing false calls, it also flips a significant
// genotypes is reported instead.
#define FLAT_PDG_FOR_MISSING 0
+int test16(float *anno16, anno16_t *a);
void qcall_init(call_t *call) { return; }
void qcall_destroy(call_t *call) { return; }
if ( !call->sample_groups )
{
// standard pooled calling, all samples in the same group
- grp_t *grps = &call->smpl_grp;
- grps->ngrp = 1;
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int));
+ call->nsmpl_grp = 1;
+ call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
+ call->smpl_grp[0].nsmpl = nsmpl;
+ call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
+ for (i=0; i<nsmpl; i++)
+ call->smpl_grp[0].smpl[i] = i;
+ return;
+ }
+
+ if ( call->sample_groups_tag )
+ {
+ // Is the tag defined in the header?
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
+ if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
+ }
+ else
+ {
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
+ if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
+ else
+ {
+ tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
+ if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
+ else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
+ }
}
- else if ( !strcmp("-",call->sample_groups) )
+
+ // Read samples/groups
+ if ( !strcmp("-",call->sample_groups) )
{
// single-sample calling, each sample creates its own group
- grp_t *grps = &call->smpl_grp;
- grps->ngrp = nsmpl;
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
- for (i=0; i<nsmpl; i++) grps->smpl2grp[i] = i;
+ call->nsmpl_grp = nsmpl;
+ call->smpl_grp = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
+ for (i=0; i<nsmpl; i++)
+ {
+ call->smpl_grp[i].nsmpl = 1;
+ call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
+ call->smpl_grp[i].smpl[0] = i;
+ }
}
else
{
char **lines = hts_readlist(call->sample_groups, 1, &nlines);
if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
- uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+ uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+ uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
void *grp2idx = khash_str2int_init();
- grp_t *grps = &call->smpl_grp;
+ call->nsmpl_grp = 0;
for (i=0; i<nlines; i++)
{
char *ptr = lines[i];
- while ( *ptr && *ptr!='\t' ) ptr++;
+ while ( *ptr && !isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
- *ptr = 0;
+ char *tmp = ptr;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
+ *tmp = 0;
int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
if ( ismpl<0 ) continue;
- if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
+ if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
if ( !khash_str2int_has_key(grp2idx,ptr+1) )
{
- khash_str2int_inc(grp2idx, ptr+1);
- grps->ngrp++;
+ khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
+ call->nsmpl_grp++;
}
- int igrp;
- if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 )
- smpl2grp1[ismpl] = igrp+1;
- else
+ int igrp = -1;
+ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
error("This should not happen, fixme: %s\n",ptr+1);
+ grp2n[igrp]++;
+ smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples
}
khash_str2int_destroy(grp2idx);
+ if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
+ call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
for (i=0; i<nsmpl; i++)
{
- if ( !smpl2grp1[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
- grps->smpl2grp[i] = smpl2grp1[i] - 1;
+ if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
+ int igrp = smpl2grp[i] - 1;
+ if ( !call->smpl_grp[igrp].nsmpl )
+ call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
+ call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
+ call->smpl_grp[igrp].nsmpl++;
}
- free(smpl2grp1);
+ free(smpl2grp);
+ free(grp2n);
for (i=0; i<nlines; i++) free(lines[i]);
free(lines);
}
static void destroy_sample_groups(call_t *call)
{
int i;
- grp_t *grps = &call->smpl_grp;
- for (i=0; i<grps->ngrp; i++)
- free(grps->grp[i].qsum);
- free(grps->grp);
- free(grps->smpl2grp);
+ for (i=0; i<call->nsmpl_grp; i++)
+ {
+ free(call->smpl_grp[i].qsum);
+ free(call->smpl_grp[i].smpl);
+ }
+ free(call->smpl_grp);
}
void mcall_init(call_t *call)
{
+ init_sample_groups(call);
call_init_pl2p(call);
call->nals_map = 5;
if ( call->output_tags & CALL_FMT_GQ )
bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
if ( call->output_tags & CALL_FMT_GP )
- bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
- bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
- bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+ if ( call->output_tags & CALL_FMT_PV4 )
+ bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
// init the prior
if ( call->theta>0 )
}
call->theta = log(call->theta);
}
-
- init_sample_groups(call);
}
void mcall_destroy(call_t *call)
free(call->pdg);
free(call->als);
free(call->ac);
- free(call->qsum);
return;
}
}
// Create mapping between old and new (trimmed) alleles
-void init_allele_trimming_maps(call_t *call, int als, int nals)
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
{
- int i, j;
+ int i, j, nout = 0;
// als_map: old(i) -> new(j)
- for (i=0, j=0; i<nals; i++)
+ for (i=0; i<nals_ori; i++)
{
- if ( als & 1<<i ) call->als_map[i] = j++;
+ if ( als_out & (1<<i) ) call->als_map[i] = nout++;
else call->als_map[i] = -1;
}
// pl_map: new(k) -> old(l)
int k = 0, l = 0;
- for (i=0; i<nals; i++)
+ for (i=0; i<nals_ori; i++)
{
for (j=0; j<=i; j++)
{
- if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+ if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
l++;
}
}
}
-double binom_dist(int N, double p, int k)
-{
- int mean = (int) (N*p);
- if ( mean==k ) return 1.0;
-
- double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
- if ( k > N - k ) k = N - k;
- if ( mean > N - mean ) mean = N - mean;
-
- if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
- double diff = k - mean;
-
- double val = 1.0;
- int i;
- for (i=0; i<diff; i++)
- val = val * (N-mean-i) / (k-i);
-
- return exp(log_p)/val;
-}
-
-
-// Inbreeding Coefficient, binomial test
-float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
-{
- if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
- double fref = (double)nref/(nref+nalt); // fraction of reference allelels
- double falt = (double)nalt/(nref+nalt); // non-ref als
- double q = 2*fref*falt; // probability of a het, assuming HWE
- double mean = q*ndiploid;
-
- //fprintf(stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
-
- // Can we use normal approximation? The second condition is for performance only
- // and is not well justified.
- if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
- {
- //fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
- return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
- }
-
- return binom_dist(ndiploid, q, nhets);
-}
-
-float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
-{
- if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
- double fref = (double)nref/(nref+nalt); // fraction of reference allelels
- double falt = (double)nalt/(nref+nalt); // non-ref als
- return fabs((double)nhets/ndiploid - 2*fref*falt);
-}
-
-/**
- * log(sum_i exp(a_i))
- */
-// static inline double logsumexp(double *vals, int nvals)
-// {
-// int i;
-// double max_exp = vals[0];
-// for (i=1; i<nvals; i++)
-// if ( max_exp < vals[i] ) max_exp = vals[i];
-
-// double sum = 0;
-// for (i=0; i<nvals; i++)
-// sum += exp(vals[i] - max_exp);
-
-// return log(sum) + max_exp;
-// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
// Macro to set the most likely alleles
#define UPDATE_MAX_LKs(als,sum) { \
- if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+ if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
// Determine the most likely combination of alleles. In this implementation,
// at most tri-allelic sites are considered. Returns the number of alleles.
-static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
{
- int j;
int ia,ib,ic; // iterators over up to three alleles
int max_als=0; // most likely combination of alleles
- double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+ double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
double lk_sum = -HUGE_VAL; // for normalizing the likelihoods
- int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nsmpl = grp->nsmpl;
int ngts = nals*(nals+1)/2;
// Single allele
double lk_tot = 0;
int lk_tot_set = 0;
int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype
- int isample;
- double *pdg = call->pdg + iaa;
- for (isample=0; isample<nsmpl; isample++)
+ int ismpl;
+ for (ismpl=0; ismpl<nsmpl; ismpl++)
{
+ double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
- grp_t *grps = &call->smpl_grp;
-
// Two alleles
if ( nals>1 )
{
for (ia=0; ia<nals; ia++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+ if ( grp->qsum[ia]==0 ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+ if ( grp->qsum[ib]==0 ) continue;
double lk_tot = 0;
int lk_tot_set = 0;
- int ia_cov = 0, ib_cov = 0;
- for (j=0; j<grps->ngrp; j++)
+ double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
+ double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
+ int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ for (is=0; is<nsmpl; is++)
{
- grp1_t *grp = &grps->grp[j];
- if ( grp->qsum[ia] ) ia_cov = 1;
- if ( grp->qsum[ib] ) ib_cov = 1;
- if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; }
- grp->dp = 1;
- grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]);
- grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]);
- grp->fa2 = grp->fa*grp->fa;
- grp->fb2 = grp->fb*grp->fb;
- grp->fab = 2*grp->fa*grp->fb;
- }
- if ( !ia_cov || !ib_cov ) continue;
- int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
- double *pdg = call->pdg;
- for (isample=0; isample<nsmpl; isample++)
- {
- grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
- if ( !grp->dp ) continue;
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts;
double val = 0;
- if ( !call->ploidy || call->ploidy[isample]==2 )
- val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab];
- else if ( call->ploidy && call->ploidy[isample]==1 )
- val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb];
+ if ( !call->ploidy || call->ploidy[ismpl]==2 )
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
+ else if ( call->ploidy && call->ploidy[ismpl]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
{
for (ia=0; ia<nals; ia++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+ if ( grp->qsum[ia]==0 ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+ if ( grp->qsum[ib]==0 ) continue;
int ibb = (ib+1)*(ib+2)/2-1;
int iab = iaa - ia + ib;
for (ic=0; ic<ib; ic++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue;
+ if ( grp->qsum[ic]==0 ) continue;
double lk_tot = 0;
- int lk_tot_set = 1;
- int ia_cov = 0, ib_cov = 0, ic_cov = 0;
- for (j=0; j<grps->ngrp; j++)
- {
- grp1_t *grp = &grps->grp[j];
- if ( grp->qsum[ia] ) ia_cov = 1;
- if ( grp->qsum[ib] ) ib_cov = 1;
- if ( grp->qsum[ic] ) ic_cov = 1;
- if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; }
- grp->dp = 1;
- grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fa2 = grp->fa*grp->fa;
- grp->fb2 = grp->fb*grp->fb;
- grp->fc2 = grp->fc*grp->fc;
- grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc;
- }
- if ( !ia_cov || !ib_cov || !ic_cov ) continue;
- int isample, icc = (ic+1)*(ic+2)/2-1;
+ int lk_tot_set = 0;
+
+ double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
+ int is, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
- double *pdg = call->pdg;
- for (isample=0; isample<nsmpl; isample++)
+ for (is=0; is<nsmpl; is++)
{
- grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
- if ( !grp->dp ) continue;
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts;
double val = 0;
- if ( !call->ploidy || call->ploidy[isample]==2 )
- val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc];
- else if ( call->ploidy && call->ploidy[isample]==1 )
- val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc];
+ if ( !call->ploidy || call->ploidy[ismpl]==2 )
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ else if ( call->ploidy && call->ploidy[ismpl]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
}
}
- call->ref_lk = ref_lk;
- call->lk_sum = lk_sum;
- *out_als = max_als;
-
int i, n = 0;
for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
+ grp->max_lk = max_lk;
+ grp->ref_lk = ref_lk;
+ grp->lk_sum = lk_sum;
+ grp->als = max_als;
+ grp->nals = n;
+
return n;
}
-static void mcall_set_ref_genotypes(call_t *call, int nals)
+// Sets GT=0/0 or GT=. if PL=0,0,0
+static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
{
int i;
- int ngts = nals*(nals+1)/2;
+ int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=.
int nsmpl = bcf_hdr_nsamples(call->hdr);
- for (i=0; i<nals; i++) call->ac[i] = 0;
- call->nhets = 0;
- call->ndiploid = 0;
+ for (i=0; i<nals_ori; i++) call->ac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's
// Set all genotypes to 0/0 or 0
int *gts = call->gts;
}
}
-static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
{
int ia, ib, i;
- int ngts = nals*(nals+1)/2;
- int nsmpl = bcf_hdr_nsamples(call->hdr);
- int nout_gts = nout_als*(nout_als+1)/2;
- hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
-
- for (i=0; i<nout_als; i++) call->ac[i] = 0;
- call->nhets = 0;
- call->ndiploid = 0;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
+ int ngts_new = call->nals_new*(call->nals_new+1)/2;
+ int nsmpl = grp->nsmpl;
#if USE_PRIOR_FOR_GTS
float prior = exp(call->theta);
#endif
- float *gps = call->GPs - nout_gts;
- double *pdg = call->pdg - ngts;
- int *gts = call->gts - 2;
- int isample;
- for (isample = 0; isample < nsmpl; isample++)
+ int is;
+ for (is = 0; is < nsmpl; is++)
{
- int ploidy = call->ploidy ? call->ploidy[isample] : 2;
- assert( ploidy>=0 && ploidy<=2 );
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts_ori;
+ float *gps = call->GPs + ismpl*ngts_new;
+ int *gts = call->gts + ismpl*2;
- pdg += ngts;
- gts += 2;
- gps += nout_gts;
+ int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
+ assert( ploidy>=0 && ploidy<=2 );
if ( !ploidy )
{
#if !FLAT_PDG_FOR_MISSING
// Skip samples with zero depth, they have all pdg's equal to 0
- for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
- if ( i==ngts )
+ for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts_ori )
{
gts[0] = bcf_gt_missing;
gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
}
#endif
- if ( ploidy==2 ) call->ndiploid++;
-
// Default fallback for the case all LKs are the same
gts[0] = bcf_gt_unphased(0);
gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
// Non-zero depth, determine the most likely genotype
- grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
double best_lk = 0;
- for (ia=0; ia<nals; ia++)
+ for (ia=0; ia<nals_ori; ia++)
{
- if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
- int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
+ if ( !(grp->als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
if ( ploidy==2 )
{
gts[1] = gts[0];
- for (ia=0; ia<nals; ia++)
+ for (ia=0; ia<nals_ori; ia++)
{
- if ( !(out_als & 1<<ia) ) continue;
+ if ( !(grp->als & 1<<ia) ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( !(out_als & 1<<ib) ) continue;
+ if ( !(grp->als & 1<<ib) ) continue;
int iab = iaa - ia + ib;
double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
#if USE_PRIOR_FOR_GTS
}
}
}
- if ( gts[0] != gts[1] ) call->nhets++;
}
else
gts[1] = bcf_int32_vector_end;
call->ac[ bcf_gt_allele(gts[0]) ]++;
if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
}
- if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
+ double max, sum;
+ for (is=0; is<nsmpl; is++)
{
- double max, sum;
- for (isample=0; isample<nsmpl; isample++)
- {
- gps = call->GPs + isample*nout_gts;
+ int ismpl = grp->smpl[is];
+ float *gps = call->GPs + ismpl*ngts_new;
- int nmax;
- if ( call->ploidy )
- {
- if ( call->ploidy[isample]==2 ) nmax = nout_gts;
- else if ( call->ploidy[isample]==1 ) nmax = nout_als;
- else nmax = 0;
- }
- else nmax = nout_gts;
+ int nmax;
+ if ( call->ploidy )
+ {
+ if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
+ else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
+ else nmax = 0;
+ }
+ else nmax = ngts_new;
- max = gps[0];
- if ( max<0 || nmax==0 )
- {
- // no call
- if ( call->output_tags & CALL_FMT_GP )
- {
- for (i=0; i<nmax; i++) gps[i] = 0;
- if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
- if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
- }
- call->GQs[isample] = 0;
- continue;
- }
- sum = gps[0];
- for (i=1; i<nmax; i++)
- {
- if ( max < gps[i] ) max = gps[i];
- sum += gps[i];
- }
- max = -4.34294*log(1 - max/sum);
- call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+ max = gps[0];
+ if ( max<0 || nmax==0 )
+ {
+ // no call
if ( call->output_tags & CALL_FMT_GP )
{
- assert( max );
- for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
- if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ for (i=0; i<nmax; i++) gps[i] = 0;
+ if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+ if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
}
+ call->GQs[ismpl] = 0;
+ continue;
+ }
+ sum = gps[0];
+ for (i=1; i<nmax; i++)
+ {
+ if ( max < gps[i] ) max = gps[i];
+ sum += gps[i];
+ }
+ max = -4.34294*log(1 - max/sum);
+ call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ assert( max );
+ for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
+ for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
}
}
- if ( call->output_tags & CALL_FMT_GP )
- bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
- if ( call->output_tags & CALL_FMT_GQ )
- bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
}
Individual qualities are calculated as
GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
*/
-static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+#if 0
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
{
int ia, ib, i;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int ngts = nals*(nals+1)/2;
- int nout_gts = nout_als*(nout_als+1)/2;
+ int nout_gts = nals_new*(nals_new+1)/2;
double *gls = call->GLs - nout_gts;
double *pdg = call->pdg - ngts;
double best_lk = 0;
for (ia=0; ia<nals; ia++)
{
- if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ if ( !(als_new & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
{
for (ia=0; ia<nals; ia++)
{
- if ( !(out_als & 1<<ia) ) continue;
+ if ( !(als_new & 1<<ia) ) continue;
for (ib=0; ib<ia; ib++)
{
- if ( !(out_als & 1<<ib) ) continue;
+ if ( !(als_new & 1<<ib) ) continue;
int iab = bcf_alleles2gt(ia,ib);
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
for (ifm=0; ifm<call->nfams; ifm++)
{
family_t *fam = &call->fams[ifm];
- int ntrio = call->ntrio[fam->type][nout_als];
- uint16_t *trio = call->trio[fam->type][nout_als];
+ int ntrio = call->ntrio[fam->type][nals_new];
+ uint16_t *trio = call->trio[fam->type][nals_new];
// Unconstrained likelihood
int uc_itr = 0;
bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
}
}
+#endif
-static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
{
- int ngts = nals*(nals+1)/2;
- int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new
+ int npls_src = nals_ori*(nals_ori+1)/2;
+ int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new
if ( call->all_diploid && npls_src == npls_dst ) return;
int *pls_src = call->PLs, *pls_dst = call->PLs;
}
else if ( ploidy==1 )
{
- for (ia=0; ia<nout_als; ia++)
+ for (ia=0; ia<nals_new; ia++)
{
int isrc = (ia+1)*(ia+2)/2-1;
pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
else
{
pls_dst[0] = bcf_int32_missing;
- pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall()
+ pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall()
}
pls_src += npls_src;
pls_dst += npls_dst;
bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
}
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
{
- if ( nals==nout_als ) return;
+ if ( nals_ori==nals_new ) return;
int i,j, nret, size = sizeof(float);
nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
if ( nret<=0 ) continue;
- if ( nout_als==1 )
+ if ( nals_new==1 )
bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
else
{
- for (j=0; j<nals; j++)
+ for (j=0; j<nals_ori; j++)
{
int k = call->als_map[j];
if ( k==-1 ) continue; // to be dropped
memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
- bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
}
}
if (nret<=0) continue;
int nsmpl = bcf_hdr_nsamples(call->hdr);
- assert( nret==nals*nsmpl );
+ assert( nret==nals_ori*nsmpl );
for (j=0; j<nsmpl; j++)
{
- char *ptr_src = (char *)tmp_ori + j*nals*size;
- char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
+ char *ptr_dst = (char *)tmp_new + j*nals_new*size;
int k;
- for (k=0; k<nals; k++)
+ for (k=0; k<nals_ori; k++)
{
int l = call->als_map[k];
if ( l==-1 ) continue; // to be dropped
memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
- bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
}
call->PLs = (int32_t*) tmp_new;
}
bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
- // update QS
- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
- hts_expand(float,nals,call->nqsum,call->qsum);
+ // update QS, use temporarily call->GPs to store the values
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
+ hts_expand(float,nals,call->nGPs,call->GPs);
for (i=0; i<nals; i++)
- call->qsum[i] = call->als_map[i]<nqs ? call->smpl_grp.grp[0].qsum[call->als_map[i]] : 0;
- bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals);
+ call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
+ bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
// update any Number=R tags
void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
call->itmp = (int32_t*) tmp_ori;
call->n_itmp = ntmp_ori;
-
if ( *unseen ) *unseen = nals-1;
return 0;
}
// Force alleles when calling genotypes given alleles was requested
if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
- int nsmpl = bcf_hdr_nsamples(call->hdr);
- int nals = rec->n_allele;
- hts_expand(int,nals,call->nac,call->ac);
- hts_expand(int,nals,call->nals_map,call->als_map);
- hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nals_ori = rec->n_allele;
+ hts_expand(int,nals_ori,call->nac,call->ac);
+ hts_expand(int,nals_ori,call->nals_map,call->als_map);
+ hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
// Get the genotype likelihoods
call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
- if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only
- error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+ if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
// Convert PLs to probabilities
- int ngts = nals*(nals+1)/2;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
hts_expand(double, call->nPLs, call->npdg, call->pdg);
- set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
// Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
- if ( call->smpl_grp.ngrp == 1 )
+ if ( call->nsmpl_grp == 1 )
{
- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
- if ( nqs < nals )
+ if ( nqs < nals_ori )
{
// Some of the listed alleles do not have the corresponding QS field. This is
// typically ref-only site with <*> in ALT.
- hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum);
- for (i=nqs; i<nals; i++) call->smpl_grp.grp[0].qsum[i] = 0;
+ hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
+ for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
}
}
else
{
- for (j=0; j<call->smpl_grp.ngrp; j++)
+ for (j=0; j<call->nsmpl_grp; j++)
{
- hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum);
- memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals);
+ hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
+ memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
}
- int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs);
- if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n");
+ // Use FORMAT/AD or FORMAT/QS
+ int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
+ if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
nad /= bcf_hdr_nsamples(call->hdr);
- hts_expand(float,nals,call->nqsum,call->qsum);
- float qsum = 0;
- for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+ for (i=0; i<call->nsmpl_grp; i++)
{
- int32_t *ptr = call->ADs + i*nad;
- for (j=0; j<nad; j++)
+ int is;
+ smpl_grp_t *grp = &call->smpl_grp[i];
+ hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
+ for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
+ for (is=0; is<grp->nsmpl; is++)
{
- if ( ptr[j]==bcf_int32_vector_end ) break;
- if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0;
- else { call->qsum[j] = ptr[j]; qsum += ptr[j]; }
+ int ismpl = grp->smpl[is];
+ int32_t *ptr = call->ADs + ismpl*nad;
+ float sum = 0;
+ for (j=0; j<nad; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
+ }
+ if ( sum )
+ {
+ for (j=0; j<nad; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
+ }
+ }
}
- for (; j<nals; j++) call->qsum[j] = 0;
- if ( qsum )
- for (j=0; j<nals; j++) call->qsum[j] /= qsum;
-
- grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]];
- for (j=0; j<nals; j++)
- grp->qsum[j] += call->qsum[j];
}
}
// If available, take into account reference panel AFs
if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- int an = call->ac[0];
- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
+ int an = call->ac[0]; // number of alleles total, proceed only if not zero; reuse call->ac
+ if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles
{
- int ac0 = an; // number of alleles in the reference population
- for (i=0; i<nals-1; i++)
+ int ac0 = an; // this will become the number of REFs
+ for (i=0; i<nals_ori-1; i++)
{
if ( call->ac[i]==bcf_int32_vector_end ) break;
if ( call->ac[i]==bcf_int32_missing ) continue;
ac0 -= call->ac[i];
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5;
+
+ // here an*0.5 is the number of samples in the population and ac*0.5 is the AF weighted by the number of samples
+ for (j=0; j<call->nsmpl_grp; j++)
+ call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
}
if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[0] += ac0*0.5;
- for (i=0; i<nals; i++)
- {
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an;
- }
+ for (j=0; j<call->nsmpl_grp; j++)
+ call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
}
}
- for (j=0; j<call->smpl_grp.ngrp; j++)
+ // normalize so that QS sums to 1 for each group
+ for (j=0; j<call->nsmpl_grp; j++)
{
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->smpl_grp.grp[j].qsum[i];
- if ( qsum_tot ) for (i=0; i<nals; i++) call->smpl_grp.grp[j].qsum[i] /= qsum_tot;
+ float sum = 0;
+ for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
+ if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
}
bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
- // Find the best combination of alleles
- int out_als, nout;
- if ( nals > 8*sizeof(out_als) )
+ if ( nals_ori > 8*sizeof(call->als_new) )
{
fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
return 0;
}
- nout = mcall_find_best_alleles(call, nals, &out_als);
- // Make sure the REF allele is always present
- if ( !(out_als&1) )
+ // For each group find the best combination of alleles
+ call->als_new = 0;
+ double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
+ for (j=0; j<call->nsmpl_grp; j++)
{
- out_als |= 1;
- nout++;
+ smpl_grp_t *grp = &call->smpl_grp[j];
+ mcall_find_best_alleles(call, nals_ori, grp);
+ call->als_new |= grp->als;
+ if ( grp->max_lk==-HUGE_VAL ) continue;
+ double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
+ if ( max_qual < qual )
+ {
+ max_qual = qual;
+ lk_sum = grp->lk_sum;
+ ref_lk = grp->ref_lk;
+ }
}
- int is_variant = out_als==1 ? 0 : 1;
+
+ // Make sure the REF allele is always present
+ if ( !(call->als_new&1) ) call->als_new |= 1;
+
+ int is_variant = call->als_new==1 ? 0 : 1;
if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
- // With -A, keep all ALTs except X
- if ( call->flag & CALL_KEEPALT )
+ call->nals_new = 0;
+ for (i=0; i<nals_ori; i++)
{
- nout = 0;
- for (i=0; i<nals; i++)
- {
- if ( i>0 && i==unseen ) continue;
- out_als |= 1<<i;
- nout++;
- }
+ if ( i>0 && i==unseen ) continue;
+ if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
+ if ( call->als_new & (1<<i) ) call->nals_new++;
}
+ init_allele_trimming_maps(call,nals_ori,call->als_new);
+
int nAC = 0;
- if ( out_als==1 ) // only REF allele on output
+ if ( call->als_new==1 ) // only REF allele on output
{
- init_allele_trimming_maps(call, 1, nals);
- mcall_set_ref_genotypes(call,nals);
+ mcall_set_ref_genotypes(call,nals_ori);
bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now
}
+ else if ( !is_variant )
+ {
+ mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back
+ mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
+ }
else
{
// The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
// Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
- init_allele_trimming_maps(call, out_als, nals);
- if ( !is_variant )
- mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back
- else if ( call->flag & CALL_CONSTR_TRIO )
+ int ngts_new = call->nals_new*(call->nals_new+1)/2;
+ hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
+ for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
+
+ if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
+ {
+ fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
+ return 0;
+ }
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
{
- if ( nout>4 )
- {
- fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
- }
- mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+ memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
+ memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
+ }
+ for (i=0; i<call->nsmpl_grp; i++)
+ {
+ if ( call->flag & CALL_CONSTR_TRIO )
+ error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
+ else
+ mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
}
- else
- mcall_call_genotypes(call,rec,nals,nout,out_als);
// Skip the site if all samples are 0/0. This can happen occasionally.
- nAC = 0;
- for (i=1; i<nout; i++) nAC += call->ac[i];
+ for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
if ( !nAC && call->flag & CALL_VARONLY ) return 0;
- mcall_trim_PLs(call, rec, nals, nout, out_als);
+
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+
+ mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
}
- if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+ if ( nals_ori!=call->nals_new )
+ mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
- // Set QUAL and calculate HWE-related annotations
+ // Set QUAL
if ( nAC )
{
- float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
- if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
-
- float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
- if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
-
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
+ rec->qual = max_qual;
}
else
{
// Set the quality of a REF site
- if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ if ( lk_sum!=-HUGE_VAL ) // a sample group contributed likelihoods, derive QUAL from them
+ rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
+ else if ( call->ac[0] ) // no support from (high quality) reads, so QUAL=1-prior
rec->qual = call->theta ? -4.343*call->theta : 0;
else
- rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
+ bcf_float_set_missing(rec->qual);
}
- if ( rec->qual>999 ) rec->qual = 999;
- if ( rec->qual>50 ) rec->qual = rint(rec->qual);
-
// AC, AN
- if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+ if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
nAC += call->ac[0];
bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
// Remove unused alleles
- hts_expand(char*,nout,call->nals,call->als);
- for (i=0; i<nals; i++)
+ hts_expand(char*,call->nals_new,call->nals,call->als);
+ for (i=0; i<nals_ori; i++)
if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
- bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
- // DP4 tag
+ // DP4 and PV4 tags
if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
{
int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+
+ if ( call->output_tags & CALL_FMT_PV4 )
+ {
+ anno16_t a;
+ float tmpf[4];
+ int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
+ if ( is_tested )
+ {
+ for (i=0; i<4; i++) tmpf[i] = a.p[i];
+ bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+ }
+ }
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- return nout;
+ return call->nals_new;
}
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2016 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
+#include <assert.h>
#include <math.h>
#include <inttypes.h>
+#include <ctype.h>
#include <htslib/kfunc.h>
#include <htslib/khash_str2int.h>
#include "call.h"
+#include "prob1.h"
// Using priors for GTs does not seem to be mathematically justified. Although
// it seems effective in removing false calls, it also flips a significant
// genotypes is reported instead.
#define FLAT_PDG_FOR_MISSING 0
+int test16(float *anno16, anno16_t *a);
void qcall_init(call_t *call) { return; }
void qcall_destroy(call_t *call) { return; }
if ( !call->sample_groups )
{
// standard pooled calling, all samples in the same group
- grp_t *grps = &call->smpl_grp;
- grps->ngrp = 1;
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int));
+ call->nsmpl_grp = 1;
+ call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
+ call->smpl_grp[0].nsmpl = nsmpl;
+ call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
+ for (i=0; i<nsmpl; i++)
+ call->smpl_grp[0].smpl[i] = i;
+ return;
+ }
+
+ if ( call->sample_groups_tag )
+ {
+ // Is the tag defined in the header?
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
+ if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
+ }
+ else
+ {
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
+ if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
+ else
+ {
+ tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
+ if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
+ else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
+ }
}
- else if ( !strcmp("-",call->sample_groups) )
+
+ // Read samples/groups
+ if ( !strcmp("-",call->sample_groups) )
{
// single-sample calling, each sample creates its own group
- grp_t *grps = &call->smpl_grp;
- grps->ngrp = nsmpl;
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
- for (i=0; i<nsmpl; i++) grps->smpl2grp[i] = i;
+ call->nsmpl_grp = nsmpl;
+ call->smpl_grp = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
+ for (i=0; i<nsmpl; i++)
+ {
+ call->smpl_grp[i].nsmpl = 1;
+ call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
+ call->smpl_grp[i].smpl[0] = i;
+ }
}
else
{
char **lines = hts_readlist(call->sample_groups, 1, &nlines);
if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
- uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+ uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+ uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
void *grp2idx = khash_str2int_init();
- grp_t *grps = &call->smpl_grp;
+ call->nsmpl_grp = 0;
for (i=0; i<nlines; i++)
{
char *ptr = lines[i];
- while ( *ptr && *ptr!='\t' ) ptr++;
+ while ( *ptr && !isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
- *ptr = 0;
+ char *tmp = ptr;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
+ *tmp = 0;
int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
if ( ismpl<0 ) continue;
- if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
+ if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
if ( !khash_str2int_has_key(grp2idx,ptr+1) )
{
- khash_str2int_inc(grp2idx, ptr+1);
- grps->ngrp++;
+ khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
+ call->nsmpl_grp++;
}
- int igrp;
- if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 )
- smpl2grp1[ismpl] = igrp+1;
- else
+ int igrp = -1;
+ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
error("This should not happen, fixme: %s\n",ptr+1);
+ grp2n[igrp]++;
+ smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples
}
khash_str2int_destroy(grp2idx);
+ if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
- grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
- grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
+ call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
for (i=0; i<nsmpl; i++)
{
- if ( !smpl2grp1[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
- grps->smpl2grp[i] = smpl2grp1[i] - 1;
+ if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
+ int igrp = smpl2grp[i] - 1;
+ if ( !call->smpl_grp[igrp].nsmpl )
+ call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
+ call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
+ call->smpl_grp[igrp].nsmpl++;
}
- free(smpl2grp1);
+ free(smpl2grp);
+ free(grp2n);
for (i=0; i<nlines; i++) free(lines[i]);
free(lines);
}
static void destroy_sample_groups(call_t *call)
{
int i;
- grp_t *grps = &call->smpl_grp;
- for (i=0; i<grps->ngrp; i++)
- free(grps->grp[i].qsum);
- free(grps->grp);
- free(grps->smpl2grp);
+ for (i=0; i<call->nsmpl_grp; i++)
+ {
+ free(call->smpl_grp[i].qsum);
+ free(call->smpl_grp[i].smpl);
+ }
+ free(call->smpl_grp);
}
void mcall_init(call_t *call)
{
+ init_sample_groups(call);
call_init_pl2p(call);
call->nals_map = 5;
if ( call->output_tags & CALL_FMT_GQ )
bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
if ( call->output_tags & CALL_FMT_GP )
- bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+ bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
- bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
- bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+ if ( call->output_tags & CALL_FMT_PV4 )
+ bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
// init the prior
if ( call->theta>0 )
}
call->theta = log(call->theta);
}
-
- init_sample_groups(call);
}
void mcall_destroy(call_t *call)
free(call->pdg);
free(call->als);
free(call->ac);
- free(call->qsum);
return;
}
}
// Create mapping between old and new (trimmed) alleles
-void init_allele_trimming_maps(call_t *call, int als, int nals)
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
{
- int i, j;
+ int i, j, nout = 0;
// als_map: old(i) -> new(j)
- for (i=0, j=0; i<nals; i++)
+ for (i=0; i<nals_ori; i++)
{
- if ( als & 1<<i ) call->als_map[i] = j++;
+ if ( als_out & (1<<i) ) call->als_map[i] = nout++;
else call->als_map[i] = -1;
}
// pl_map: new(k) -> old(l)
int k = 0, l = 0;
- for (i=0; i<nals; i++)
+ for (i=0; i<nals_ori; i++)
{
for (j=0; j<=i; j++)
{
- if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+ if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
l++;
}
}
}
-double binom_dist(int N, double p, int k)
-{
- int mean = (int) (N*p);
- if ( mean==k ) return 1.0;
-
- double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
- if ( k > N - k ) k = N - k;
- if ( mean > N - mean ) mean = N - mean;
-
- if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
- double diff = k - mean;
-
- double val = 1.0;
- int i;
- for (i=0; i<diff; i++)
- val = val * (N-mean-i) / (k-i);
-
- return exp(log_p)/val;
-}
-
-
-// Inbreeding Coefficient, binomial test
-float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
-{
- if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
- double fref = (double)nref/(nref+nalt); // fraction of reference allelels
- double falt = (double)nalt/(nref+nalt); // non-ref als
- double q = 2*fref*falt; // probability of a het, assuming HWE
- double mean = q*ndiploid;
-
- //fprintf(bcftools_stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
-
- // Can we use normal approximation? The second condition is for performance only
- // and is not well justified.
- if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
- {
- //fprintf(bcftools_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
- return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
- }
-
- return binom_dist(ndiploid, q, nhets);
-}
-
-float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
-{
- if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
- double fref = (double)nref/(nref+nalt); // fraction of reference allelels
- double falt = (double)nalt/(nref+nalt); // non-ref als
- return fabs((double)nhets/ndiploid - 2*fref*falt);
-}
-
-/**
- * log(sum_i exp(a_i))
- */
-// static inline double logsumexp(double *vals, int nvals)
-// {
-// int i;
-// double max_exp = vals[0];
-// for (i=1; i<nvals; i++)
-// if ( max_exp < vals[i] ) max_exp = vals[i];
-
-// double sum = 0;
-// for (i=0; i<nvals; i++)
-// sum += exp(vals[i] - max_exp);
-
-// return log(sum) + max_exp;
-// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
// Macro to set the most likely alleles
#define UPDATE_MAX_LKs(als,sum) { \
- if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+ if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
// Determine the most likely combination of alleles. In this implementation,
// at most tri-allelic sites are considered. Returns the number of alleles.
-static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
{
- int j;
int ia,ib,ic; // iterators over up to three alleles
int max_als=0; // most likely combination of alleles
- double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+ double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
double lk_sum = -HUGE_VAL; // for normalizing the likelihoods
- int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nsmpl = grp->nsmpl;
int ngts = nals*(nals+1)/2;
// Single allele
double lk_tot = 0;
int lk_tot_set = 0;
int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype
- int isample;
- double *pdg = call->pdg + iaa;
- for (isample=0; isample<nsmpl; isample++)
+ int ismpl;
+ for (ismpl=0; ismpl<nsmpl; ismpl++)
{
+ double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
- grp_t *grps = &call->smpl_grp;
-
// Two alleles
if ( nals>1 )
{
for (ia=0; ia<nals; ia++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+ if ( grp->qsum[ia]==0 ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+ if ( grp->qsum[ib]==0 ) continue;
double lk_tot = 0;
int lk_tot_set = 0;
- int ia_cov = 0, ib_cov = 0;
- for (j=0; j<grps->ngrp; j++)
+ double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
+ double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
+ int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ for (is=0; is<nsmpl; is++)
{
- grp1_t *grp = &grps->grp[j];
- if ( grp->qsum[ia] ) ia_cov = 1;
- if ( grp->qsum[ib] ) ib_cov = 1;
- if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; }
- grp->dp = 1;
- grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]);
- grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]);
- grp->fa2 = grp->fa*grp->fa;
- grp->fb2 = grp->fb*grp->fb;
- grp->fab = 2*grp->fa*grp->fb;
- }
- if ( !ia_cov || !ib_cov ) continue;
- int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
- double *pdg = call->pdg;
- for (isample=0; isample<nsmpl; isample++)
- {
- grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
- if ( !grp->dp ) continue;
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts;
double val = 0;
- if ( !call->ploidy || call->ploidy[isample]==2 )
- val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab];
- else if ( call->ploidy && call->ploidy[isample]==1 )
- val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb];
+ if ( !call->ploidy || call->ploidy[ismpl]==2 )
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
+ else if ( call->ploidy && call->ploidy[ismpl]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
{
for (ia=0; ia<nals; ia++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+ if ( grp->qsum[ia]==0 ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+ if ( grp->qsum[ib]==0 ) continue;
int ibb = (ib+1)*(ib+2)/2-1;
int iab = iaa - ia + ib;
for (ic=0; ic<ib; ic++)
{
- if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue;
+ if ( grp->qsum[ic]==0 ) continue;
double lk_tot = 0;
- int lk_tot_set = 1;
- int ia_cov = 0, ib_cov = 0, ic_cov = 0;
- for (j=0; j<grps->ngrp; j++)
- {
- grp1_t *grp = &grps->grp[j];
- if ( grp->qsum[ia] ) ia_cov = 1;
- if ( grp->qsum[ib] ) ib_cov = 1;
- if ( grp->qsum[ic] ) ic_cov = 1;
- if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; }
- grp->dp = 1;
- grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
- grp->fa2 = grp->fa*grp->fa;
- grp->fb2 = grp->fb*grp->fb;
- grp->fc2 = grp->fc*grp->fc;
- grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc;
- }
- if ( !ia_cov || !ib_cov || !ic_cov ) continue;
- int isample, icc = (ic+1)*(ic+2)/2-1;
+ int lk_tot_set = 0;
+
+ double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
+ int is, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
- double *pdg = call->pdg;
- for (isample=0; isample<nsmpl; isample++)
+ for (is=0; is<nsmpl; is++)
{
- grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
- if ( !grp->dp ) continue;
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts;
double val = 0;
- if ( !call->ploidy || call->ploidy[isample]==2 )
- val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc];
- else if ( call->ploidy && call->ploidy[isample]==1 )
- val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc];
+ if ( !call->ploidy || call->ploidy[ismpl]==2 )
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ else if ( call->ploidy && call->ploidy[ismpl]==1 )
+ val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
- pdg += ngts;
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
}
}
- call->ref_lk = ref_lk;
- call->lk_sum = lk_sum;
- *out_als = max_als;
-
int i, n = 0;
for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
+ grp->max_lk = max_lk;
+ grp->ref_lk = ref_lk;
+ grp->lk_sum = lk_sum;
+ grp->als = max_als;
+ grp->nals = n;
+
return n;
}
-static void mcall_set_ref_genotypes(call_t *call, int nals)
+// Sets GT=0/0 or GT=. if PL=0,0,0
+static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
{
int i;
- int ngts = nals*(nals+1)/2;
+ int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=.
int nsmpl = bcf_hdr_nsamples(call->hdr);
- for (i=0; i<nals; i++) call->ac[i] = 0;
- call->nhets = 0;
- call->ndiploid = 0;
+ for (i=0; i<nals_ori; i++) call->ac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's
// Set all genotypes to 0/0 or 0
int *gts = call->gts;
}
}
-static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
{
int ia, ib, i;
- int ngts = nals*(nals+1)/2;
- int nsmpl = bcf_hdr_nsamples(call->hdr);
- int nout_gts = nout_als*(nout_als+1)/2;
- hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
-
- for (i=0; i<nout_als; i++) call->ac[i] = 0;
- call->nhets = 0;
- call->ndiploid = 0;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
+ int ngts_new = call->nals_new*(call->nals_new+1)/2;
+ int nsmpl = grp->nsmpl;
#if USE_PRIOR_FOR_GTS
float prior = exp(call->theta);
#endif
- float *gps = call->GPs - nout_gts;
- double *pdg = call->pdg - ngts;
- int *gts = call->gts - 2;
- int isample;
- for (isample = 0; isample < nsmpl; isample++)
+ int is;
+ for (is = 0; is < nsmpl; is++)
{
- int ploidy = call->ploidy ? call->ploidy[isample] : 2;
- assert( ploidy>=0 && ploidy<=2 );
+ int ismpl = grp->smpl[is];
+ double *pdg = call->pdg + ismpl*ngts_ori;
+ float *gps = call->GPs + ismpl*ngts_new;
+ int *gts = call->gts + ismpl*2;
- pdg += ngts;
- gts += 2;
- gps += nout_gts;
+ int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
+ assert( ploidy>=0 && ploidy<=2 );
if ( !ploidy )
{
#if !FLAT_PDG_FOR_MISSING
// Skip samples with zero depth, they have all pdg's equal to 0
- for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
- if ( i==ngts )
+ for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
+ if ( i==ngts_ori )
{
gts[0] = bcf_gt_missing;
gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
}
#endif
- if ( ploidy==2 ) call->ndiploid++;
-
// Default fallback for the case all LKs are the same
gts[0] = bcf_gt_unphased(0);
gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
// Non-zero depth, determine the most likely genotype
- grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
double best_lk = 0;
- for (ia=0; ia<nals; ia++)
+ for (ia=0; ia<nals_ori; ia++)
{
- if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
- int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
+ if ( !(grp->als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
if ( ploidy==2 )
{
gts[1] = gts[0];
- for (ia=0; ia<nals; ia++)
+ for (ia=0; ia<nals_ori; ia++)
{
- if ( !(out_als & 1<<ia) ) continue;
+ if ( !(grp->als & 1<<ia) ) continue;
int iaa = (ia+1)*(ia+2)/2-1;
for (ib=0; ib<ia; ib++)
{
- if ( !(out_als & 1<<ib) ) continue;
+ if ( !(grp->als & 1<<ib) ) continue;
int iab = iaa - ia + ib;
double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
#if USE_PRIOR_FOR_GTS
}
}
}
- if ( gts[0] != gts[1] ) call->nhets++;
}
else
gts[1] = bcf_int32_vector_end;
call->ac[ bcf_gt_allele(gts[0]) ]++;
if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
}
- if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+ if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
+ double max, sum;
+ for (is=0; is<nsmpl; is++)
{
- double max, sum;
- for (isample=0; isample<nsmpl; isample++)
- {
- gps = call->GPs + isample*nout_gts;
+ int ismpl = grp->smpl[is];
+ float *gps = call->GPs + ismpl*ngts_new;
- int nmax;
- if ( call->ploidy )
- {
- if ( call->ploidy[isample]==2 ) nmax = nout_gts;
- else if ( call->ploidy[isample]==1 ) nmax = nout_als;
- else nmax = 0;
- }
- else nmax = nout_gts;
+ int nmax;
+ if ( call->ploidy )
+ {
+ if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
+ else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
+ else nmax = 0;
+ }
+ else nmax = ngts_new;
- max = gps[0];
- if ( max<0 || nmax==0 )
- {
- // no call
- if ( call->output_tags & CALL_FMT_GP )
- {
- for (i=0; i<nmax; i++) gps[i] = 0;
- if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
- if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
- }
- call->GQs[isample] = 0;
- continue;
- }
- sum = gps[0];
- for (i=1; i<nmax; i++)
- {
- if ( max < gps[i] ) max = gps[i];
- sum += gps[i];
- }
- max = -4.34294*log(1 - max/sum);
- call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+ max = gps[0];
+ if ( max<0 || nmax==0 )
+ {
+ // no call
if ( call->output_tags & CALL_FMT_GP )
{
- assert( max );
- for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
- if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+ for (i=0; i<nmax; i++) gps[i] = 0;
+ if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+ if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
}
+ call->GQs[ismpl] = 0;
+ continue;
+ }
+ sum = gps[0];
+ for (i=1; i<nmax; i++)
+ {
+ if ( max < gps[i] ) max = gps[i];
+ sum += gps[i];
+ }
+ max = -4.34294*log(1 - max/sum);
+ call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
+ if ( call->output_tags & CALL_FMT_GP )
+ {
+ assert( max );
+ for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
+ for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
}
}
- if ( call->output_tags & CALL_FMT_GP )
- bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
- if ( call->output_tags & CALL_FMT_GQ )
- bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
}
Individual qualities are calculated as
GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
*/
-static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+#if 0
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
{
int ia, ib, i;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int ngts = nals*(nals+1)/2;
- int nout_gts = nout_als*(nout_als+1)/2;
+ int nout_gts = nals_new*(nals_new+1)/2;
double *gls = call->GLs - nout_gts;
double *pdg = call->pdg - ngts;
double best_lk = 0;
for (ia=0; ia<nals; ia++)
{
- if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
+ if ( !(als_new & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
{
for (ia=0; ia<nals; ia++)
{
- if ( !(out_als & 1<<ia) ) continue;
+ if ( !(als_new & 1<<ia) ) continue;
for (ib=0; ib<ia; ib++)
{
- if ( !(out_als & 1<<ib) ) continue;
+ if ( !(als_new & 1<<ib) ) continue;
int iab = bcf_alleles2gt(ia,ib);
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
for (ifm=0; ifm<call->nfams; ifm++)
{
family_t *fam = &call->fams[ifm];
- int ntrio = call->ntrio[fam->type][nout_als];
- uint16_t *trio = call->trio[fam->type][nout_als];
+ int ntrio = call->ntrio[fam->type][nals_new];
+ uint16_t *trio = call->trio[fam->type][nals_new];
// Unconstrained likelihood
int uc_itr = 0;
bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
}
}
+#endif
-static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
{
- int ngts = nals*(nals+1)/2;
- int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new
+ int npls_src = nals_ori*(nals_ori+1)/2;
+ int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new
if ( call->all_diploid && npls_src == npls_dst ) return;
int *pls_src = call->PLs, *pls_dst = call->PLs;
}
else if ( ploidy==1 )
{
- for (ia=0; ia<nout_als; ia++)
+ for (ia=0; ia<nals_new; ia++)
{
int isrc = (ia+1)*(ia+2)/2-1;
pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
else
{
pls_dst[0] = bcf_int32_missing;
- pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall()
+ pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall()
}
pls_src += npls_src;
pls_dst += npls_dst;
bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
}
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
{
- if ( nals==nout_als ) return;
+ if ( nals_ori==nals_new ) return;
int i,j, nret, size = sizeof(float);
nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
if ( nret<=0 ) continue;
- if ( nout_als==1 )
+ if ( nals_new==1 )
bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
else
{
- for (j=0; j<nals; j++)
+ for (j=0; j<nals_ori; j++)
{
int k = call->als_map[j];
if ( k==-1 ) continue; // to be dropped
memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
- bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
}
}
if (nret<=0) continue;
int nsmpl = bcf_hdr_nsamples(call->hdr);
- assert( nret==nals*nsmpl );
+ assert( nret==nals_ori*nsmpl );
for (j=0; j<nsmpl; j++)
{
- char *ptr_src = (char *)tmp_ori + j*nals*size;
- char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
+ char *ptr_dst = (char *)tmp_new + j*nals_new*size;
int k;
- for (k=0; k<nals; k++)
+ for (k=0; k<nals_ori; k++)
{
int l = call->als_map[k];
if ( l==-1 ) continue; // to be dropped
memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
- bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
}
call->PLs = (int32_t*) tmp_new;
}
bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
- // update QS
- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
- hts_expand(float,nals,call->nqsum,call->qsum);
+ // update QS, use temporarily call->GPs to store the values
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
+ hts_expand(float,nals,call->nGPs,call->GPs);
for (i=0; i<nals; i++)
- call->qsum[i] = call->als_map[i]<nqs ? call->smpl_grp.grp[0].qsum[call->als_map[i]] : 0;
- bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals);
+ call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
+ bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
// update any Number=R tags
void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
call->itmp = (int32_t*) tmp_ori;
call->n_itmp = ntmp_ori;
-
if ( *unseen ) *unseen = nals-1;
return 0;
}
// Force alleles when calling genotypes given alleles was requested
if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
- int nsmpl = bcf_hdr_nsamples(call->hdr);
- int nals = rec->n_allele;
- hts_expand(int,nals,call->nac,call->ac);
- hts_expand(int,nals,call->nals_map,call->als_map);
- hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
+ int nals_ori = rec->n_allele;
+ hts_expand(int,nals_ori,call->nac,call->ac);
+ hts_expand(int,nals_ori,call->nals_map,call->als_map);
+ hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
// Get the genotype likelihoods
call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
- if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only
- error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+ if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only
+ error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
// Convert PLs to probabilities
- int ngts = nals*(nals+1)/2;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
hts_expand(double, call->nPLs, call->npdg, call->pdg);
- set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+ set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
// Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
- if ( call->smpl_grp.ngrp == 1 )
+ if ( call->nsmpl_grp == 1 )
{
- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
- if ( nqs < nals )
+ if ( nqs < nals_ori )
{
// Some of the listed alleles do not have the corresponding QS field. This is
// typically ref-only site with <*> in ALT.
- hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum);
- for (i=nqs; i<nals; i++) call->smpl_grp.grp[0].qsum[i] = 0;
+ hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
+ for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
}
}
else
{
- for (j=0; j<call->smpl_grp.ngrp; j++)
+ for (j=0; j<call->nsmpl_grp; j++)
{
- hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum);
- memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals);
+ hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
+ memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
}
- int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs);
- if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n");
+ // Use FORMAT/AD or FORMAT/QS
+ int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
+ if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
nad /= bcf_hdr_nsamples(call->hdr);
- hts_expand(float,nals,call->nqsum,call->qsum);
- float qsum = 0;
- for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+ for (i=0; i<call->nsmpl_grp; i++)
{
- int32_t *ptr = call->ADs + i*nad;
- for (j=0; j<nad; j++)
+ int is;
+ smpl_grp_t *grp = &call->smpl_grp[i];
+ hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
+ for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
+ for (is=0; is<grp->nsmpl; is++)
{
- if ( ptr[j]==bcf_int32_vector_end ) break;
- if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0;
- else { call->qsum[j] = ptr[j]; qsum += ptr[j]; }
+ int ismpl = grp->smpl[is];
+ int32_t *ptr = call->ADs + ismpl*nad;
+ float sum = 0;
+ for (j=0; j<nad; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
+ }
+ if ( sum )
+ {
+ for (j=0; j<nad; j++)
+ {
+ if ( ptr[j]==bcf_int32_vector_end ) break;
+ if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
+ }
+ }
}
- for (; j<nals; j++) call->qsum[j] = 0;
- if ( qsum )
- for (j=0; j<nals; j++) call->qsum[j] /= qsum;
-
- grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]];
- for (j=0; j<nals; j++)
- grp->qsum[j] += call->qsum[j];
}
}
// If available, take into account reference panel AFs
if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- int an = call->ac[0];
- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
+ int an = call->ac[0]; // number of alleles total, proceed only if not zero; reuse call->ac
+ if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles
{
- int ac0 = an; // number of alleles in the reference population
- for (i=0; i<nals-1; i++)
+ int ac0 = an; // this will become the number of REFs
+ for (i=0; i<nals_ori-1; i++)
{
if ( call->ac[i]==bcf_int32_vector_end ) break;
if ( call->ac[i]==bcf_int32_missing ) continue;
ac0 -= call->ac[i];
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5;
+
+ // here an*0.5 is the number of samples in the population and ac*0.5 is the AF weighted by the number of samples
+ for (j=0; j<call->nsmpl_grp; j++)
+ call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
}
if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[0] += ac0*0.5;
- for (i=0; i<nals; i++)
- {
- for (j=0; j<call->smpl_grp.ngrp; j++)
- call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an;
- }
+ for (j=0; j<call->nsmpl_grp; j++)
+ call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
}
}
- for (j=0; j<call->smpl_grp.ngrp; j++)
+ // normalize so that QS sums to 1 for each group
+ for (j=0; j<call->nsmpl_grp; j++)
{
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->smpl_grp.grp[j].qsum[i];
- if ( qsum_tot ) for (i=0; i<nals; i++) call->smpl_grp.grp[j].qsum[i] /= qsum_tot;
+ float sum = 0;
+ for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
+ if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
}
bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
- // Find the best combination of alleles
- int out_als, nout;
- if ( nals > 8*sizeof(out_als) )
+ if ( nals_ori > 8*sizeof(call->als_new) )
{
fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
return 0;
}
- nout = mcall_find_best_alleles(call, nals, &out_als);
- // Make sure the REF allele is always present
- if ( !(out_als&1) )
+ // For each group find the best combination of alleles
+ call->als_new = 0;
+ double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
+ for (j=0; j<call->nsmpl_grp; j++)
{
- out_als |= 1;
- nout++;
+ smpl_grp_t *grp = &call->smpl_grp[j];
+ mcall_find_best_alleles(call, nals_ori, grp);
+ call->als_new |= grp->als;
+ if ( grp->max_lk==-HUGE_VAL ) continue;
+ double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
+ if ( max_qual < qual )
+ {
+ max_qual = qual;
+ lk_sum = grp->lk_sum;
+ ref_lk = grp->ref_lk;
+ }
}
- int is_variant = out_als==1 ? 0 : 1;
+
+ // Make sure the REF allele is always present
+ if ( !(call->als_new&1) ) call->als_new |= 1;
+
+ int is_variant = call->als_new==1 ? 0 : 1;
if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
- // With -A, keep all ALTs except X
- if ( call->flag & CALL_KEEPALT )
+ call->nals_new = 0;
+ for (i=0; i<nals_ori; i++)
{
- nout = 0;
- for (i=0; i<nals; i++)
- {
- if ( i>0 && i==unseen ) continue;
- out_als |= 1<<i;
- nout++;
- }
+ if ( i>0 && i==unseen ) continue;
+ if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
+ if ( call->als_new & (1<<i) ) call->nals_new++;
}
+ init_allele_trimming_maps(call,nals_ori,call->als_new);
+
int nAC = 0;
- if ( out_als==1 ) // only REF allele on output
+ if ( call->als_new==1 ) // only REF allele on output
{
- init_allele_trimming_maps(call, 1, nals);
- mcall_set_ref_genotypes(call,nals);
+ mcall_set_ref_genotypes(call,nals_ori);
bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now
}
+ else if ( !is_variant )
+ {
+ mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back
+ mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
+ }
else
{
// The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
// Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
- init_allele_trimming_maps(call, out_als, nals);
- if ( !is_variant )
- mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back
- else if ( call->flag & CALL_CONSTR_TRIO )
+ int ngts_new = call->nals_new*(call->nals_new+1)/2;
+ hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
+ for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
+
+ if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
+ {
+ fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
+ return 0;
+ }
+ if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
{
- if ( nout>4 )
- {
- fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
- }
- mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+ memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
+ memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
+ }
+ for (i=0; i<call->nsmpl_grp; i++)
+ {
+ if ( call->flag & CALL_CONSTR_TRIO )
+ error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
+ else
+ mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
}
- else
- mcall_call_genotypes(call,rec,nals,nout,out_als);
// Skip the site if all samples are 0/0. This can happen occasionally.
- nAC = 0;
- for (i=1; i<nout; i++) nAC += call->ac[i];
+ for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
if ( !nAC && call->flag & CALL_VARONLY ) return 0;
- mcall_trim_PLs(call, rec, nals, nout, out_als);
+
+ if ( call->output_tags & CALL_FMT_GP )
+ bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
+ if ( call->output_tags & CALL_FMT_GQ )
+ bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+
+ mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
}
- if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+ if ( nals_ori!=call->nals_new )
+ mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
- // Set QUAL and calculate HWE-related annotations
+ // Set QUAL
if ( nAC )
{
- float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
- if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
-
- float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
- if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
-
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
+ rec->qual = max_qual;
}
else
{
// Set the quality of a REF site
- if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ if ( lk_sum!=-HUGE_VAL ) // likelihoods available, derive QUAL from them; the branches below handle the case of no support from (high quality) reads
+ rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
+ else if ( call->ac[0] )
rec->qual = call->theta ? -4.343*call->theta : 0;
else
- rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
+ bcf_float_set_missing(rec->qual);
}
- if ( rec->qual>999 ) rec->qual = 999;
- if ( rec->qual>50 ) rec->qual = rint(rec->qual);
-
// AC, AN
- if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+ if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
nAC += call->ac[0];
bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
// Remove unused alleles
- hts_expand(char*,nout,call->nals,call->als);
- for (i=0; i<nals; i++)
+ hts_expand(char*,call->nals_new,call->nals,call->als);
+ for (i=0; i<nals_ori; i++)
if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
- bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+ bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
- // DP4 tag
+ // DP4 and PV4 tags
if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
{
int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+
+ if ( call->output_tags & CALL_FMT_PV4 )
+ {
+ anno16_t a;
+ float tmpf[4];
+ int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
+ if ( is_tested )
+ {
+ for (i=0; i<4; i++) tmpf[i] = a.p[i];
+ bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+ }
+ }
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- return nout;
+ return call->nals_new;
}
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2018 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <htslib/faidx.h>
#include <htslib/kstring.h>
#include <htslib/khash_str2int.h>
+#include <htslib/hts_os.h>
#include <assert.h>
#include "regidx.h"
#include "bcftools.h"
#define MPLP_PRINT_MAPQ (1<<10)
#define MPLP_PER_SAMPLE (1<<11)
#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_REALN_PARTIAL (1<<13)
typedef struct _mplp_aux_t mplp_aux_t;
typedef struct _mplp_pileup_t mplp_pileup_t;
// Data shared by all bam files
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
+ max_indel_depth, max_read_len, fmt_flag, ambig_reads;
int rflag_require, rflag_filter, output_type;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
+ double indel_bias;
char *reg_fname, *pl_list, *fai_fname, *output_fname;
int reg_is_file, record_cmd_line, n_threads;
faidx_t *fai;
has_ref = 0;
}
- if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ // Allow sufficient room for bam_aux_append of ZQ tag without
+ // a realloc and consequent breakage of pileup's cached pointers.
+ if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
+ // Doing sam_prob_realn later is problematic as it adds to
+ // the tag list (ZQ or BQ), which causes a realloc of b->data.
+ // This happens after pileup has built a hash table on the
+ // read name. It's a deficiency in pileup IMO.
+
+ // We could implement a new sam_prob_realn that returns ZQ
+ // somewhere else and cache it ourselves (pileup clientdata),
+ // but for now we simply use a workaround.
+ //
+ // We create a fake tag of the correct length, which we remove
+ // just prior to calling sam_prob_realn so we can guarantee there is
+ // room. (We can't just make room now as bam_copy1 removes it
+ // again).
+ if (b->core.l_qseq > 500) {
+ uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
+ memset(ZQ, '@', b->core.l_qseq);
+ ZQ[b->core.l_qseq] = 0;
+ bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+ free(ZQ);
+ } else {
+ static uint8_t ZQ[501] =
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
+ ZQ[b->core.l_qseq] = 0;
+ bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+ ZQ[b->core.l_qseq] = '@';
+ }
+ }
+
if (has_ref && ma->conf->capQ_thres > 10) {
int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) continue; // skip
static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
{
 mplp_aux_t *ma = (mplp_aux_t *)data;
- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1;
- if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) )
- {
- int i;
- for (i=0; i<b->core.n_cigar; i++)
- {
- int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
- if ( cig!=BAM_CSOFT_CLIP ) continue;
- cd->i |= 1;
+ int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ cd->i = 0;
+ PLP_SET_SAMPLE_ID(cd->i, n);
+ // Whether read has a soft-clip is used in mplp_realn's heuristics.
+ // TODO: consider whether clip length is beneficial to use?
+ int i;
+ for (i=0; i<b->core.n_cigar; i++) {
+ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
+ if (cig == BAM_CSOFT_CLIP) {
+ PLP_SET_SOFT_CLIP(cd->i);
 break;
 }
 }
+
+ if (ma->conf->flag & MPLP_REALN) {
+ int i, tot_ins = 0;
+ uint32_t *cigar = bam_get_cigar(b);
+ int p = 0;
+ for (i=0; i<b->core.n_cigar; i++) {
+ int cig = cigar[i] & BAM_CIGAR_MASK;
+ if (bam_cigar_type(cig) & 2)
+ p += cigar[i] >> BAM_CIGAR_SHIFT;
+ if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
+ tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+ // Possible further optimisation, check tot_ins==1 later
+ // (and remove break) so we can detect single bp indels.
+ // We may want to focus BAQ on more complex regions only.
+ PLP_SET_INDEL(cd->i);
+ break;
+ }
+
+ // TODO: proper p->cd struct and have cd->i as a size rather
+ // than a flag.
+
+ // Then aggregate together the sizes and if just 1 size for all
+ // reads or 2 sizes for approx 50/50 split in all reads, then
+ // treat this as a well-aligned variant and don't run BAQ.
+ }
+ }
+
 return 0;
}
{
const bam_pileup1_t *p = plp[i] + j;
int id = PLP_SAMPLE_ID(p->cd.i);
- if (m->n_plp[id] == m->m_plp[id])
+ if (m->n_plp[id] == m->m_plp[id])
{
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
}
+/*
+ * Looks for an indel at this position and, when the heuristics below
+ * allow it, runs sam_prob_realn() (BAQ) on the reads in the pileup.
+ *
+ * Only reads that overlap an indel locus get realigned. This considerably
+ * reduces the cost of running BAQ while keeping the main benefits.
+ *
+ * Parameters, as used by the body:
+ * n - number of bams; n_plp and plp are indexed 0..n-1
+ * n_plp - number of pileup entries (reads) for each bam
+ * plp - pileup entries for each bam
+ * flag - MPLP_* bits; MPLP_REALN_PARTIAL and MPLP_REDO_BAQ are
+ * the ones consulted here
+ * max_read_len - reads with l_qseq above this are never realigned
+ * ref, ref_len - passed through unchanged to sam_prob_realn()
+ * pos - not referenced by the body
+ *
+ * With MPLP_REALN_PARTIAL set the function returns without realigning
+ * anything when no read reports an indel, or when fewer than 20% of the
+ * reads are soft-clipped, every read has the same p->indel value, and the
+ * reads reporting an indel number exactly one or fewer than 10% of the
+ * total.
+ *
+ * Each read visited by the realign loop gets bit 32768 set in
+ * b->core.flag so it is skipped on later calls; the bit is set before the
+ * max_read_len test, so over-length reads are marked as well.
+ *
+ * TODO: also consider only realigning reads that don't span the indel
+ * by more than a certain amount either side. I.e. focus BAQ only on reads
+ * ending adjacent to the indel, where the alignment is most likely to
+ * be wrong. (2nd TODO: do this based on sequence context; STRs bad, unique
+ * data good.)
+ *
+ * NB: this may sadly realign after we've already used the data. Hmm...
+ */
+static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
+ int flag, int max_read_len,
+ char *ref, int ref_len, int pos) {
+ int i, j, has_indel = 0, has_clip = 0, nt = 0;
+ int min_indel = INT_MAX, max_indel = INT_MIN;
+
+ // Is an indel present?
+ // NB: don't bother even checking if very long as almost guaranteed
+ // to have indel (and likely soft-clips too).
+ for (i = 0; i < n; i++) { // iterate over bams
+ nt += n_plp[i];
+ for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+ bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
+ has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+ // Has_clip is almost always true for very long reads
+ // (eg PacBio CCS), but these rarely matter as the clip
+ // is likely a long way from this indel.
+ has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0;
+ if (max_indel < p->indel)
+ max_indel = p->indel;
+ if (min_indel > p->indel)
+ min_indel = p->indel;
+ }
+ }
+
+ if (flag & MPLP_REALN_PARTIAL) {
+ if (has_indel == 0 ||
+ (has_clip < 0.2*nt && max_indel == min_indel &&
+ (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
+ return;
+ }
+
+ // Realign
+ for (i = 0; i < n; i++) { // iterate over bams
+ for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+ const bam_pileup1_t *p = plp[i] + j;
+ bam1_t *b = p->b;
+
+ // Avoid doing multiple times.
+ //
+ // Note we cannot modify p->cd.i here with a PLP_SET macro
+ // because the cd item is held by mpileup in an lbnode_t
+ // struct and copied over to the pileup struct for each
+ // iteration, essentially making p->cd.i read only.
+ //
+ // We could use our own structure (p->cd.p), allocated during
+ // the constructor, but for simplicity we play dirty and
+ // abuse an unused flag bit instead.
+ if (b->core.flag & 32768)
+ continue;
+ b->core.flag |= 32768;
+
+ if (b->core.l_qseq > max_read_len)
+ continue;
+
+ // Check p->cigar_ind and see what cigar elements are before
+ // and after. How close is this location to the end of the
+ // read? Only realign if we don't span by more than X bases.
+ //
+ // Again, best only done on deeper data as BAQ helps
+ // disproportionately more on shallow data sets.
+ //
+ // This rescues some of the false negatives that are caused by
+ // systematic reduction in quality due to sample vs ref alignment.
+
+// At deep coverage we skip realigning more reads as we have sufficient depth.
+// This rescues false negatives. At shallow depth we pay for this with
+// more FP so are more stringent on spanning size.
+#define REALN_DIST (40+10*(nt<40)+10*(nt<20))
+ uint32_t *cig = bam_get_cigar(b);
+ int ncig = b->core.n_cigar;
+
+ // Don't realign reads where indel is in middle?
+ // On long read data we don't care about soft-clips at the ends.
+ // For short read data, we always calc BAQ on these as they're
+ // a common source of false positives.
+ if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
+ // Left & right cigar op match.
+ int lr = b->core.l_qseq > 500;
+ int lm = 0, rm = 0, k;
+ for (k = 0; k < ncig; k++) {
+ int cop = bam_cigar_op(cig[k]);
+ if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ continue;
+
+ if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+ cop == BAM_CEQUAL)
+ lm += bam_cigar_oplen(cig[k]);
+ else
+ break;
+ }
+
+ for (k = ncig-1; k >= 0; k--) {
+ int cop = bam_cigar_op(cig[k]);
+ if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+ continue;
+
+ if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+ cop == BAM_CEQUAL)
+ rm += bam_cigar_oplen(cig[k]);
+ else
+ break;
+ }
+
+ if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
+ continue;
+
+ if (lm >= REALN_DIST && rm >= REALN_DIST &&
+ has_clip < (0.15+0.05*(nt>20))*nt)
+ continue;
+ }
+
+ if (b->core.l_qseq > 500) {
+ // don't do BAQ on long-read data if it's going to
+ // cause us to have a large bandwidth and be costly in CPU
+ int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+ if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000)
+ continue;
+ }
+
+ // Fudge: make room for ZQ tag.
+ uint8_t *_Q = bam_aux_get(b, "_Q");
+ if (_Q) bam_aux_del(b, _Q);
+ sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
+ }
+ }
+
+ return;
+}
+
static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
{
bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
int ret, i, tid, pos, ref_len;
char *ref;
- while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
{
if ( pos<beg || pos>end ) continue;
if ( conf->bed && tid >= 0 )
if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
if ( !overlap ) continue;
}
- mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+ int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+ if (has_ref && (conf->flag & MPLP_REALN))
+ mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
+ conf->max_read_len, ref, ref_len, pos);
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
conf->bc.tid = tid; conf->bc.pos = pos;
bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
+ conf->bca, 0);
flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
- if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
- && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && (bcf_callaux_clean(conf->bca, &conf->bc),
+ bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
{
- bcf_callaux_clean(conf->bca, &conf->bc);
for (i = 0; i < conf->gplp->n; ++i)
bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
- if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
{
bcf_clear1(conf->bcf_rec);
bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
conf->buf.l = 0;
ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
- if ( !conf->mplp_data[i]->iter )
+ if ( !conf->mplp_data[i]->iter )
{
conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
if ( conf->mplp_data[i]->iter ) {
conf->mplp_data[i]->h = hdr;
}
}
+ if ( !hdr ) {
+ fprintf(stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
+ exit(EXIT_FAILURE);
+ }
// allocate data storage proportionate to number of samples being studied sm->n
bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
- conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
// write the VCF header
- conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
if (conf->bcf_fp == NULL) {
fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
exit(EXIT_FAILURE);
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
if ( conf->fmt_flag&B2B_INFO_VDB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+
+ if (conf->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( conf->fmt_flag&B2B_INFO_RPB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_SCB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+ } else {
+ if ( conf->fmt_flag&B2B_INFO_RPB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+ }
+
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
#if CDF_MWU_TESTS
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
if ( conf->fmt_flag&B2B_FMT_ADR )
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
+ if ( conf->fmt_flag&B2B_FMT_QS )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
if ( conf->fmt_flag&B2B_INFO_AD )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
if ( conf->fmt_flag&B2B_INFO_ADF )
bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
- conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
+ conf->delta_baseQ);
conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->indel_bias = conf->indel_bias;
conf->bca->min_frac = conf->min_frac;
conf->bca->min_support = conf->min_support;
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
conf->bca->fmt_flag = conf->fmt_flag;
+ conf->bca->ambig_reads = conf->ambig_reads;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
+ for (i=0; i<nsmpl; i++)
+ conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
if (conf->fmt_flag)
{
assert( sizeof(float)==sizeof(int32_t) );
if ( nregs )
{
int ireg = 0;
- do
+ do
{
// first region is already positioned
if ( ireg++ > 0 )
conf->buf.l = 0;
ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
- for (i=0; i<conf->nfiles; i++)
+ for (i=0; i<conf->nfiles; i++)
{
hts_itr_destroy(conf->mplp_data[i]->iter);
conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
- if ( !conf->mplp_data[i]->iter )
+ if ( !conf->mplp_data[i]->iter )
{
conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
if ( conf->mplp_data[i]->iter ) {
free(conf->bc.ADR);
free(conf->bc.ADF);
free(conf->bc.SCR);
+ free(conf->bc.QS);
free(conf->bc.fmt_arr);
free(conf->bcr);
}
else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
+ else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
else
{
fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
"\n"
// source code in 80 columns, to the extent that's possible.)
fprintf(fp,
-"\n"
-"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
-"\n"
-"Input options:\n"
-" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
-" -A, --count-orphans do not discard anomalous read pairs\n"
-" -b, --bam-list FILE list of input BAM filenames, one per line\n"
-" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
-" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
-" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ "\n"
+ "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+ "\n"
+ "Input options:\n"
+ " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+ " -A, --count-orphans do not discard anomalous read pairs\n"
+ " -b, --bam-list FILE list of input BAM filenames, one per line\n"
+ " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+ " -C, --adjust-MQ INT adjust mapping quality [0]\n"
+ " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n"
+ " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+ " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+ " -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+ " --no-reference do not require fasta reference file\n"
+ " -G, --read-groups FILE select or exclude read groups listed in the file\n"
+ " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
fprintf(fp,
-" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
-" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
-" --no-reference do not require fasta reference file\n"
-" -G, --read-groups FILE select or exclude read groups listed in the file\n"
-" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
fprintf(fp,
-" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
fprintf(fp,
-" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
-" -R, --regions-file FILE restrict to regions listed in a file\n"
-" --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
-" [%s]\n", tmp_filter);
+ " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+ " -R, --regions-file FILE restrict to regions listed in a file\n"
+ " --ignore-RG ignore RG tags (one BAM = one sample)\n"
+ " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
fprintf(fp,
-" -s, --samples LIST comma separated list of samples to include\n"
-" -S, --samples-file FILE file of samples to include\n"
-" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
-" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
-" -x, --ignore-overlaps disable read-pair overlap detection\n"
-"\n"
-"Output options:\n"
-" -a, --annotate LIST optional tags to output; '?' to list []\n"
-" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
-" to minimum per-sample DP\n"
-" --no-version do not append version and command line to the header\n"
-" -o, --output FILE write output to FILE [standard output]\n"
-" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
-" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
-" --threads INT use multithreading with INT worker threads [0]\n"
-"\n"
-"SNP/INDEL genotype likelihoods options:\n"
-" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+ " [%s]\n", tmp_filter);
fprintf(fp,
-" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ " -s, --samples LIST comma separated list of samples to include\n"
+ " -S, --samples-file FILE file of samples to include\n"
+ " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+ " -x, --ignore-overlaps disable read-pair overlap detection\n"
+ " --seed INT random number seed used for sampling deep regions [0]\n"
+ "\n"
+ "Output options:\n"
+ " -a, --annotate LIST optional tags to output; '?' to list available tags []\n"
+ " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+ " to minimum per-sample DP\n"
+ " --no-version do not append version and command line to the header\n"
+ " -o, --output FILE write output to FILE [standard output]\n"
+ " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+ " 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+ " -U, --mwu-u use older probability scale for Mann-Whitney U test\n"
+ " --threads INT use multithreading with INT worker threads [0]\n"
+ "\n"
+ "SNP/INDEL genotype likelihoods options:\n"
+ " -X, --config STR Specify platform specific profiles (see below)\n"
+ " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
-" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
fprintf(fp,
-" -I, --skip-indels do not perform indel calling\n"
-" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
-" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ " -I, --skip-indels do not perform indel calling\n"
+ " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
-" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
-" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
-" -P, --platforms STR comma separated list of platforms for indels [all]\n"
-"\n"
-"Notes: Assuming diploid individuals.\n"
-"\n"
-"Example:\n"
-" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-"\n");
+ " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+ fprintf(fp,
+ " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+ " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+ " -P, --platforms STR comma separated list of platforms for indels [all]\n"
+ " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
+ fprintf(fp,
+ " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+ fprintf(fp,"\n");
+ fprintf(fp,
+ "Configuration profiles activated with -X, --config:\n"
+ " 1.12: -Q13 -h100 -m1 -F0.002\n"
+ " illumina: [ default values ]\n"
+ " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
+ " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
+ "\n"
+ "Notes: Assuming diploid individuals.\n"
+ "\n"
+ "Example:\n"
+ " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+ " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+ "\n");
free(tmp_require);
free(tmp_filter);
}
-int bam_mpileup(int argc, char *argv[])
+int main_mpileup(int argc, char *argv[])
{
int c;
const char *file_list = NULL;
int nfiles = 0, use_orphan = 0, noref = 0;
mplp_conf_t mplp;
memset(&mplp, 0, sizeof(mplp_conf_t));
- mplp.min_baseQ = 13;
+ mplp.min_baseQ = 1;
+ mplp.max_baseQ = 60;
+ mplp.delta_baseQ = 30;
mplp.capQ_thres = 0;
mplp.max_depth = 250; mplp.max_indel_depth = 250;
- mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
- mplp.min_frac = 0.002; mplp.min_support = 1;
- mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
+ mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
+ | MPLP_SMART_OVERLAPS;
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
mplp.record_cmd_line = 1;
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
- mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag()
+ // the default to be changed in future, see also parse_format_flag()
+ mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+ mplp.max_read_len = 500;
+ mplp.ambig_reads = B2B_DROP;
+ hts_srand48(0);
static const struct option lopts[] =
{
{"bam-list", required_argument, NULL, 'b'},
{"no-BAQ", no_argument, NULL, 'B'},
{"no-baq", no_argument, NULL, 'B'},
+ {"full-BAQ", no_argument, NULL, 'D'},
+ {"full-baq", no_argument, NULL, 'D'},
{"adjust-MQ", required_argument, NULL, 'C'},
{"adjust-mq", required_argument, NULL, 'C'},
{"max-depth", required_argument, NULL, 'd'},
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"min-bq", required_argument, NULL, 'Q'},
+ {"max-bq", required_argument, NULL, 11},
+ {"max-BQ", required_argument, NULL, 11},
+ {"delta-BQ", required_argument, NULL, 12},
{"ignore-overlaps", no_argument, NULL, 'x'},
{"output-type", required_argument, NULL, 'O'},
{"samples", required_argument, NULL, 's'},
{"annotate", required_argument, NULL, 'a'},
{"ext-prob", required_argument, NULL, 'e'},
{"gap-frac", required_argument, NULL, 'F'},
+ {"indel-bias", required_argument, NULL, 10},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
- {"min-ireads ", required_argument, NULL, 'm'},
+ {"min-ireads", required_argument, NULL, 'm'},
{"per-sample-mF", no_argument, NULL, 'p'},
{"per-sample-mf", no_argument, NULL, 'p'},
{"platforms", required_argument, NULL, 'P'},
+ {"max-read-len", required_argument, NULL, 'M'},
+ {"config", required_argument, NULL, 'X'},
+ {"mwu-u", no_argument, NULL, 'U'},
+ {"seed", required_argument, NULL, 13},
+ {"ambig-reads", required_argument, NULL, 14},
+ {"ar", required_argument, NULL, 14},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
case 'P': mplp.pl_list = strdup(optarg); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
- case 'O':
+ case 'O':
switch (optarg[0]) {
case 'b': mplp.output_type = FT_BCF_GZ; break;
case 'u': mplp.output_type = FT_BCF; break;
case 'z': mplp.output_type = FT_VCF_GZ; break;
case 'v': mplp.output_type = FT_VCF; break;
- default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
}
break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 11: mplp.max_baseQ = atoi(optarg); break;
+ case 12: mplp.delta_baseQ = atoi(optarg); break;
case 'b': file_list = optarg; break;
case 'o': {
char *end;
break;
case 'e': mplp.extQ = atoi(optarg); break;
case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 10: // --indel-bias (inverted so higher => more indels called)
+ if (atof(optarg) < 1e-2)
+ mplp.indel_bias = 1/1e2;
+ else
+ mplp.indel_bias = 1/atof(optarg);
+ break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
}
mplp.fmt_flag |= parse_format_flag(optarg);
break;
+ case 'M': mplp.max_read_len = atoi(optarg); break;
+ case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
+ case 'X':
+ if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+ mplp.min_frac = 0.1;
+ mplp.min_baseQ = 5;
+ mplp.max_baseQ = 50;
+ mplp.delta_baseQ = 10;
+ mplp.openQ = 25;
+ mplp.extQ = 1;
+ mplp.flag |= MPLP_REALN_PARTIAL;
+ mplp.max_read_len = 99999;
+ } else if (strcasecmp(optarg, "ont") == 0) {
+ fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
+ "a higher -P, eg -P0.01 or -P 0.1\n");
+ mplp.min_baseQ = 5;
+ mplp.max_baseQ = 30;
+ mplp.flag &= ~MPLP_REALN;
+ mplp.flag |= MPLP_NO_INDEL;
+ } else if (strcasecmp(optarg, "1.12") == 0) {
+ // 1.12 and earlier
+ mplp.min_frac = 0.002;
+ mplp.min_support = 1;
+ mplp.min_baseQ = 13;
+ mplp.tandemQ = 100;
+ mplp.flag &= ~MPLP_REALN_PARTIAL;
+ mplp.flag |= MPLP_REALN;
+ } else if (strcasecmp(optarg, "illumina") == 0) {
+ mplp.flag |= MPLP_REALN_PARTIAL;
+ } else {
+ fprintf(stderr, "Unknown configuration name '%s'\n"
+ "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+ optarg);
+ return 1;
+ }
+ break;
+ case 13: hts_srand48(atoi(optarg)); break;
+ case 14:
+ if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
+ else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
+ else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
+ else error("The option to --ambig-reads not recognised: %s\n",optarg);
+ break;
default:
fprintf(stderr,"Invalid option: '%c'\n", c);
return 1;
return 1;
}
int ret,i;
- if (file_list)
+ if (file_list)
{
if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
mplp.files = fn;
if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
if (mplp.reg) regidx_destroy(mplp.reg);
bam_smpl_destroy(mplp.bsmpl);
+
return ret;
}
/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
- Copyright (C) 2008-2018 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <htslib/faidx.h>
#include <htslib/kstring.h>
#include <htslib/khash_str2int.h>
+#include <htslib/hts_os.h>
#include <assert.h>
#include "regidx.h"
#include "bcftools.h"
#define MPLP_PRINT_MAPQ (1<<10)
#define MPLP_PER_SAMPLE (1<<11)
#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_REALN_PARTIAL (1<<13)
typedef struct _mplp_aux_t mplp_aux_t;
typedef struct _mplp_pileup_t mplp_pileup_t;
// Data shared by all bam files
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
+ max_indel_depth, max_read_len, fmt_flag, ambig_reads;
int rflag_require, rflag_filter, output_type;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
+ double indel_bias;
char *reg_fname, *pl_list, *fai_fname, *output_fname;
int reg_is_file, record_cmd_line, n_threads;
faidx_t *fai;
has_ref = 0;
}
- if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ // Allow sufficient room for bam_aux_append of ZQ tag without
+ // a realloc and consequent breakage of pileup's cached pointers.
+ if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
+ // Doing sam_prob_realn later is problematic as it adds to
+ // the tag list (ZQ or BQ), which causes a realloc of b->data.
+ // This happens after pileup has built a hash table on the
+ // read name. It's a deficiency in pileup IMO.
+
+ // We could implement a new sam_prob_realn that returns ZQ
+ // somewhere else and cache it ourselves (pileup clientdata),
+ // but for now we simply use a workaround.
+ //
+ // We create a fake tag of the correct length, which we remove
+ // just prior calling sam_prob_realn so we can guarantee there is
+ // room. (We can't just make room now as bam_copy1 removes it
+ // again).
+ if (b->core.l_qseq > 500) {
+ uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
+ memset(ZQ, '@', b->core.l_qseq);
+ ZQ[b->core.l_qseq] = 0;
+ bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+ free(ZQ);
+ } else {
+ static uint8_t ZQ[501] =
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+ "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
+ ZQ[b->core.l_qseq] = 0;
+ bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+ ZQ[b->core.l_qseq] = '@';
+ }
+ }
+
if (has_ref && ma->conf->capQ_thres > 10) {
int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) continue; // skip
static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
{
mplp_aux_t *ma = (mplp_aux_t *)data;
- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1;
- if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) )
- {
- int i;
- for (i=0; i<b->core.n_cigar; i++)
- {
- int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
- if ( cig!=BAM_CSOFT_CLIP ) continue;
- cd->i |= 1;
+ int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ cd->i = 0;
+ PLP_SET_SAMPLE_ID(cd->i, n);
+ // Whether read has a soft-clip is used in mplp_realn's heuristics.
+ // TODO: consider whether clip length is beneficial to use?
+ int i;
+ for (i=0; i<b->core.n_cigar; i++) {
+ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
+ if (cig == BAM_CSOFT_CLIP) {
+ PLP_SET_SOFT_CLIP(cd->i);
break;
}
}
+
+ if (ma->conf->flag & MPLP_REALN) {
+ int i, tot_ins = 0;
+ uint32_t *cigar = bam_get_cigar(b);
+ int p = 0;
+ for (i=0; i<b->core.n_cigar; i++) {
+ int cig = cigar[i] & BAM_CIGAR_MASK;
+ if (bam_cigar_type(cig) & 2)
+ p += cigar[i] >> BAM_CIGAR_SHIFT;
+ if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
+ tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+                // Possible further optimisation, check tot_ins==1 later
+ // (and remove break) so we can detect single bp indels.
+ // We may want to focus BAQ on more complex regions only.
+ PLP_SET_INDEL(cd->i);
+ break;
+ }
+
+ // TODO: proper p->cd struct and have cd->i as a size rather
+ // than a flag.
+
+ // Then aggregate together the sizes and if just 1 size for all
+ // reads or 2 sizes for approx 50/50 split in all reads, then
+ // treat this as a well-aligned variant and don't run BAQ.
+ }
+ }
+
return 0;
}
{
const bam_pileup1_t *p = plp[i] + j;
int id = PLP_SAMPLE_ID(p->cd.i);
- if (m->n_plp[id] == m->m_plp[id])
+ if (m->n_plp[id] == m->m_plp[id])
{
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
}
+/*
+ * Looks for an indel at this position and, when the site warrants it,
+ * runs BAQ realignment (sam_prob_realn) on the reads covering it.
+ *
+ * Only reads that overlap an indel locus get realigned.  This considerably
+ * reduces the cost of running BAQ while keeping the main benefits.
+ *
+ * Parameters:
+ *   n             number of pileup arrays in plp / entries in n_plp
+ *   n_plp         n_plp[i] is the number of reads held in plp[i]
+ *   plp           pileup entries for the current position
+ *   flag          MPLP_* bit flags; MPLP_REALN_PARTIAL and MPLP_REDO_BAQ
+ *                 are the ones consulted here
+ *   max_read_len  reads with l_qseq above this are never passed to BAQ
+ *   ref, ref_len  reference sequence handed on to sam_prob_realn()
+ *   pos           pileup position (not referenced by the current code)
+ *
+ * Side effects: every read visited by the realign loop gets bit 32768 set
+ * in b->core.flag so that it is considered at most once; reads that are
+ * realigned have their "_Q" aux tag (if any) deleted before
+ * sam_prob_realn() is called on them.
+ *
+ * TODO: also consider only realigning reads that don't span the indel
+ * by more than a certain amount either-side.  Ie focus BAQ only on reads
+ * ending adjacent to the indel, where the alignment is most likely to
+ * be wrong.  (2nd TODO: do this based on sequence context; STRs bad, unique
+ * data good.)
+ *
+ * NB: this may sadly realign after we've already used the data.  Hmm...
+ */
+static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
+                       int flag, int max_read_len,
+                       char *ref, int ref_len, int pos) {
+    int i, j, has_indel = 0, has_clip = 0, nt = 0;
+    int min_indel = INT_MAX, max_indel = INT_MIN;
+
+    // Is an indel present.
+    // NB: don't bother even checking if very long as almost guaranteed
+    // to have indel (and likely soft-clips too).
+    for (i = 0; i < n; i++) { // iterate over bams
+        nt += n_plp[i];
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
+            has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+            // Has_clip is almost always true for very long reads
+            // (eg PacBio CCS), but these rarely matter as the clip
+            // is likely a long way from this indel.
+            has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0;
+            if (max_indel < p->indel)
+                max_indel = p->indel;
+            if (min_indel > p->indel)
+                min_indel = p->indel;
+        }
+    }
+
+    // In partial mode skip the whole site when no read carries an indel, or
+    // when clips are rare, every read reports the same p->indel and the
+    // indel-bearing reads are a small minority (or a single read).
+    if (flag & MPLP_REALN_PARTIAL) {
+        if (has_indel == 0 ||
+            (has_clip < 0.2*nt && max_indel == min_indel &&
+             (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
+            return;
+    }
+
+    // Realign
+    for (i = 0; i < n; i++) { // iterate over bams
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            const bam_pileup1_t *p = plp[i] + j;
+            bam1_t *b = p->b;
+
+            // Avoid doing multiple times.
+            //
+            // Note we cannot modify p->cd.i here with a PLP_SET macro
+            // because the cd item is held by mpileup in an lbnode_t
+            // struct and copied over to the pileup struct for each
+            // iteration, essentially making p->cd.i read only.
+            //
+            // We could use our own structure (p->cd.p), allocated during
+            // the constructor, but for simplicity we play dirty and
+            // abuse an unused flag bit instead.
+            if (b->core.flag & 32768)
+                continue;
+            b->core.flag |= 32768;
+
+            if (b->core.l_qseq > max_read_len)
+                continue;
+
+            // Check p->cigar_ind and see what cigar elements are before
+            // and after.  How close is this location to the end of the
+            // read?  Only realign if we don't span by more than X bases.
+            //
+            // Again, best only done on deeper data as BAQ helps
+            // disproportionately more on shallow data sets.
+            //
+            // This rescues some of the false negatives that are caused by
+            // systematic reduction in quality due to sample vs ref alignment.
+
+// At deep coverage we skip realigning more reads as we have sufficient depth.
+// This rescues for false negatives.  At shallow depth we pay for this with
+// more FP so are more stringent on spanning size.
+#define REALN_DIST (40+10*(nt<40)+10*(nt<20))
+            uint32_t *cig = bam_get_cigar(b);
+            int ncig = b->core.n_cigar;
+
+            // Don't realign reads where indel is in middle?
+            // On long read data we don't care about soft-clips at the ends.
+            // For short read data, we always calc BAQ on these as they're
+            // a common source of false positives.
+            if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
+                // Left & right cigar op match.
+                // lm / rm: total length of the leading / trailing run of
+                // M, = and X operations (clips skipped for long reads).
+                int lr = b->core.l_qseq > 500;
+                int lm = 0, rm = 0, k;
+                for (k = 0; k < ncig; k++) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        lm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                for (k = ncig-1; k >= 0; k--) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        rm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
+                    continue;
+
+                if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                    has_clip < (0.15+0.05*(nt>20))*nt)
+                    continue;
+            }
+
+            if (b->core.l_qseq > 500) {
+                // don't do BAQ on long-read data if it's going to
+                // cause us to have a large band-width and be costly in CPU.
+                //
+                // The product is evaluated in 64 bits: with a raised
+                // max_read_len, |rlen - l_qseq| * l_qseq can exceed INT_MAX
+                // (e.g. long reads with large deletions or ref-skips), and
+                // signed int overflow would defeat this check.
+                int64_t rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+                int64_t dl = rl - b->core.l_qseq;
+                if (dl < 0)
+                    dl = -dl;
+                if (dl * b->core.l_qseq >= 500000)
+                    continue;
+            }
+
+            // Fudge: make room for ZQ tag.
+            uint8_t *_Q = bam_aux_get(b, "_Q");
+            if (_Q) bam_aux_del(b, _Q);
+            sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
+        }
+    }
+
+    return;
+}
+
+
static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
{
bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
int ret, i, tid, pos, ref_len;
char *ref;
- while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
{
if ( pos<beg || pos>end ) continue;
if ( conf->bed && tid >= 0 )
if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
if ( !overlap ) continue;
}
- mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+ int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+ if (has_ref && (conf->flag & MPLP_REALN))
+ mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
+ conf->max_read_len, ref, ref_len, pos);
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
conf->bc.tid = tid; conf->bc.pos = pos;
bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
+ conf->bca, 0);
flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
- if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
- && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && (bcf_callaux_clean(conf->bca, &conf->bc),
+ bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
{
- bcf_callaux_clean(conf->bca, &conf->bc);
for (i = 0; i < conf->gplp->n; ++i)
bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
- if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
{
bcf_clear1(conf->bcf_rec);
bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
{
if (conf->nfiles == 0) {
fprintf(bcftools_stderr,"[%s] no input file/data given\n", __func__);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
mplp_ref_t mp_ref = MPLP_REF_INIT;
conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
if ( !conf->reg ) {
fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
}
else
conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
}
nregs = regidx_nregs(conf->reg);
if ( !conf->mplp_data[i]->fp )
{
fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
fprintf(bcftools_stderr, "[%s] failed to process %s: %s\n",
__func__, conf->fai_fname, strerror(errno));
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
conf->mplp_data[i]->conf = conf;
conf->mplp_data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
if ( !h_tmp ) {
fprintf(bcftools_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
if (idx == NULL) {
fprintf(bcftools_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
conf->buf.l = 0;
ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
- if ( !conf->mplp_data[i]->iter )
+ if ( !conf->mplp_data[i]->iter )
{
conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
if ( conf->mplp_data[i]->iter ) {
fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
if ( nregs==1 ) // no need to keep the index in memory
hts_idx_destroy(idx);
conf->mplp_data[i]->h = hdr;
}
}
+ if ( !hdr ) {
+ fprintf(bcftools_stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
+ bcftools_exit(EXIT_FAILURE);
+ }
// allocate data storage proportionate to number of samples being studied sm->n
bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
- conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
// write the VCF header
- conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
if (conf->bcf_fp == NULL) {
fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
if ( conf->fmt_flag&B2B_INFO_VDB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+
+ if (conf->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( conf->fmt_flag&B2B_INFO_RPB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_SCB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+ } else {
+ if ( conf->fmt_flag&B2B_INFO_RPB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+ }
+
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
#if CDF_MWU_TESTS
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
if ( conf->fmt_flag&B2B_FMT_ADR )
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
+ if ( conf->fmt_flag&B2B_FMT_QS )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
if ( conf->fmt_flag&B2B_INFO_AD )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
if ( conf->fmt_flag&B2B_INFO_ADF )
bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
- conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
+ conf->delta_baseQ);
conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->indel_bias = conf->indel_bias;
conf->bca->min_frac = conf->min_frac;
conf->bca->min_support = conf->min_support;
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
conf->bca->fmt_flag = conf->fmt_flag;
+ conf->bca->ambig_reads = conf->ambig_reads;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
+ for (i=0; i<nsmpl; i++)
+ conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
if (conf->fmt_flag)
{
assert( sizeof(float)==sizeof(int32_t) );
if ( nregs )
{
int ireg = 0;
- do
+ do
{
// first region is already positioned
if ( ireg++ > 0 )
conf->buf.l = 0;
ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
- for (i=0; i<conf->nfiles; i++)
+ for (i=0; i<conf->nfiles; i++)
{
hts_itr_destroy(conf->mplp_data[i]->iter);
conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
- if ( !conf->mplp_data[i]->iter )
+ if ( !conf->mplp_data[i]->iter )
{
conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
if ( conf->mplp_data[i]->iter ) {
fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
bam_mplp_reset(conf->iter);
}
free(conf->bc.ADR);
free(conf->bc.ADF);
free(conf->bc.SCR);
+ free(conf->bc.QS);
free(conf->bc.fmt_arr);
free(conf->bcr);
}
else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
+ else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
else
{
fprintf(bcftools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
free(tags[i]);
}
" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
"\n"
// source code in 80 columns, to the extent that's possible.)
fprintf(fp,
-"\n"
-"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
-"\n"
-"Input options:\n"
-" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
-" -A, --count-orphans do not discard anomalous read pairs\n"
-" -b, --bam-list FILE list of input BAM filenames, one per line\n"
-" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
-" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
-" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ "\n"
+ "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+ "\n"
+ "Input options:\n"
+ " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+ " -A, --count-orphans do not discard anomalous read pairs\n"
+ " -b, --bam-list FILE list of input BAM filenames, one per line\n"
+ " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+ " -C, --adjust-MQ INT adjust mapping quality [0]\n"
+ " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n"
+ " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+ " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+ " -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+ " --no-reference do not require fasta reference file\n"
+ " -G, --read-groups FILE select or exclude read groups listed in the file\n"
+ " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
fprintf(fp,
-" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
-" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
-" --no-reference do not require fasta reference file\n"
-" -G, --read-groups FILE select or exclude read groups listed in the file\n"
-" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
fprintf(fp,
-" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
fprintf(fp,
-" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
-" -R, --regions-file FILE restrict to regions listed in a file\n"
-" --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
-" [%s]\n", tmp_filter);
+ " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+ " -R, --regions-file FILE restrict to regions listed in a file\n"
+ " --ignore-RG ignore RG tags (one BAM = one sample)\n"
+ " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
fprintf(fp,
-" -s, --samples LIST comma separated list of samples to include\n"
-" -S, --samples-file FILE file of samples to include\n"
-" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
-" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
-" -x, --ignore-overlaps disable read-pair overlap detection\n"
-"\n"
-"Output options:\n"
-" -a, --annotate LIST optional tags to output; '?' to list []\n"
-" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
-" to minimum per-sample DP\n"
-" --no-version do not append version and command line to the header\n"
-" -o, --output FILE write output to FILE [standard output]\n"
-" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
-" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
-" --threads INT use multithreading with INT worker threads [0]\n"
-"\n"
-"SNP/INDEL genotype likelihoods options:\n"
-" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+ " [%s]\n", tmp_filter);
fprintf(fp,
-" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ " -s, --samples LIST comma separated list of samples to include\n"
+ " -S, --samples-file FILE file of samples to include\n"
+ " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+ " -x, --ignore-overlaps disable read-pair overlap detection\n"
+ " --seed INT random number seed used for sampling deep regions [0]\n"
+ "\n"
+ "Output options:\n"
+ " -a, --annotate LIST optional tags to output; '?' to list available tags []\n"
+ " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+ " to minimum per-sample DP\n"
+ " --no-version do not append version and command line to the header\n"
+ " -o, --output FILE write output to FILE [standard output]\n"
+ " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+ " 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+ " -U, --mwu-u use older probability scale for Mann-Whitney U test\n"
+ " --threads INT use multithreading with INT worker threads [0]\n"
+ "\n"
+ "SNP/INDEL genotype likelihoods options:\n"
+ " -X, --config STR Specify platform specific profiles (see below)\n"
+ " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
-" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
fprintf(fp,
-" -I, --skip-indels do not perform indel calling\n"
-" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
-" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ " -I, --skip-indels do not perform indel calling\n"
+ " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
-" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
-" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
-" -P, --platforms STR comma separated list of platforms for indels [all]\n"
-"\n"
-"Notes: Assuming diploid individuals.\n"
-"\n"
-"Example:\n"
-" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-"\n");
+ " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+ fprintf(fp,
+ " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+ " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+ " -P, --platforms STR comma separated list of platforms for indels [all]\n"
+ " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
+ fprintf(fp,
+ " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+ fprintf(fp,"\n");
+ fprintf(fp,
+ "Configuration profiles activated with -X, --config:\n"
+ " 1.12: -Q13 -h100 -m1 -F0.002\n"
+ " illumina: [ default values ]\n"
+ " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
+ " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
+ "\n"
+ "Notes: Assuming diploid individuals.\n"
+ "\n"
+ "Example:\n"
+ " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+ " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+ "\n");
free(tmp_require);
free(tmp_filter);
}
-int bam_mpileup(int argc, char *argv[])
+int main_mpileup(int argc, char *argv[])
{
int c;
const char *file_list = NULL;
int nfiles = 0, use_orphan = 0, noref = 0;
mplp_conf_t mplp;
memset(&mplp, 0, sizeof(mplp_conf_t));
- mplp.min_baseQ = 13;
+ mplp.min_baseQ = 1;
+ mplp.max_baseQ = 60;
+ mplp.delta_baseQ = 30;
mplp.capQ_thres = 0;
mplp.max_depth = 250; mplp.max_indel_depth = 250;
- mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
- mplp.min_frac = 0.002; mplp.min_support = 1;
- mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
+ mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
+ | MPLP_SMART_OVERLAPS;
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
mplp.record_cmd_line = 1;
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
- mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag()
+ // the default to be changed in future, see also parse_format_flag()
+ mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+ mplp.max_read_len = 500;
+ mplp.ambig_reads = B2B_DROP;
+ hts_srand48(0);
static const struct option lopts[] =
{
{"bam-list", required_argument, NULL, 'b'},
{"no-BAQ", no_argument, NULL, 'B'},
{"no-baq", no_argument, NULL, 'B'},
+ {"full-BAQ", no_argument, NULL, 'D'},
+ {"full-baq", no_argument, NULL, 'D'},
{"adjust-MQ", required_argument, NULL, 'C'},
{"adjust-mq", required_argument, NULL, 'C'},
{"max-depth", required_argument, NULL, 'd'},
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"min-bq", required_argument, NULL, 'Q'},
+ {"max-bq", required_argument, NULL, 11},
+ {"max-BQ", required_argument, NULL, 11},
+ {"delta-BQ", required_argument, NULL, 12},
{"ignore-overlaps", no_argument, NULL, 'x'},
{"output-type", required_argument, NULL, 'O'},
{"samples", required_argument, NULL, 's'},
{"annotate", required_argument, NULL, 'a'},
{"ext-prob", required_argument, NULL, 'e'},
{"gap-frac", required_argument, NULL, 'F'},
+ {"indel-bias", required_argument, NULL, 10},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
- {"min-ireads ", required_argument, NULL, 'm'},
+ {"min-ireads", required_argument, NULL, 'm'},
{"per-sample-mF", no_argument, NULL, 'p'},
{"per-sample-mf", no_argument, NULL, 'p'},
{"platforms", required_argument, NULL, 'P'},
+ {"max-read-len", required_argument, NULL, 'M'},
+ {"config", required_argument, NULL, 'X'},
+ {"mwu-u", no_argument, NULL, 'U'},
+ {"seed", required_argument, NULL, 13},
+ {"ambig-reads", required_argument, NULL, 14},
+ {"ar", required_argument, NULL, 14},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
{
fprintf(bcftools_stderr,"Could not parse the targets: %s\n", optarg);
- exit(EXIT_FAILURE);
+ bcftools_exit(EXIT_FAILURE);
}
break;
case 'T':
case 'P': mplp.pl_list = strdup(optarg); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
- case 'O':
+ case 'O':
switch (optarg[0]) {
case 'b': mplp.output_type = FT_BCF_GZ; break;
case 'u': mplp.output_type = FT_BCF; break;
case 'z': mplp.output_type = FT_VCF_GZ; break;
case 'v': mplp.output_type = FT_VCF; break;
- default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
}
break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 11: mplp.max_baseQ = atoi(optarg); break;
+ case 12: mplp.delta_baseQ = atoi(optarg); break;
case 'b': file_list = optarg; break;
case 'o': {
char *end;
break;
case 'e': mplp.extQ = atoi(optarg); break;
case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 10: // --indel-bias (inverted so higher => more indels called)
+ if (atof(optarg) < 1e-2)
+ mplp.indel_bias = 1/1e2;
+ else
+ mplp.indel_bias = 1/atof(optarg);
+ break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
}
mplp.fmt_flag |= parse_format_flag(optarg);
break;
+ case 'M': mplp.max_read_len = atoi(optarg); break;
+ case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
+ case 'X':
+ if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+ mplp.min_frac = 0.1;
+ mplp.min_baseQ = 5;
+ mplp.max_baseQ = 50;
+ mplp.delta_baseQ = 10;
+ mplp.openQ = 25;
+ mplp.extQ = 1;
+ mplp.flag |= MPLP_REALN_PARTIAL;
+ mplp.max_read_len = 99999;
+ } else if (strcasecmp(optarg, "ont") == 0) {
+ fprintf(bcftools_stderr, "For ONT it may be beneficial to also run bcftools call with "
+ "a higher -P, eg -P0.01 or -P 0.1\n");
+ mplp.min_baseQ = 5;
+ mplp.max_baseQ = 30;
+ mplp.flag &= ~MPLP_REALN;
+ mplp.flag |= MPLP_NO_INDEL;
+ } else if (strcasecmp(optarg, "1.12") == 0) {
+ // 1.12 and earlier
+ mplp.min_frac = 0.002;
+ mplp.min_support = 1;
+ mplp.min_baseQ = 13;
+ mplp.tandemQ = 100;
+ mplp.flag &= ~MPLP_REALN_PARTIAL;
+ mplp.flag |= MPLP_REALN;
+ } else if (strcasecmp(optarg, "illumina") == 0) {
+ mplp.flag |= MPLP_REALN_PARTIAL;
+ } else {
+ fprintf(bcftools_stderr, "Unknown configuration name '%s'\n"
+ "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+ optarg);
+ return 1;
+ }
+ break;
+ case 13: hts_srand48(atoi(optarg)); break;
+ case 14:
+ if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
+ else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
+ else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
+ else error("The option to --ambig-reads not recognised: %s\n",optarg);
+ break;
default:
fprintf(bcftools_stderr,"Invalid option: '%c'\n", c);
return 1;
return 1;
}
int ret,i;
- if (file_list)
+ if (file_list)
{
if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
mplp.files = fn;
if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
if (mplp.reg) regidx_destroy(mplp.reg);
bam_smpl_destroy(mplp.bsmpl);
+
return ret;
}
/*
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2015 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* prob1.c -- mathematical utility functions.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
/* prob1.c -- mathematical utility functions.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
/* prob1.h -- mathematical utility functions.
Copyright (C) 2010, 2011 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
+ Copyright (C) 2012, 2013-2014 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
/* rbuf.h -- round buffers.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/*
- Copyright (C) 2014-2017 Genome Research Ltd.
+ Copyright (C) 2014-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.pysam.h"
/*
- Copyright (C) 2014-2017 Genome Research Ltd.
+ Copyright (C) 2014-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/*
- Copyright (C) 2014-2016 Genome Research Ltd.
+ Copyright (C) 2014-2016, 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014-2018 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
typedef struct _args_t
{
char **argv, *fname, *samples_fname, *header_fname, *output_fname;
- char *fai_fname, *rm_tmpfile;
+ char *fai_fname, *rm_tmpfile, *tmp_prefix;
htsFile *fp;
htsFormat type;
htsThreadPool *threads;
free(key.s); free(val.s); free(tmp.s);
return q;
}
+/*
+ *  Build the writable file-name template later handed to mkstemp().
+ *
+ *  tmp_prefix: user-supplied prefix (the -T, --temp-prefix option) or NULL.
+ *
+ *  With a prefix, the result is the prefix followed by "XXXXXX". Without
+ *  one, the result is "/tmp/bcftools.XXXXXX", or on Windows the path
+ *  reported by GetTempPath() followed by "/bcftools.XXXXXX".
+ *
+ *  Returns a newly allocated string owned by the caller. Allocation
+ *  failures are reported through error() instead of returning NULL, so
+ *  callers can pass the result straight to mkstemp().
+ */
+char *init_tmp_prefix(const char *tmp_prefix)
+{
+    char *prefix = NULL;
+    if ( tmp_prefix )
+    {
+        size_t len = strlen(tmp_prefix);
+        // +6 for "XXXXXX", +1 for the terminating NUL (zeroed by calloc)
+        prefix = (char*) calloc(len+7,1);
+        if ( !prefix ) error("Could not allocate memory for the temporary file name\n");
+        memcpy(prefix,tmp_prefix,len);
+        memcpy(prefix+len,"XXXXXX",6);
+    }
+    else
+    {
+        #ifdef _WIN32
+            char tmp_path[MAX_PATH];
+            int ret = GetTempPath(MAX_PATH, tmp_path);
+            if (!ret || ret > MAX_PATH)
+                error("Could not get the path to the temporary folder\n");
+            // make sure the suffix fits before strcat() writes into tmp_path
+            if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH)
+                error("Full path to the temporary folder is too long\n");
+            strcat(tmp_path, "/bcftools.XXXXXX");
+            prefix = strdup(tmp_path);
+        #else
+            prefix = strdup("/tmp/bcftools.XXXXXX");
+        #endif
+        if ( !prefix ) error("Could not allocate memory for the temporary file name\n");
+    }
+    return prefix;
+}
static void update_from_fai(args_t *args)
{
if ( !strcmp("-",args->fname) )
faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-#ifdef _WIN32
- char tmp_path[MAX_PATH];
- int ret = GetTempPath(MAX_PATH, tmp_path);
- if (!ret || ret > MAX_PATH)
- error("Could not get the path to the temporary folder\n");
- if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH)
- error("Full path to the temporary folder is too long\n");
- strcat(tmp_path, "/bcftools-fai-header-XXXXXX");
- args->rm_tmpfile = strdup(tmp_path);
-#else
- args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX");
-#endif
+ args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
int fd = mkstemp(args->rm_tmpfile);
if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
hdr->s[hdr->l] = 0;
kstring_t tmp = {0,0,0};
- i = j = n = 0;
- while ( hdr->s[idx+i] && hdr->s[idx+i])
+ i = j = n = 0; // i:traverse the #CHROM line 1 by 1; j:points to the last column
+ while ( hdr->s[idx+i] )
{
if ( hdr->s[idx+i]=='\t' )
{
if ( ++n>9 )
{
- char *ori = khash_str2str_get(hash,hdr->s+idx+j);
- kputs(ori ? ori : hdr->s+idx+j, &tmp);
+ char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+ kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
}
else
kputs(hdr->s+idx+j, &tmp);
}
i++;
}
- char *ori = khash_str2str_get(hash,hdr->s+idx+j);
- kputs(ori ? ori : hdr->s+idx+j, &tmp);
+ char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+ kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
khash_str2str_destroy_free_all(hash);
if ( hdr->s[i]=='\t' ) ncols++;
i--;
}
- if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s);
+ if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) )
+ {
+ if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) )
+ error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
+
+ error("Could not parse the header: %s\n", hdr->s);
+ }
// Are the samples "old-sample new-sample" pairs?
if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return;
int nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
int nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
int i, nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
fprintf(stderr, "Usage: bcftools reheader [OPTIONS] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -f, --fai <file> update sequences and their lengths from the .fai file\n");
- fprintf(stderr, " -h, --header <file> new header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -s, --samples <file> new sample names\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads (BCF only) [0]\n");
+ fprintf(stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n");
+ fprintf(stderr, " -h, --header FILE new header\n");
+ fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(stderr, " -s, --samples FILE new sample names\n");
+#ifdef _WIN32
+ fprintf(stderr, " -T, --temp-prefix PATH template for temporary file name [/bcftools.XXXXXX]\n");
+#else
+ fprintf(stderr, " -T, --temp-prefix PATH template for temporary file name [/tmp/bcftools.XXXXXX]\n");
+#endif
+ fprintf(stderr, " --threads INT use multithreading with <int> worker threads (BCF only) [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Example:\n");
fprintf(stderr, " # Write out the header to be modified\n");
static struct option loptions[] =
{
+ {"temp-prefix",1,0,'T'},
{"fai",1,0,'f'},
{"output",1,0,'o'},
{"header",1,0,'h'},
{"threads",1,NULL,1},
{0,0,0,0}
};
- while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
{
switch (c)
{
case 1 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'T': args->tmp_prefix = optarg; break;
case 'f': args->fai_fname = optarg; break;
case 'o': args->output_fname = optarg; break;
case 's': args->samples_fname = optarg; break;
if ( args->type.format==vcf )
{
- if ( args->type.compression==bgzf || args->type.compression==gzip )
+ if ( args->type.compression==bgzf )
reheader_vcf_gz(args);
- else
+ else if ( args->type.compression==no_compression )
reheader_vcf(args);
+ else if ( args->type.compression==gzip )
+ error("Error: cannot reheader gzip-compressed files, first convert with `bcftools view --output-type` to a supported format\n");
+ else
+ error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname);
}
else
reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014-2018 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
typedef struct _args_t
{
char **argv, *fname, *samples_fname, *header_fname, *output_fname;
- char *fai_fname, *rm_tmpfile;
+ char *fai_fname, *rm_tmpfile, *tmp_prefix;
htsFile *fp;
htsFormat type;
htsThreadPool *threads;
free(key.s); free(val.s); free(tmp.s);
return q;
}
+/*
+ *  Build the writable file-name template later handed to mkstemp().
+ *
+ *  tmp_prefix: user-supplied prefix (the -T, --temp-prefix option) or NULL.
+ *
+ *  With a prefix, the result is the prefix followed by "XXXXXX". Without
+ *  one, the result is "/tmp/bcftools.XXXXXX", or on Windows the path
+ *  reported by GetTempPath() followed by "/bcftools.XXXXXX".
+ *
+ *  Returns a newly allocated string owned by the caller. Allocation
+ *  failures are reported through error() instead of returning NULL, so
+ *  callers can pass the result straight to mkstemp().
+ */
+char *init_tmp_prefix(const char *tmp_prefix)
+{
+    char *prefix = NULL;
+    if ( tmp_prefix )
+    {
+        size_t len = strlen(tmp_prefix);
+        // +6 for "XXXXXX", +1 for the terminating NUL (zeroed by calloc)
+        prefix = (char*) calloc(len+7,1);
+        if ( !prefix ) error("Could not allocate memory for the temporary file name\n");
+        memcpy(prefix,tmp_prefix,len);
+        memcpy(prefix+len,"XXXXXX",6);
+    }
+    else
+    {
+        #ifdef _WIN32
+            char tmp_path[MAX_PATH];
+            int ret = GetTempPath(MAX_PATH, tmp_path);
+            if (!ret || ret > MAX_PATH)
+                error("Could not get the path to the temporary folder\n");
+            // make sure the suffix fits before strcat() writes into tmp_path
+            if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH)
+                error("Full path to the temporary folder is too long\n");
+            strcat(tmp_path, "/bcftools.XXXXXX");
+            prefix = strdup(tmp_path);
+        #else
+            prefix = strdup("/tmp/bcftools.XXXXXX");
+        #endif
+        if ( !prefix ) error("Could not allocate memory for the temporary file name\n");
+    }
+    return prefix;
+}
static void update_from_fai(args_t *args)
{
if ( !strcmp("-",args->fname) )
faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-#ifdef _WIN32
- char tmp_path[MAX_PATH];
- int ret = GetTempPath(MAX_PATH, tmp_path);
- if (!ret || ret > MAX_PATH)
- error("Could not get the path to the temporary folder\n");
- if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH)
- error("Full path to the temporary folder is too long\n");
- strcat(tmp_path, "/bcftools-fai-header-XXXXXX");
- args->rm_tmpfile = strdup(tmp_path);
-#else
- args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX");
-#endif
+ args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
int fd = mkstemp(args->rm_tmpfile);
if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
hdr->s[hdr->l] = 0;
kstring_t tmp = {0,0,0};
- i = j = n = 0;
- while ( hdr->s[idx+i] && hdr->s[idx+i])
+ i = j = n = 0; // i:traverse the #CHROM line 1 by 1; j:points to the last column
+ while ( hdr->s[idx+i] )
{
if ( hdr->s[idx+i]=='\t' )
{
if ( ++n>9 )
{
- char *ori = khash_str2str_get(hash,hdr->s+idx+j);
- kputs(ori ? ori : hdr->s+idx+j, &tmp);
+ char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+ kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
}
else
kputs(hdr->s+idx+j, &tmp);
}
i++;
}
- char *ori = khash_str2str_get(hash,hdr->s+idx+j);
- kputs(ori ? ori : hdr->s+idx+j, &tmp);
+ char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+ kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
khash_str2str_destroy_free_all(hash);
if ( hdr->s[i]=='\t' ) ncols++;
i--;
}
- if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s);
+ if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) )
+ {
+ if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) )
+ error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
+
+ error("Could not parse the header: %s\n", hdr->s);
+ }
// Are the samples "old-sample new-sample" pairs?
if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return;
int nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
int nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
int i, nsamples = 0;
char **samples = NULL;
if ( args->samples_fname )
+ {
samples = hts_readlines(args->samples_fname, &nsamples);
+ if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+ }
if ( args->header_fname )
{
free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -f, --fai <file> update sequences and their lengths from the .fai file\n");
- fprintf(bcftools_stderr, " -h, --header <file> new header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -s, --samples <file> new sample names\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads (BCF only) [0]\n");
+ fprintf(bcftools_stderr, " -f, --fai FILE update sequences and their lengths from the .fai file\n");
+ fprintf(bcftools_stderr, " -h, --header FILE new header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -s, --samples FILE new sample names\n");
+#ifdef _WIN32
+ fprintf(bcftools_stderr, " -T, --temp-prefix PATH template for temporary file name [/bcftools.XXXXXX]\n");
+#else
+ fprintf(bcftools_stderr, " -T, --temp-prefix PATH template for temporary file name [/tmp/bcftools.XXXXXX]\n");
+#endif
+ fprintf(bcftools_stderr, " --threads INT use multithreading with <int> worker threads (BCF only) [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Example:\n");
fprintf(bcftools_stderr, " # Write out the header to be modified\n");
fprintf(bcftools_stderr, " # Reheader the file\n");
fprintf(bcftools_stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_reheader(int argc, char *argv[])
static struct option loptions[] =
{
+ {"temp-prefix",1,0,'T'},
{"fai",1,0,'f'},
{"output",1,0,'o'},
{"header",1,0,'h'},
{"threads",1,NULL,1},
{0,0,0,0}
};
- while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
{
switch (c)
{
case 1 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'T': args->tmp_prefix = optarg; break;
case 'f': args->fai_fname = optarg; break;
case 'o': args->output_fname = optarg; break;
case 's': args->samples_fname = optarg; break;
if ( args->type.format==vcf )
{
- if ( args->type.compression==bgzf || args->type.compression==gzip )
+ if ( args->type.compression==bgzf )
reheader_vcf_gz(args);
- else
+ else if ( args->type.compression==no_compression )
reheader_vcf(args);
+ else if ( args->type.compression==gzip )
+ error("Error: cannot reheader gzip-compressed files, first convert with `bcftools view --output-type` to a supported format\n");
+ else
+ error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname);
}
else
reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
/*
- Copyright (C) 2016 Genome Research Ltd.
+ Copyright (C) 2016, 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.pysam.h"
/*
- Copyright (C) 2016 Genome Research Ltd.
+ Copyright (C) 2016, 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
--- /dev/null
+/* str_finder.c -- Short Tandem Repeat finder.
+ Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+ Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include "str_finder.h"
+#include "utlist.h"
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+typedef unsigned char uc;
+
+/*
+ * Records one tandem repeat on the repeat list.
+ *
+ * list: repeat list; the new element is appended to it
+ * cons, clen: sequence buffer and its length; '*' bytes are treated as pads
+ * pos: index in cons at which the caller detected the repeat (find_STR
+ * passes the index of the base that completed the second copy)
+ * rlen: length of the repeat unit, counted in non-pad bases
+ * lower_only: if non-zero, the repeat is kept only when at least one byte
+ * of cons[start..end] satisfies islower()
+ * w: caller's rolling hash; passed by value, so updates stay local
+ *
+ * Nothing is added when the tail of the list already covers this repeat,
+ * when malloc fails, or when the lower_only test rejects it.
+ */
+static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
+ int lower_only, unsigned int w) {
+ rep_ele *el, *tmp, *prev;
+ char *cp1, *cp2, *cp_end;
+ int i;
+
+ // Already handled this in previous overlap?
+ // (the tail element spans both copies of the unit that end at pos)
+ if (*list) {
+ tmp = DL_TAIL(*list);
+ if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
+ return;
+ }
+
+ // Find current and last occurrence of repeated word.
+
+ // cp2: first byte after the detected repeat
+ cp2 = &cons[pos+1];
+ // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
+
+ // cp1: step back rlen-1 non-pad bases from pos, i.e. one unit behind cp2
+ for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads
+ if (*cp1 == '*')
+ continue;
+ else
+ i++;
+ while (*cp1 == '*')
+ cp1--;
+
+
+ // Scan ahead to see how much further it goes.
+ // The comparison is byte-for-byte; pads are not skipped in this loop.
+ cp_end = &cons[clen];
+ while (cp2 < cp_end) {
+ if (*cp1 != *cp2)
+ break;
+
+ w<<=2;
+ w|=*cp2;
+ cp1++;
+ cp2++;
+ }
+
+ if (!(el = malloc(sizeof(*el))))
+ return;
+
+ // end: pos plus the number of bytes matched by the scan above
+ el->end = pos + cp2-&cons[pos+1];
+ el->rep_len = rlen;
+ // start: walk back 2*rlen non-pad bases (two unit copies) from pos+1;
+ // each inner loop consumes one base plus any pads preceding it
+ pos++;
+ while (rlen--) {
+ while (cons[--pos] == '*');
+ while (cons[--pos] == '*');
+ }
+ //pos++;
+ // extend the start leftwards over pads directly before the repeat
+ while (pos > 1 && cons[pos-1] == '*') pos--;
+ el->start = pos;
+
+ // Check it meets the lower-case only criteria
+ if (lower_only) {
+ int lc = 0;
+ for (i = el->start; i <= el->end; i++) {
+ if (islower(cons[i])) {
+ lc = 1;
+ break;
+ }
+ }
+
+ if (!lc) {
+ free(el);
+ return;
+ }
+ }
+
+ // Remove any older items on the list that are entirely contained within el
+ // Walks backwards from the tail and stops at the first element that ends
+ // before el starts. prev is saved up front because tmp may be freed.
+ if (*list) {
+ tmp = DL_TAIL(*list);
+ do {
+ prev = tmp->prev;
+ if (tmp->end < el->start)
+ break;
+
+ if (tmp->start >= el->start) {
+ DL_DELETE(*list, tmp);
+ free(tmp);
+ }
+
+ // NOTE(review): tmp may already be freed here; only its pointer value
+ // is compared, and after deleting the old head this test cannot match
+ // - confirm the resulting extra pass over the list is intended.
+ if (tmp == DL_HEAD(*list))
+ break;
+ tmp = prev;
+ } while (*list);
+ }
+
+ DL_APPEND(*list, el);
+
+ return;
+}
+
+/*
+ * Finds tandem repeats with a unit length of 1 to 8 bases (homopolymers up
+ * to repeated 8-mers).
+ * Note this assumes cons is 0-3, so N of 4 may rarely give false hits.
+ *
+ * cons, len: sequence buffer and its length; '*' bytes are skipped as pads
+ * lower_only: passed through unchanged to add_rep()
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ * NULL when no repeat was recorded.
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only) {
+ int i, j;
+ // w is a rolling hash holding 2 bits per base, newest base in the low bits.
+ // The raw byte is OR-ed in, hence the 0-3 assumption in the header comment.
+ uint32_t w = 0;
+ rep_ele *reps = NULL;
+
+ // Warm-up over the first 15 non-pad bases (j counts them). A unit of k
+ // bases is tested only once 2*k bases have been hashed, and every unit
+ // length from 1 to 7 is tested independently.
+ for (i = j = 0; i < len && j < 15; i++) {
+ if (cons[i] == '*') continue;
+
+ w <<= 2;
+ w |= cons[i];
+ //printf("%3d %c w=%08x\n", i, cons[i], w);
+ // each test compares the low 2*k bits with the 2*k bits above them
+ if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003))
+ add_rep(&reps, cons, len, i, 1, lower_only, w);
+ if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f))
+ add_rep(&reps, cons, len, i, 2, lower_only, w);
+ if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
+ add_rep(&reps, cons, len, i, 3, lower_only, w);
+ if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
+ add_rep(&reps, cons, len, i, 4, lower_only, w);
+ if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
+ add_rep(&reps, cons, len, i, 5, lower_only, w);
+ if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
+ add_rep(&reps, cons, len, i, 6, lower_only, w);
+ if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
+ add_rep(&reps, cons, len, i, 7, lower_only, w);
+
+ j++;
+ }
+
+ // Remainder of the sequence: unit lengths are tried from 8 down to 1 in
+ // an else-if chain, so at most one repeat (the longest matching unit) is
+ // reported per position.
+ for (; i < len; i++) {
+ if (cons[i] == '*') continue;
+
+ w <<= 2;
+ w |= cons[i];
+ //printf("%3d %c w=%08x\n", i, cons[i], w);
+ if ((w&0xffff) == ((w>>16)&0xffff))
+ add_rep(&reps, cons, len, i, 8, lower_only, w);
+ else if ((w&0x3fff) == ((w>>14)&0x3fff))
+ add_rep(&reps, cons, len, i, 7, lower_only, w);
+ else if ((w&0x0fff) == ((w>>12)&0x0fff))
+ add_rep(&reps, cons, len, i, 6, lower_only, w);
+ else if ((w&0x03ff) == ((w>>10)&0x03ff))
+ add_rep(&reps, cons, len, i, 5, lower_only, w);
+ else if ((w&0x00ff) == ((w>> 8)&0x00ff))
+ add_rep(&reps, cons, len, i, 4, lower_only, w);
+ else if ((w&0x003f) == ((w>> 6)&0x003f))
+ add_rep(&reps, cons, len, i, 3, lower_only, w);
+ else if ((w&0x000f) == ((w>> 4)&0x000f))
+ add_rep(&reps, cons, len, i, 2, lower_only, w);
+ else if ((w&0x0003) == ((w>> 2)&0x0003))
+ add_rep(&reps, cons, len, i, 1, lower_only, w);
+ }
+
+ return reps;
+}
+
+/* -----------------------------------------------------------------------------
+ * Computes repeat regions in the consensus and then provides a bit mask
+ * indicating the extent of the STRs.
+ *
+ * The purpose of this is to identify where a read needs to span the entire
+ * region in order to validate how many copies of a repeat word are present.
+ * This only really has a major impact when indels are involved.
+ *
+ * For example, given this multiple alignment:
+ *
+ * S1 GATCGGACGAGAG
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              AGAGAGAGTCGGAC
+ *
+ * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
+ * (S1 and S6) sequences do not span and so we do not know which allele they
+ * match. Specifically as the pad is at the right hand end, the alignment of
+ * S6 gives incorrect weight to the consensus as it is stating AG when it
+ * may actually be ** at that point.
+ *
+ * By identifying the repeats we can soft clip as follows:
+ *
+ * S1 GATCGGACgagag
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              agagagagTCGGAC
+ *
+ * Returns an array of STR vs no-STR values.
+ * 0 => non repetitive.
+ * 1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg: AGGGGAGGAGAAGAC
+ * 1111 1111
+ * 2222222
+ * 444444
+ * => 011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only) {
+    /*
+     * cons, len:  sequence to scan and its length
+     * lower_only: passed through unchanged to find_STR()
+     *
+     * Returns a calloc'ed array of len bytes which the caller must free,
+     * or NULL if that array could not be allocated.
+     */
+    rep_ele *reps, *elt, *tmp;
+    char *str;
+
+    // Allocate the mask before building the repeat list, so running out
+    // of memory neither dereferences NULL below nor leaks the list.
+    str = calloc(1, len);
+    if (!str)
+        return NULL;
+
+    reps = find_STR(cons, len, lower_only);
+
+    DL_FOREACH_SAFE(reps, elt, tmp) {
+        int i, v = 0;
+
+        //printf("%2d .. %2d %.*s\n", elt->start, elt->end,
+        //       elt->end - elt->start+1, &cons[elt->start]);
+
+        // What is there?
+        // Collect the bits already used in the repeat and in the one byte
+        // either side of it, clamped to the array bounds.
+        for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
+            v |= str[i];
+
+        // Pick the lowest of the 8 bits that is not yet in use there
+        for (i = 0; i < 8; i++) {
+            if (!(v&(1<<i)))
+                break;
+        }
+        v = (i == 8) ? 1 : (1<<i);
+
+        // Add new if available, or just overload 1 if not
+        for (i = elt->start; i <= elt->end; i++)
+            str[i] |= v;
+
+        // The list is consumed as it is traversed
+        DL_DELETE(reps, elt);
+        free(elt);
+    }
+
+    return str;
+}
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* str_finder.c -- Short Tandem Repeat finder.
+ Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+ Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include "str_finder.h"
+#include "utlist.h"
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+typedef unsigned char uc;
+
+/*
+ * Records one tandem repeat on the repeat list.
+ *
+ * list: repeat list; the new element is appended to it
+ * cons, clen: sequence buffer and its length; '*' bytes are treated as pads
+ * pos: index in cons at which the caller detected the repeat
+ * (presumably the base completing the second copy - confirm in find_STR)
+ * rlen: length of the repeat unit, counted in non-pad bases
+ * lower_only: if non-zero, the repeat is kept only when at least one byte
+ * of cons[start..end] satisfies islower()
+ * w: caller's rolling hash; passed by value, so updates stay local
+ *
+ * Nothing is added when the tail of the list already covers this repeat,
+ * when malloc fails, or when the lower_only test rejects it.
+ */
+static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
+ int lower_only, unsigned int w) {
+ rep_ele *el, *tmp, *prev;
+ char *cp1, *cp2, *cp_end;
+ int i;
+
+ // Already handled this in previous overlap?
+ // (the tail element spans both copies of the unit that end at pos)
+ if (*list) {
+ tmp = DL_TAIL(*list);
+ if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
+ return;
+ }
+
+ // Find current and last occurrence of repeated word.
+
+ // cp2: first byte after the detected repeat
+ cp2 = &cons[pos+1];
+ // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
+
+ // cp1: step back rlen-1 non-pad bases from pos, i.e. one unit behind cp2
+ for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads
+ if (*cp1 == '*')
+ continue;
+ else
+ i++;
+ while (*cp1 == '*')
+ cp1--;
+
+
+ // Scan ahead to see how much further it goes.
+ // The comparison is byte-for-byte; pads are not skipped in this loop.
+ cp_end = &cons[clen];
+ while (cp2 < cp_end) {
+ if (*cp1 != *cp2)
+ break;
+
+ w<<=2;
+ w|=*cp2;
+ cp1++;
+ cp2++;
+ }
+
+ if (!(el = malloc(sizeof(*el))))
+ return;
+
+ // end: pos plus the number of bytes matched by the scan above
+ el->end = pos + cp2-&cons[pos+1];
+ el->rep_len = rlen;
+ // start: walk back 2*rlen non-pad bases (two unit copies) from pos+1;
+ // each inner loop consumes one base plus any pads preceding it
+ pos++;
+ while (rlen--) {
+ while (cons[--pos] == '*');
+ while (cons[--pos] == '*');
+ }
+ //pos++;
+ // extend the start leftwards over pads directly before the repeat
+ while (pos > 1 && cons[pos-1] == '*') pos--;
+ el->start = pos;
+
+ // Check it meets the lower-case only criteria
+ if (lower_only) {
+ int lc = 0;
+ for (i = el->start; i <= el->end; i++) {
+ if (islower(cons[i])) {
+ lc = 1;
+ break;
+ }
+ }
+
+ if (!lc) {
+ free(el);
+ return;
+ }
+ }
+
+ // Remove any older items on the list that are entirely contained within el
+ // Walks backwards from the tail and stops at the first element that ends
+ // before el starts. prev is saved up front because tmp may be freed.
+ if (*list) {
+ tmp = DL_TAIL(*list);
+ do {
+ prev = tmp->prev;
+ if (tmp->end < el->start)
+ break;
+
+ if (tmp->start >= el->start) {
+ DL_DELETE(*list, tmp);
+ free(tmp);
+ }
+
+ // NOTE(review): tmp may already be freed here; only its pointer value
+ // is compared, and after deleting the old head this test cannot match
+ // - confirm the resulting extra pass over the list is intended.
+ if (tmp == DL_HEAD(*list))
+ break;
+ tmp = prev;
+ } while (*list);
+ }
+
+ DL_APPEND(*list, el);
+
+ return;
+}
+
+/*
+ * Scans cons[0..len-1] for tandem repeats with unit lengths of 1 to 8,
+ * skipping '*' pad characters.
+ *
+ * Every non-pad character is shifted into a rolling word two bits at a
+ * time.  A repeat of unit length k is reported when the low 2k bits of the
+ * word equal the 2k bits directly above them.
+ * Note this assumes cons is 0-3, so N of 4 may rarely give false hits.
+ *
+ * lower_only is passed through unchanged to add_rep().
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ * NULL if no element was added.
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only) {
+    int pos = 0, nseen = 0, unit;
+    uint32_t w = 0;
+    rep_ele *reps = NULL;
+
+    // Warm-up: until 15 non-pad bases have been consumed, test only unit
+    // lengths whose two copies fit in what has been read so far, and
+    // report every matching length from shortest (1) to longest (7).
+    for (; pos < len && nseen < 15; pos++) {
+        if (cons[pos] == '*')
+            continue;
+
+        w <<= 2;
+        w |= cons[pos];
+        for (unit = 1; unit <= 7; unit++) {
+            uint32_t mask = (1u << (2*unit)) - 1;
+            if (nseen >= 2*unit-1 && (w & mask) == ((w >> (2*unit)) & mask))
+                add_rep(&reps, cons, len, pos, unit, lower_only, w);
+        }
+
+        nseen++;
+    }
+
+    // Steady state: try unit lengths from 8 down to 1 and report only the
+    // first (longest) one that matches at this position.
+    for (; pos < len; pos++) {
+        if (cons[pos] == '*')
+            continue;
+
+        w <<= 2;
+        w |= cons[pos];
+        for (unit = 8; unit >= 1; unit--) {
+            uint32_t mask = (1u << (2*unit)) - 1;
+            if ((w & mask) == ((w >> (2*unit)) & mask)) {
+                add_rep(&reps, cons, len, pos, unit, lower_only, w);
+                break;
+            }
+        }
+    }
+
+    return reps;
+}
+
+/* -----------------------------------------------------------------------------
+ * Computes repeat regions in the consensus and then provides a bit mask
+ * indicating the extent of the STRs.
+ *
+ * The purpose of this is to identify where a read needs to span the entire
+ * region in order to validate how many copies of a repeat word are present.
+ * This only really has a major impact when indels are involved.
+ *
+ * For example, given this multiple alignment:
+ *
+ * S1 GATCGGACGAGAG
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4 GGACGAGAGAGAGAGAGTCGGAC
+ * S5 CGAGAGAGAGAG**TCGGAC
+ * S6 AGAGAGAGTCGGAC
+ *
+ * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
+ * (S1 and S6) sequences do not span and so we do not know which allele they
+ * match. Specifically as the pad is at the right hand end, the alignment of
+ * S6 gives incorrect weight to the consensus as it is stating AG when it
+ * may actually be ** at that point.
+ *
+ * By identifying the repeats we can soft clip as follows:
+ *
+ * S1 GATCGGACgagag
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4 GGACGAGAGAGAGAGAGTCGGAC
+ * S5 CGAGAGAGAGAG**TCGGAC
+ * S6 agagagagTCGGAC
+ *
+ * Returns an array of STR vs no-STR values.
+ * 0 => non repetitive.
+ * 1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg: AGGGGAGGAGAAGAC
+ * 1111 1111
+ * 2222222
+ * 444444
+ * => 011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only) {
+    rep_ele *reps, *elt, *tmp;
+    char *str;
+
+    // One zero-initialised mask byte per consensus position (0 => no STR).
+    // The buffer is returned to the caller and is not freed here.
+    // Return NULL if the allocation fails instead of writing through a
+    // NULL pointer in the loops below.
+    str = calloc(1, len);
+    if (!str)
+        return NULL;
+
+    reps = find_STR(cons, len, lower_only);
+
+    // Each repeat element is consumed (unlinked and freed) as it is marked.
+    DL_FOREACH_SAFE(reps, elt, tmp) {
+        int i, v = 0;
+
+        //printf("%2d .. %2d %.*s\n", elt->start, elt->end,
+        //       elt->end - elt->start+1, &cons[elt->start]);
+
+        // What is there?  Collect the bits already set on this span and on
+        // the one position either side of it (clamped to the buffer).
+        for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
+            v |= str[i];
+
+        // Find the lowest of the 8 bits that is not yet in use there.
+        for (i = 0; i < 8; i++) {
+            if (!(v&(1<<i)))
+                break;
+        }
+        v = (i == 8) ? 1 : (1<<i);
+
+        // Add new if available, or just overload 1 if not
+        for (i = elt->start; i <= elt->end; i++)
+            str[i] |= v;
+
+        DL_DELETE(reps, elt);
+        free(elt);
+    }
+
+    return str;
+}
--- /dev/null
+/* str_finder.h -- Short Tandem Repeat finder.
+ Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+ Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef _STR_FINDER_H_
+#define _STR_FINDER_H_
+
+#include "utlist.h"
+/* One repeat found by find_STR(), kept as a node of a utlist DL_* list. */
+typedef struct rep_ele {
+ int start, end, rep_len; /* first and last index of the repeat (inclusive), and the repeat unit length */
+ struct rep_ele *prev; /* link used by the utlist DL_* macros */
+ struct rep_ele *next; /* link used by the utlist DL_* macros */
+} rep_ele;
+
+/*
+ * Finds repeated homopolymers up to 8-mers.
+ *
+ * If lower_only is true then it only adds STRs for regions that
+ * contain at least one lower-case base. This can be used as a marker
+ * for looking for specific types of repeats.
+ * (One use for this is to only mark STRs that overlap a heterozygous
+ * indel region.)
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ * NULL on failure.
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only);
+
+/*
+ * Returns an array of STR vs no-STR values.
+ * 0 => non repetitive.
+ * 1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg: AGGGGAGGAGAAGAC
+ * 1111 1111
+ * 2222222
+ * 444444
+ * => 011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only);
+
+#endif /* _STR_FINDER_H_ */
--- /dev/null
+/*
+Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef UTLIST_H
+#define UTLIST_H
+
+#define UTLIST_VERSION 1.9.9
+
+#include <assert.h>
+
+/*
+ * This file contains macros to manipulate singly and doubly-linked lists.
+ *
+ * 1. LL_ macros: singly-linked lists.
+ * 2. DL_ macros: doubly-linked lists.
+ * 3. CDL_ macros: circular doubly-linked lists.
+ *
+ * To use singly-linked lists, your structure must have a "next" pointer.
+ * To use doubly-linked lists, your structure must have "prev" and "next" pointers.
+ * Either way, the pointer to the head of the list must be initialized to NULL.
+ *
+ * ----------------.EXAMPLE -------------------------
+ * struct item {
+ * int id;
+ * struct item *prev, *next;
+ * };
+ *
+ * struct item *list = NULL;
+ *
+ * int main() {
+ * struct item *item;
+ * ... allocate and populate item ...
+ * DL_APPEND(list, item);
+ * }
+ * --------------------------------------------------
+ *
+ * For doubly-linked lists, the append and delete macros are O(1)
+ * For singly-linked lists, append and delete are O(n) but prepend is O(1)
+ * The sort macro is O(n log(n)) for all types of single/double/circular lists.
+ */
+
+/* These macros use decltype or the earlier __typeof GNU extension.
+ As decltype is only available in newer compilers (VS2010 or gcc 4.3+
+ when compiling c++ code), this code uses whatever method is needed
+ or, for VS2008 where neither is available, uses casting workarounds. */
+#ifdef _MSC_VER /* MS compiler */
+#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */
+#define LDECLTYPE(x) decltype(x)
+#else /* VS2008 or older (or VS2010 in C mode) */
+#define NO_DECLTYPE
+#define LDECLTYPE(x) char*
+#endif
+#elif defined(__ICCARM__)
+#define NO_DECLTYPE
+#define LDECLTYPE(x) char*
+#else /* GNU, Sun and other compilers */
+#define LDECLTYPE(x) __typeof(x)
+#endif
+
+/* for VS2008 we use some workarounds to get around the lack of decltype,
+ * namely, we always reassign our tmp variable to the list head if we need
+ * to dereference its prev/next pointers, and save/restore the real head.*/
+#ifdef NO_DECLTYPE
+#define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); }
+#define _NEXT(elt,list,next) ((char*)((list)->next))
+#define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); }
+/* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */
+#define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); }
+#define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; }
+#define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); }
+#else
+#define _SV(elt,list)
+#define _NEXT(elt,list,next) ((elt)->next)
+#define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to)
+/* #define _PREV(elt,list,prev) ((elt)->prev) */
+#define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to)
+#define _RS(list)
+#define _CASTASGN(a,b) (a)=(b)
+#endif
+
+/******************************************************************************
+ * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort *
+ * Unwieldy variable names used here to avoid shadowing passed-in variables. *
+ *****************************************************************************/
+#define LL_SORT(list, cmp) \
+ LL_SORT2(list, cmp, next)
+
+#define LL_SORT2(list, cmp, next) \
+do { \
+ LDECLTYPE(list) _ls_p; \
+ LDECLTYPE(list) _ls_q; \
+ LDECLTYPE(list) _ls_e; \
+ LDECLTYPE(list) _ls_tail; \
+ int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
+ if (list) { \
+ _ls_insize = 1; \
+ _ls_looping = 1; \
+ while (_ls_looping) { \
+ _CASTASGN(_ls_p,list); \
+ list = NULL; \
+ _ls_tail = NULL; \
+ _ls_nmerges = 0; \
+ while (_ls_p) { \
+ _ls_nmerges++; \
+ _ls_q = _ls_p; \
+ _ls_psize = 0; \
+ for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
+ _ls_psize++; \
+ _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \
+ if (!_ls_q) break; \
+ } \
+ _ls_qsize = _ls_insize; \
+ while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \
+ if (_ls_psize == 0) { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ } else if (_ls_qsize == 0 || !_ls_q) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ } else if (cmp(_ls_p,_ls_q) <= 0) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ } else { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ } \
+ if (_ls_tail) { \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
+ } else { \
+ _CASTASGN(list,_ls_e); \
+ } \
+ _ls_tail = _ls_e; \
+ } \
+ _ls_p = _ls_q; \
+ } \
+ if (_ls_tail) { \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \
+ } \
+ if (_ls_nmerges <= 1) { \
+ _ls_looping=0; \
+ } \
+ _ls_insize *= 2; \
+ } \
+ } \
+} while (0)
+
+
+#define DL_SORT(list, cmp) \
+ DL_SORT2(list, cmp, prev, next)
+
+#define DL_SORT2(list, cmp, prev, next) \
+do { \
+ LDECLTYPE(list) _ls_p; \
+ LDECLTYPE(list) _ls_q; \
+ LDECLTYPE(list) _ls_e; \
+ LDECLTYPE(list) _ls_tail; \
+ int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
+ if (list) { \
+ _ls_insize = 1; \
+ _ls_looping = 1; \
+ while (_ls_looping) { \
+ _CASTASGN(_ls_p,list); \
+ list = NULL; \
+ _ls_tail = NULL; \
+ _ls_nmerges = 0; \
+ while (_ls_p) { \
+ _ls_nmerges++; \
+ _ls_q = _ls_p; \
+ _ls_psize = 0; \
+ for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
+ _ls_psize++; \
+ _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \
+ if (!_ls_q) break; \
+ } \
+ _ls_qsize = _ls_insize; \
+ while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \
+ if (_ls_psize == 0) { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ } else if (_ls_qsize == 0 || !_ls_q) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ } else if (cmp(_ls_p,_ls_q) <= 0) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ } else { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ } \
+ if (_ls_tail) { \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
+ } else { \
+ _CASTASGN(list,_ls_e); \
+ } \
+ _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \
+ _ls_tail = _ls_e; \
+ } \
+ _ls_p = _ls_q; \
+ } \
+ _CASTASGN(list->prev, _ls_tail); \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \
+ if (_ls_nmerges <= 1) { \
+ _ls_looping=0; \
+ } \
+ _ls_insize *= 2; \
+ } \
+ } \
+} while (0)
+
+/* The head is the list pointer itself; the tail is reached through head->prev (NULL for an empty list). */
+#define DL_HEAD(list) (list)
+#define DL_TAIL(list) ((list) ? (list)->prev : NULL)
+
+#define CDL_SORT(list, cmp) \
+ CDL_SORT2(list, cmp, prev, next)
+
+#define CDL_SORT2(list, cmp, prev, next) \
+do { \
+ LDECLTYPE(list) _ls_p; \
+ LDECLTYPE(list) _ls_q; \
+ LDECLTYPE(list) _ls_e; \
+ LDECLTYPE(list) _ls_tail; \
+ LDECLTYPE(list) _ls_oldhead; \
+ LDECLTYPE(list) _tmp; \
+ int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \
+ if (list) { \
+ _ls_insize = 1; \
+ _ls_looping = 1; \
+ while (_ls_looping) { \
+ _CASTASGN(_ls_p,list); \
+ _CASTASGN(_ls_oldhead,list); \
+ list = NULL; \
+ _ls_tail = NULL; \
+ _ls_nmerges = 0; \
+ while (_ls_p) { \
+ _ls_nmerges++; \
+ _ls_q = _ls_p; \
+ _ls_psize = 0; \
+ for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \
+ _ls_psize++; \
+ _SV(_ls_q,list); \
+ if (_NEXT(_ls_q,list,next) == _ls_oldhead) { \
+ _ls_q = NULL; \
+ } else { \
+ _ls_q = _NEXT(_ls_q,list,next); \
+ } \
+ _RS(list); \
+ if (!_ls_q) break; \
+ } \
+ _ls_qsize = _ls_insize; \
+ while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \
+ if (_ls_psize == 0) { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \
+ } else if (_ls_qsize == 0 || !_ls_q) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \
+ } else if (cmp(_ls_p,_ls_q) <= 0) { \
+ _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \
+ _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \
+ if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \
+ } else { \
+ _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \
+ _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \
+ if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \
+ } \
+ if (_ls_tail) { \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \
+ } else { \
+ _CASTASGN(list,_ls_e); \
+ } \
+ _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \
+ _ls_tail = _ls_e; \
+ } \
+ _ls_p = _ls_q; \
+ } \
+ _CASTASGN(list->prev,_ls_tail); \
+ _CASTASGN(_tmp,list); \
+ _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list); \
+ if (_ls_nmerges <= 1) { \
+ _ls_looping=0; \
+ } \
+ _ls_insize *= 2; \
+ } \
+ } \
+} while (0)
+
+/******************************************************************************
+ * singly linked list macros (non-circular) *
+ *****************************************************************************/
+#define LL_PREPEND(head,add) \
+ LL_PREPEND2(head,add,next)
+
+#define LL_PREPEND2(head,add,next) \
+do { \
+ (add)->next = head; \
+ head = add; \
+} while (0)
+
+#define LL_CONCAT(head1,head2) \
+ LL_CONCAT2(head1,head2,next)
+
+#define LL_CONCAT2(head1,head2,next) \
+do { \
+ LDECLTYPE(head1) _tmp; \
+ if (head1) { \
+ _tmp = head1; \
+ while (_tmp->next) { _tmp = _tmp->next; } \
+ _tmp->next=(head2); \
+ } else { \
+ (head1)=(head2); \
+ } \
+} while (0)
+
+#define LL_APPEND(head,add) \
+ LL_APPEND2(head,add,next)
+
+#define LL_APPEND2(head,add,next) \
+do { \
+ LDECLTYPE(head) _tmp; \
+ (add)->next=NULL; \
+ if (head) { \
+ _tmp = head; \
+ while (_tmp->next) { _tmp = _tmp->next; } \
+ _tmp->next=(add); \
+ } else { \
+ (head)=(add); \
+ } \
+} while (0)
+
+#define LL_DELETE(head,del) \
+ LL_DELETE2(head,del,next)
+
+#define LL_DELETE2(head,del,next) \
+do { \
+ LDECLTYPE(head) _tmp; \
+ if ((head) == (del)) { \
+ (head)=(head)->next; \
+ } else { \
+ _tmp = head; \
+ while (_tmp->next && (_tmp->next != (del))) { \
+ _tmp = _tmp->next; \
+ } \
+ if (_tmp->next) { \
+ _tmp->next = ((del)->next); \
+ } \
+ } \
+} while (0)
+
+/* Here are VS2008 replacements for LL_APPEND and LL_DELETE */
+#define LL_APPEND_VS2008(head,add) \
+ LL_APPEND2_VS2008(head,add,next)
+
+#define LL_APPEND2_VS2008(head,add,next) \
+do { \
+ if (head) { \
+ (add)->next = head; /* use add->next as a temp variable */ \
+ while ((add)->next->next) { (add)->next = (add)->next->next; } \
+ (add)->next->next=(add); \
+ } else { \
+ (head)=(add); \
+ } \
+ (add)->next=NULL; \
+} while (0)
+
+#define LL_DELETE_VS2008(head,del) \
+ LL_DELETE2_VS2008(head,del,next)
+
+#define LL_DELETE2_VS2008(head,del,next) \
+do { \
+ if ((head) == (del)) { \
+ (head)=(head)->next; \
+ } else { \
+ char *_tmp = (char*)(head); \
+ while ((head)->next && ((head)->next != (del))) { \
+ head = (head)->next; \
+ } \
+ if ((head)->next) { \
+ (head)->next = ((del)->next); \
+ } \
+ { \
+ char **_head_alias = (char**)&(head); \
+ *_head_alias = _tmp; \
+ } \
+ } \
+} while (0)
+#ifdef NO_DECLTYPE
+#undef LL_APPEND
+#define LL_APPEND LL_APPEND_VS2008
+#undef LL_DELETE
+#define LL_DELETE LL_DELETE_VS2008
+#undef LL_DELETE2
+#define LL_DELETE2 LL_DELETE2_VS2008
+#undef LL_APPEND2
+#define LL_APPEND2 LL_APPEND2_VS2008
+#undef LL_CONCAT /* no LL_CONCAT_VS2008 */
+#undef DL_CONCAT /* no DL_CONCAT_VS2008 */
+#endif
+/* end VS2008 replacements */
+
+#define LL_COUNT(head,el,counter) \
+ LL_COUNT2(head,el,counter,next) \
+
+#define LL_COUNT2(head,el,counter,next) \
+{ \
+ counter = 0; \
+ LL_FOREACH2(head,el,next){ ++counter; } \
+}
+
+#define LL_FOREACH(head,el) \
+ LL_FOREACH2(head,el,next)
+
+#define LL_FOREACH2(head,el,next) \
+ for(el=head;el;el=(el)->next)
+
+#define LL_FOREACH_SAFE(head,el,tmp) \
+ LL_FOREACH_SAFE2(head,el,tmp,next)
+
+#define LL_FOREACH_SAFE2(head,el,tmp,next) \
+ for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp)
+
+#define LL_SEARCH_SCALAR(head,out,field,val) \
+ LL_SEARCH_SCALAR2(head,out,field,val,next)
+
+#define LL_SEARCH_SCALAR2(head,out,field,val,next) \
+do { \
+ LL_FOREACH2(head,out,next) { \
+ if ((out)->field == (val)) break; \
+ } \
+} while(0)
+
+#define LL_SEARCH(head,out,elt,cmp) \
+ LL_SEARCH2(head,out,elt,cmp,next)
+
+#define LL_SEARCH2(head,out,elt,cmp,next) \
+do { \
+ LL_FOREACH2(head,out,next) { \
+ if ((cmp(out,elt))==0) break; \
+ } \
+} while(0)
+
+#define LL_REPLACE_ELEM(head, el, add) \
+do { \
+ LDECLTYPE(head) _tmp; \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ (add)->next = (el)->next; \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ } else { \
+ _tmp = head; \
+ while (_tmp->next && (_tmp->next != (el))) { \
+ _tmp = _tmp->next; \
+ } \
+ if (_tmp->next) { \
+ _tmp->next = (add); \
+ } \
+ } \
+} while (0)
+
+#define LL_PREPEND_ELEM(head, el, add) \
+do { \
+ LDECLTYPE(head) _tmp; \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ (add)->next = (el); \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ } else { \
+ _tmp = head; \
+ while (_tmp->next && (_tmp->next != (el))) { \
+ _tmp = _tmp->next; \
+ } \
+ if (_tmp->next) { \
+ _tmp->next = (add); \
+ } \
+ } \
+} while (0) \
+
+
+/******************************************************************************
+ * doubly linked list macros (non-circular) *
+ * *
+ * In these lists head->prev points at the tail element, while the tail's *
+ * next pointer is NULL. *
+ * *
+ * DL_PREPEND inserts add in front of head and makes it the new head. *
+ *****************************************************************************/
+#define DL_PREPEND(head,add) \
+ DL_PREPEND2(head,add,prev,next)
+
+#define DL_PREPEND2(head,add,prev,next) \
+do { \
+ (add)->next = head; \
+ if (head) { \
+ (add)->prev = (head)->prev; /* the new head takes over the tail pointer */ \
+ (head)->prev = (add); \
+ } else { \
+ (add)->prev = (add); /* a single element is its own tail */ \
+ } \
+ (head) = (add); \
+} while (0)
+/* DL_APPEND: link add after the current tail (head->prev) and make it the new tail. */
+#define DL_APPEND(head,add) \
+ DL_APPEND2(head,add,prev,next)
+
+#define DL_APPEND2(head,add,prev,next) \
+do { \
+ if (head) { \
+ (add)->prev = (head)->prev; \
+ (head)->prev->next = (add); \
+ (head)->prev = (add); \
+ (add)->next = NULL; \
+ } else { \
+ (head)=(add); /* empty list: add becomes both head and tail */ \
+ (head)->prev = (head); \
+ (head)->next = NULL; \
+ } \
+} while (0)
+/* DL_CONCAT: append the whole list head2 to the end of head1; head2 is left
+ * pointing at its (now interior) first element. Nothing happens if head2 is
+ * NULL; if head1 is NULL it simply becomes head2.
+ * NOTE(review): the "#undef DL_CONCAT" in the VS2008 section sits earlier in
+ * this file than this definition, so it does not remove it - confirm intent.
+ */
+#define DL_CONCAT(head1,head2) \
+ DL_CONCAT2(head1,head2,prev,next)
+
+#define DL_CONCAT2(head1,head2,prev,next) \
+do { \
+ LDECLTYPE(head1) _tmp; \
+ if (head2) { \
+ if (head1) { \
+ _tmp = (head2)->prev; /* tail of head2 */ \
+ (head2)->prev = (head1)->prev; \
+ (head1)->prev->next = (head2); \
+ (head1)->prev = _tmp; /* becomes the tail of the joined list */ \
+ } else { \
+ (head1)=(head2); \
+ } \
+ } \
+} while (0)
+/* DL_DELETE: unlink del from the list. The element is not freed and its own
+ * prev/next fields are left as they were.
+ */
+#define DL_DELETE(head,del) \
+ DL_DELETE2(head,del,prev,next)
+
+#define DL_DELETE2(head,del,prev,next) \
+do { \
+ assert((del)->prev != NULL); \
+ if ((del)->prev == (del)) { /* del is the only element */ \
+ (head)=NULL; \
+ } else if ((del)==(head)) { /* removing the head: successor inherits the tail pointer */ \
+ (del)->next->prev = (del)->prev; \
+ (head) = (del)->next; \
+ } else { \
+ (del)->prev->next = (del)->next; \
+ if ((del)->next) { \
+ (del)->next->prev = (del)->prev; \
+ } else { /* removing the tail: head->prev must track the new tail */ \
+ (head)->prev = (del)->prev; \
+ } \
+ } \
+} while (0)
+/* DL_COUNT: set counter to the number of elements, using el as the cursor.
+ * NOTE(review): this expands to a bare { } block rather than do/while(0),
+ * and the DL_COUNT definition ends with a line continuation that splices in
+ * the following line, so that line must stay blank.
+ */
+#define DL_COUNT(head,el,counter) \
+ DL_COUNT2(head,el,counter,next) \
+
+#define DL_COUNT2(head,el,counter,next) \
+{ \
+ counter = 0; \
+ DL_FOREACH2(head,el,next){ ++counter; } \
+}
+/* DL_FOREACH: loop header that walks el from head along the next pointers.
+ * el->next is read after each pass of the body, so the body must not free el.
+ */
+#define DL_FOREACH(head,el) \
+ DL_FOREACH2(head,el,next)
+
+#define DL_FOREACH2(head,el,next) \
+ for(el=head;el;el=(el)->next)
+
+/* this version is safe for deleting the elements during iteration:
+ * the successor is saved in tmp before the loop body runs */
+#define DL_FOREACH_SAFE(head,el,tmp) \
+ DL_FOREACH_SAFE2(head,el,tmp,next)
+
+#define DL_FOREACH_SAFE2(head,el,tmp,next) \
+ for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp)
+
+/* these are identical to their singly-linked list counterparts */
+#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR
+#define DL_SEARCH LL_SEARCH
+#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2
+#define DL_SEARCH2 LL_SEARCH2
+/* DL_REPLACE_ELEM: put add in the list position currently held by el.
+ * el itself is not modified or freed. All three arguments must be non-NULL.
+ */
+#define DL_REPLACE_ELEM(head, el, add) \
+do { \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ (add)->next = (el)->next; \
+ if ((el)->next == NULL) { \
+ (add)->prev = (add); /* el was the only element */ \
+ } else { \
+ (add)->prev = (el)->prev; \
+ (add)->next->prev = (add); \
+ } \
+ } else { \
+ (add)->next = (el)->next; \
+ (add)->prev = (el)->prev; \
+ (add)->prev->next = (add); \
+ if ((el)->next == NULL) { \
+ (head)->prev = (add); /* el was the tail */ \
+ } else { \
+ (add)->next->prev = (add); \
+ } \
+ } \
+} while (0)
+/* DL_PREPEND_ELEM: insert add immediately before el; add becomes the head
+ * when el was the head. All three arguments must be non-NULL.
+ * NOTE(review): the final line ends with a line continuation that splices in
+ * the following line, so that line must stay blank.
+ */
+#define DL_PREPEND_ELEM(head, el, add) \
+do { \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ (add)->next = (el); \
+ (add)->prev = (el)->prev; \
+ (el)->prev = (add); \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ } else { \
+ (add)->prev->next = (add); \
+ } \
+} while (0) \
+
+
+/******************************************************************************
+ * circular doubly linked list macros *
+ *****************************************************************************/
+#define CDL_PREPEND(head,add) \
+ CDL_PREPEND2(head,add,prev,next)
+
+#define CDL_PREPEND2(head,add,prev,next) \
+do { \
+ if (head) { \
+ (add)->prev = (head)->prev; \
+ (add)->next = (head); \
+ (head)->prev = (add); \
+ (add)->prev->next = (add); \
+ } else { \
+ (add)->prev = (add); \
+ (add)->next = (add); \
+ } \
+(head)=(add); \
+} while (0)
+
+#define CDL_DELETE(head,del) \
+ CDL_DELETE2(head,del,prev,next)
+
+#define CDL_DELETE2(head,del,prev,next) \
+do { \
+ if ( ((head)==(del)) && ((head)->next == (head))) { \
+ (head) = 0L; \
+ } else { \
+ (del)->next->prev = (del)->prev; \
+ (del)->prev->next = (del)->next; \
+ if ((del) == (head)) (head)=(del)->next; \
+ } \
+} while (0)
+
+#define CDL_COUNT(head,el,counter) \
+ CDL_COUNT2(head,el,counter,next) \
+
+#define CDL_COUNT2(head, el, counter,next) \
+{ \
+ counter = 0; \
+ CDL_FOREACH2(head,el,next){ ++counter; } \
+}
+
+#define CDL_FOREACH(head,el) \
+ CDL_FOREACH2(head,el,next)
+
+#define CDL_FOREACH2(head,el,next) \
+ for(el=head;el;el=((el)->next==head ? 0L : (el)->next))
+
+#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \
+ CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next)
+
+#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \
+ for((el)=(head), ((tmp1)=(head)?((head)->prev):NULL); \
+ (el) && ((tmp2)=(el)->next, 1); \
+ ((el) = (((el)==(tmp1)) ? 0L : (tmp2))))
+
+#define CDL_SEARCH_SCALAR(head,out,field,val) \
+ CDL_SEARCH_SCALAR2(head,out,field,val,next)
+
+#define CDL_SEARCH_SCALAR2(head,out,field,val,next) \
+do { \
+ CDL_FOREACH2(head,out,next) { \
+ if ((out)->field == (val)) break; \
+ } \
+} while(0)
+
+#define CDL_SEARCH(head,out,elt,cmp) \
+ CDL_SEARCH2(head,out,elt,cmp,next)
+
+#define CDL_SEARCH2(head,out,elt,cmp,next) \
+do { \
+ CDL_FOREACH2(head,out,next) { \
+ if ((cmp(out,elt))==0) break; \
+ } \
+} while(0)
+
+#define CDL_REPLACE_ELEM(head, el, add) \
+do { \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ if ((el)->next == (el)) { \
+ (add)->next = (add); \
+ (add)->prev = (add); \
+ (head) = (add); \
+ } else { \
+ (add)->next = (el)->next; \
+ (add)->prev = (el)->prev; \
+ (add)->next->prev = (add); \
+ (add)->prev->next = (add); \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ } \
+ } \
+} while (0)
+
+#define CDL_PREPEND_ELEM(head, el, add) \
+do { \
+ assert(head != NULL); \
+ assert(el != NULL); \
+ assert(add != NULL); \
+ (add)->next = (el); \
+ (add)->prev = (el)->prev; \
+ (el)->prev = (add); \
+ (add)->prev->next = (add); \
+ if ((head) == (el)) { \
+ (head) = (add); \
+ } \
+} while (0) \
+
+#endif /* UTLIST_H */
+
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <strings.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#define REPLACE_ALL 1 // replace both missing and existing values
#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE 4 // do not set, just match the value -c ~ID
#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
#define MM_APPEND 1 // append, possibly multiple times
#define MM_UNIQUE 2 // append, only unique values
#define MM_AVG 4
#define MM_MIN 5
#define MM_MAX 6
+#define MM_APPEND_MISSING 7 // missing values will be transferred as well
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
char *hdr_key_src, *hdr_key_dst;
- int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+ // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
+ // or 1 on (repeated partial updates) concluded with a src=NULL call
+ int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
+ int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
int merge_method; // one of the MM_* defines
khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values
kstring_t mm_kstr;
- double
+ size_t
mm_dbl_nalloc, // the allocated size --merge-logic values array
mm_dbl_nused, // the number of used elements in the mm_dbl array
- mm_dbl_ndat, // the number of merged rows (for calculating the average)
+ mm_dbl_ndat; // the number of merged rows (for calculating the average)
+ double
*mm_dbl;
+ void *ptr;
+ int mptr, done;
}
annot_col_t;
typedef struct _args_t
{
bcf_srs_t *files;
- bcf_hdr_t *hdr, *hdr_out;
+ bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
htsFile *out_fh;
int output_type, n_threads;
bcf_sr_regions_t *tgts;
- regidx_t *tgt_idx;
+ regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
regitr_t *tgt_itr;
int tgt_is_bed;
vcmp_t *vcmp; // for matching annotation and VCF lines by allele
annot_line_t *alines; // buffered annotation lines
- int nalines, malines;
+ annot_line_t *aline_missing;
+ uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline)
+ int nalines, malines, nsrt_alines, msrt_alines;
int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present
annot_col_t *cols; // column indexes and setters
int ncols;
+ int match_id; // set iff `-c ~ID` given
char *set_ids_fmt;
convert_t *set_ids;
kstring_t tmpks;
char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
- char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
- char *merge_method_str;
+ char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
+ kstring_t merge_method_str;
int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
+ int columns_is_file, has_append_mode;
}
args_t;
for (i=0; i<line->n_info; i++)
{
bcf_info_t *inf = &line->d.info[i];
+ if ( !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
+ line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
if ( inf->vptr_free )
{
free(inf->vptr - inf->vptr_off);
}
else if ( str.l )
{
+ int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
+
if ( !args->keep_sites )
{
if ( str.s[0]=='#' && str.s[1]=='#' )
if (bcf_hdr_sync(args->hdr) < 0)
error_errno("[%s] Failed to update input header", __func__);
}
+// Getter: read the INFO string tag named by col->hdr_key_src from the annotation
+// record `rec`, resolving it against the annotation file's header (args->tgts_hdr).
+// `ptr`/`mptr` are handed straight to bcf_get_info_string() as the destination
+// buffer and its size; the return value of that call is forwarded unchanged.
+static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    int nret = bcf_get_info_string(args->tgts_hdr, rec, col->hdr_key_src, ptr, mptr);
+    return nret;
+}
+// Getter: copy the ID column of the annotation record `rec` into the caller's buffer.
+//   ptr  .. address of a char* buffer (may be grown here)
+//   mptr .. address of the buffer's allocated size in bytes (updated when grown)
+// Returns the length of the ID string, not counting the terminating NUL.
+// `args` and `col` are unused; they are present to match the getter signature.
+static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    char *str = *((char**)ptr);
+    int len = strlen(rec->d.id);
+    if ( len >= *mptr )
+    {
+        // Grow only when the current buffer cannot hold the string plus NUL, and
+        // never write through a failed allocation.
+        str = (char*) realloc(str, len+1);
+        if ( !str ) error("[%s] Failed to allocate %d bytes\n", __func__, len+1);
+        *((char**)ptr) = str;
+        // Record the new capacity only when it actually changed, so that a larger
+        // existing buffer is not under-reported to later users of ptr/mptr.
+        *mptr = len+1;
+    }
+    strcpy(str, rec->d.id);
+    return len;
+}
+// Getter: render the FILTER column of the annotation record `rec` as text.
+// Filter names are looked up in the annotation file's header (args->tgts_hdr)
+// and joined with ';'; a record with no filters yields ".".
+// The caller's buffer (*ptr, size *mptr) is wrapped in a kstring_t so the kput*
+// routines can grow it; the possibly-changed pointer and capacity are written
+// back before returning. Returns the length of the resulting string.
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    kstring_t buf;
+    buf.l = 0;
+    buf.m = *mptr;
+    buf.s = *((char**)ptr);
+
+    if ( !rec->d.n_flt )
+        kputc('.', &buf);
+    else
+    {
+        int ifl;
+        for (ifl=0; ifl<rec->d.n_flt; ifl++)
+        {
+            if ( ifl>0 ) kputc(';', &buf);     // separator between consecutive filter names
+            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[ifl]), &buf);
+        }
+    }
+
+    // hand the (possibly reallocated) buffer back to the caller
+    *mptr = buf.m;
+    *((char**)ptr) = buf.s;
+    return buf.l;
+}
static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
hts_expand(int,1,args->mtmpi,args->tmpi);
args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
- if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
- if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+ if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
if ( col->replace!=REPLACE_MISSING )
{
bcf_update_filter(args->hdr_out,line,NULL,0);
- bcf_update_filter(args->hdr_out,line,args->tmpi,1);
- return 0;
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
}
// only update missing FILTER
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
if ( !line->d.n_flt )
- bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+
return 0;
}
static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- int i;
+ int i, ret = 0;
bcf1_t *rec = (bcf1_t*) data;
if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
for (i=0; i<rec->d.n_flt; i++)
{
const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
- bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+ if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
}
- return 0;
+ return ret;
}
hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
for (i=0; i<rec->d.n_flt; i++)
args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
}
bcf_update_filter(args->hdr_out,line,NULL,0);
- bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
- return 0;
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
}
static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
+ if ( col->replace==MATCH_VALUE ) return 0;
// possible cases:
// IN ANNOT OUT ACHIEVED_BY
}
static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
+ if ( col->replace==MATCH_VALUE ) return 0;
+
bcf1_t *rec = (bcf1_t*) data;
- if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+ char *id;
+ if ( col->getter )
+ {
+ int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
+ id = (char*) col->ptr;
+ if ( nret<=0 || (nret==1 && *id=='.') ) return 0; // don't replace with "."
+ }
+ else
+ {
+ if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
+ id = rec->d.id;
+ }
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
- return bcf_update_id(args->hdr_out,line,rec->d.id);
+ return bcf_update_id(args->hdr_out,line,id);
return 0;
}
static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
als[0] = rec->d.allele[0];
int i;
for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
- bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
+ int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
free(als);
- return 0;
+ return ret;
}
static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
als[0] = line->d.allele[0];
for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
- bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
+ int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
free(als);
- return 0;
+ return ret;
}
static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
annot_line_t *tab = (annot_line_t*) data;
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:APPEND for integers
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
if ( !tab )
{
- if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
- error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n");
+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+ col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+ col->merge_method!=MM_APPEND &&
+ col->merge_method!=MM_APPEND_MISSING )
+ error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
}
int i,ntmpi = 0;
- if ( tab )
+ if ( tab ) // has data, not flushing yet
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
while ( *end )
{
- int val = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ntmpi++;
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
- args->tmpi[ntmpi-1] = val;
- str = end+1;
+ if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+ {
+ if ( col->merge_method==MM_APPEND_MISSING )
+ args->tmpi[ntmpi-1] = bcf_int32_missing;
+ else
+ ntmpi--;
+ if ( str[1]==0 ) end = str+1;
+ str += 2;
+ }
+ else
+ {
+ args->tmpi[ntmpi-1] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ str = end+1;
+ }
}
if ( col->merge_method!=MM_FIRST )
{
}
else
{
- if ( col->merge_method==MM_APPEND )
+ if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
int nori = col->mm_dbl_nused;
col->mm_dbl_nused += ntmpi;
}
}
col->mm_dbl_ndat++;
+ return 1;
}
}
- else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
ntmpi = col->mm_dbl_nused;
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
annot_line_t *tab = (annot_line_t*) data;
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:APPEND for floats
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
if ( !tab )
{
- if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
- error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n");
+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+ col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+ col->merge_method!=MM_APPEND &&
+ col->merge_method!=MM_APPEND_MISSING )
+ error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
}
int i,ntmpf = 0;
if ( tab )
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
while ( *end )
{
- double val = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ntmpf++;
hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
- args->tmpf[ntmpf-1] = val;
- str = end+1;
+ if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+ {
+ if ( col->merge_method==MM_APPEND_MISSING )
+ bcf_float_set_missing(args->tmpf[ntmpf-1]);
+ else
+ ntmpf--;
+ if ( str[1]==0 ) end = str+1;
+ str += 2;
+ }
+ else
+ {
+ args->tmpf[ntmpf-1] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ str = end+1;
+ }
}
if ( col->merge_method!=MM_FIRST )
{
col->mm_dbl_nused = ntmpf;
hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
for (i=0; i<ntmpf; i++)
- col->mm_dbl[i] = args->tmpf[i];
+ {
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i]);
+ else
+ col->mm_dbl[i] = args->tmpf[i];
+ }
}
else
{
- if ( col->merge_method==MM_APPEND )
+ if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
int nori = col->mm_dbl_nused;
col->mm_dbl_nused += ntmpf;
hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
for (i=0; i<ntmpf; i++)
- col->mm_dbl[i+nori] = args->tmpf[i];
+ {
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i+nori]);
+ else
+ col->mm_dbl[i+nori] = args->tmpf[i];
+ }
}
else
{
}
}
col->mm_dbl_ndat++;
+ return 1;
}
}
- else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
ntmpf = col->mm_dbl_nused;
hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
- for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i];
+ for (i=0; i<ntmpf; i++)
+ {
+ if ( bcf_double_is_missing(col->mm_dbl[i]) )
+ bcf_float_set_missing(args->tmpf[i]);
+ else
+ args->tmpf[i] = col->mm_dbl[i];
+ }
col->mm_dbl_nused = col->mm_dbl_ndat = 0;
}
else if ( col->merge_method==MM_AVG )
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set
}
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
- assert( ret==0 );
+ if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__);
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
}
void khash_str2int_clear_free(void *_hash)
{
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:unique for strings
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+
annot_line_t *tab = (annot_line_t*) data;
-
+
int len = 0;
if ( tab )
{
len = strlen(tab->cols[col->icol]);
if ( !len ) return 0;
- if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0;
+ if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
}
if ( col->merge_method!=MM_FIRST )
if ( data )
{
- assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE );
+ assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
if ( col->merge_method==MM_UNIQUE )
{
if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
- if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0;
+ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
}
if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
kputs(tab->cols[col->icol], &col->mm_kstr);
- return 0;
+ return 1;
}
if ( col->mm_kstr.l )
else
return 0;
- if ( !data ) // flush the line
- {
- if ( col->merge_method==MM_UNIQUE )
- khash_str2int_clear_free(col->mm_str_hash);
- col->mm_kstr.l = 0;
- }
+ // flush the line
+ if ( col->merge_method==MM_UNIQUE )
+ khash_str2int_clear_free(col->mm_str_hash);
+ col->mm_kstr.l = 0;
}
else
{
return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
- if ( ntmps < 0 ) return 0; // nothing to add
+
+ if ( col->getter )
+ col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
+ else
+ {
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
+ if ( ntmps < 0 ) return 0; // nothing to add
+ }
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
}
static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
{
}
}
return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
-
}
static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
// tab annotation file, expecting that all samples are present: sample map not needed
if ( !src ) return 0;
- int nmatch = 0, order_ok = 1;
+ int nmatch = 0;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
- if ( id!=-1 )
- {
- nmatch++;
- if ( i!=id ) order_ok = 0;
- }
+ if ( id!=-1 ) nmatch++;
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
int need_sample_map = 0;
int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+ kstring_t tmp = {0,0,0};
+ if ( args->columns_is_file )
+ {
+ int i,n;
+ char **str = hts_readlist(args->columns, args->columns_is_file, &n);
+ if ( !str ) error("Could not parse %s\n", args->columns);
+ for (i=0; i<n; i++)
+ {
+ char *ptr = str[i];
+ while ( *ptr && !isspace(*ptr) ) ptr++;
+ if ( *ptr )
+ {
+ *ptr = 0;
+ ptr++;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ if ( *ptr )
+ {
+ if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+ kputs(str[i],&args->merge_method_str);
+ kputc(':',&args->merge_method_str);
+ kputs(ptr,&args->merge_method_str);
+ }
+ }
+ if ( tmp.l ) kputc(',',&tmp);
+ kputs(str[i],&tmp);
+ free(str[i]);
+ }
+ free(str);
+ free(args->columns);
+ args->columns = tmp.s;
+ tmp.l = tmp.m = 0;
+ tmp.s = NULL;
+ }
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
- kstring_t str = {0,0,0}, tmp = {0,0,0};
+ kstring_t str = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
int icol = -1, has_fmt_str = 0;
{
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->setter = vcf_setter_ref;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
{
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->setter = vcf_setter_alt;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
}
else args->alt_idx = icol;
}
- else if ( !strcasecmp("ID",str.s) )
+ else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( str.s[0]=='~' ) replace = MATCH_VALUE;
+ if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
+ if ( replace==MATCH_VALUE ) args->match_id = icol;
+ }
+ else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column
+ {
+ if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = vcf_setter_id;
+ col->getter = vcf_getter_info_str2str;
+ str.s[2] = 0;
+ col->hdr_key_dst = strdup(str.s);
+ col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
+ int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
+ if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) )
+ error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
+ if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
+ error("Only Type=String tags can be used to annotate the ID column\n");
}
else if ( !strcasecmp("FILTER",str.s) )
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
bcf_hrec_t *hrec = tgts_hdr->hrec[j];
if ( hrec->type!=BCF_HL_FLT ) continue;
int k = bcf_hrec_find_key(hrec,"ID");
- assert( k>=0 ); // this should always be true for valid VCFs
+ if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = -1;
col->replace = replace;
col->hdr_key_src = strdup(hrec->vals[k]);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = -1;
col->replace = replace;
col->hdr_key_src = strdup(hrec->vals[k]);
col->hdr_key_dst = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
+ if ( !strcasecmp("GT",col->hdr_key_src) )
+ {
+ if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+ col->setter = vcf_setter_format_gt;
+ }
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
if ( !args->tgts_is_vcf )
{
col->icol = icol;
col->replace = replace;
col->hdr_key_src = strdup(key_src);
col->hdr_key_dst = strdup(key_dst);
- if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
+ if ( !strcasecmp("GT",key_src) )
+ {
+ if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+ col->setter = vcf_setter_format_gt;
+ }
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
else
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- int explicit_info = 0;
+ if ( replace==SET_OR_APPEND )
+ {
+ if ( args->tgts_is_vcf )
+ error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
+ " (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
+ fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
+ }
+ int explicit_src_info = 0;
+ int explicit_dst_info = 0;
char *key_dst;
if ( !strncasecmp("INFO/",str.s,5) )
{
key_dst = str.s + 5;
- explicit_info = 1;
+ explicit_dst_info = 1;
}
else
key_dst = str.s;
if ( !strncasecmp("INFO/",key_src,5) )
{
key_src += 5;
- explicit_info = 1;
+ explicit_src_info = 1;
}
else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
{
}
else
key_src = key_dst;
+
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
- if ( !hrec )
+ if ( !strcasecmp("ID",key_src) && !explicit_src_info )
{
- if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
- error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
- fprintf(stderr,"[%s] %d\n",key_src,explicit_info);
- error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname);
+ // transferring ID column into a new INFO tag
+ tmp.l = 0;
+ ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
+ }
+ else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
+ {
+ // transferring FILTER column into a new INFO tag
+ tmp.l = 0;
+ ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
+ }
+ else
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
+ if ( !hrec )
+ {
+ if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
+ char *ptr = strchr(key_src,'=');
+ if ( ptr )
+ {
+ *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
+ error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+ }
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+ }
+ tmp.l = 0;
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
}
- tmp.l = 0;
- bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
-
- args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
- annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = icol;
- col->replace = replace;
- col->hdr_key_src = strdup(key_src);
- col->hdr_key_dst = strdup(key_dst);
- col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ if ( args->tgts_is_vcf )
+ {
+ if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
+ else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
+ }
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
}
+ if ( replace==SET_OR_APPEND ) // change to Number=.
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
+ if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
+ hrec = bcf_hrec_dup(hrec);
+ int j = bcf_hrec_find_key(hrec, "Number");
+ if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
+ free(hrec->vals[j]);
+ hrec->vals[j] = strdup(".");
+ bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
+ bcf_hdr_add_hrec(args->hdr_out, hrec);
+ }
}
if ( !*se ) break;
ss = ++se;
args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
}
- if ( !args->merge_method_str ) return;
+ if ( !args->merge_method_str.l ) return;
if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
- if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n");
- char *sb = args->merge_method_str;
+ if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
+ char *sb = args->merge_method_str.s;
while ( *sb )
{
char *se = sb;
char *mm_type_str = args->tmpks.s + args->tmpks.l;
while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
if ( *mm_type_str!=':' )
- error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str);
+ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
*mm_type_str = 0;
mm_type_str++;
int mm_type = MM_FIRST;
if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
+ else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
+ else if ( !strcasecmp("append-missing",mm_type_str) )
+ {
+ mm_type = MM_APPEND_MISSING;
+ if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
+ }
else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
- else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str);
+ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
for (i=0; i<args->ncols; i++)
{
if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
- if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR )
+ if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
args->cols[i].merge_method = mm_type;
break;
if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
sb = *se ? se + 1 : se;
}
+ if ( args->has_append_mode )
+ {
+ // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
+ args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
+ int ncol = 0;
+ for (i=0; i<args->ncols; i++)
+ if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
+ if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
+ args->aline_missing->mcols = ncol;
+ args->aline_missing->ncols = ncol;
+ args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
+ for (i=0; i<ncol; i++)
+ args->aline_missing->cols[i] = strdup(".");
+ }
}
static void rename_chrs(args_t *args, char *fname)
free(map);
}
+// Rename INFO, FORMAT and FILTER annotations in the output header.
+//
+// fname: file with one mapping per line, "TYPE/old<whitespace>new", where the TYPE prefix
+//        is one of INFO, FORMAT (or FMT), FILTER, matched case-insensitively.
+//
+// Annotations which are not defined in args->hdr_out are skipped silently. A line with an
+// unknown prefix or without a new name is a fatal error.
+static void rename_annots(args_t *args, char *fname)
+{
+    int n, i;
+    char **map = hts_readlist(fname, 1, &n);
+    if ( !map ) error("Could not read: %s\n", fname);
+    for (i=0; i<n; i++)
+    {
+        // Terminate the "TYPE/old" part at the first whitespace. The cast to unsigned char
+        // keeps isspace() well defined for bytes with the high bit set.
+        char *sb = NULL, *ss = map[i];
+        while ( *ss && !isspace((unsigned char)*ss) ) ss++;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+        *ss = 0;
+        int type;
+        if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
+        else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
+        else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
+        else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
+        else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
+        int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
+        if ( id<0 ) continue;
+        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
+        if ( !hrec ) continue;   // the annotation is not present in the header
+        int j = bcf_hrec_find_key(hrec, "ID");
+        assert( j>=0 );
+
+        // Isolate the new name before touching the header record, an empty name
+        // would otherwise end up as the ID of the header line
+        ss++;
+        while ( *ss && isspace((unsigned char)*ss) ) ss++;
+        char *se = ss;
+        while ( *se && !isspace((unsigned char)*se) ) se++;
+        *se = 0;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+
+        free(hrec->vals[j]);
+        hrec->vals[j] = strdup(ss);
+        // make the header's ID dictionary entry refer to the renamed string
+        args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
+    }
+    for (i=0; i<n; i++) free(map[i]);
+    free(map);
+}
+
static void init_data(args_t *args)
{
args->hdr = args->files->readers[0].header;
// reading annots from a VCF
if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+ args->tgts_hdr = args->files->readers[1].header;
}
if ( args->columns ) init_columns(args);
if ( args->targets_fname && !args->tgts_is_vcf )
if ( !args->columns ) error("The -c option not given\n");
if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
- if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
- if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) )
+ if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
+ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
{
args->end_idx = -args->beg_idx - 1;
args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+ if ( args->rename_annots ) rename_annots(args, args->rename_annots);
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
free(args->cols[i].mm_kstr.s);
if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
free(args->cols[i].mm_dbl);
+ free(args->cols[i].ptr);
}
free(args->cols);
+ if ( args->aline_missing )
+ {
+ for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
+ free(args->aline_missing->cols);
+ free(args->aline_missing);
+ }
for (i=0; i<args->malines; i++)
{
free(args->alines[i].cols);
free(args->alines[i].line.s);
}
free(args->alines);
+ free(args->srt_alines);
if ( args->tgt_idx )
{
regidx_destroy(args->tgt_idx);
filter_destroy(args->filter);
if (args->out_fh) hts_close(args->out_fh);
free(args->sample_map);
+ free(args->merge_method_str.s);
}
static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
}
else i++;
}
-
if ( args->ref_idx==-1 && args->nalines ) return;
while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
}
}
+// Search for a string in a semicolon-separated list of fields (xx vs aa;bb).
+//
+// needle:   NUL-terminated field to look for; the caller passes a single field, i.e.
+//           a string without ';'
+// haystack: NUL-terminated list of fields separated by ';'
+//
+// Returns 1 if one of the fields in haystack equals needle exactly, 0 otherwise. A field
+// which merely starts or ends with needle is not a match, and an empty haystack never
+// matches. The fields are compared one by one so that the scan cannot step past the
+// terminating NUL, not even when needle is an empty string.
+static int str_match(char *needle, char *haystack)
+{
+    size_t len = strlen(needle);
+    const char *beg = haystack;
+    while ( *beg )
+    {
+        // delimit the current field [beg,end)
+        const char *end = beg;
+        while ( *end && *end!=';' ) end++;
+        if ( (size_t)(end-beg)==len && !memcmp(beg,needle,len) ) return 1;
+        if ( !*end ) break;
+        beg = end + 1;
+    }
+    return 0;
+}
+// Test whether two semicolon-separated lists have a field in common (xx;yy;zz vs aa;bb).
+//
+// Each field of `a` is looked up in `b` with str_match(). To hand over a NUL-terminated
+// field, the separator which follows it in `a` is overwritten with 0 for the duration of
+// the call and restored afterwards, therefore `a` must be writable.
+//
+// Returns the result of the first successful str_match() call, or 0 if no field of `a`
+// was found in `b`.
+static int strstr_match(char *a, char *b)
+{
+    char *field, *sep;
+    for (field = a; *field; field = sep + 1)
+    {
+        sep = field;
+        while ( *sep!=';' && *sep ) sep++;
+
+        char saved = *sep;
+        if ( saved==';' ) *sep = 0;
+        int found = str_match(field,b);
+        *sep = saved;
+
+        if ( found ) return found;
+        if ( !saved ) return found;     // that was the last field
+    }
+    return 0;
+}
static void annotate(args_t *args, bcf1_t *line)
{
int i, j;
args->rm[i].handler(args, line, &args->rm[i]);
int has_overlap = 0;
-
if ( args->tgt_idx )
{
+ for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
{
while ( regitr_overlap(args->tgt_itr) )
tmp->end = args->tgt_itr->end;
parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],tmp) )
+ {
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+ if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
}
has_overlap = 1;
}
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].merge_method != MM_FIRST )
- args->cols[j].setter(args,line,&args->cols[j],NULL);
+ {
+ if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
}
else if ( args->tgts )
{
- // Buffer annotation lines. When multiple ALT alleles are present in the
- // annotation file, at least one must match one of the VCF alleles.
- int len = 0;
- bcf_get_variant_types(line);
- for (i=1; i<line->n_allele; i++)
- if ( len > line->d.var[i].n ) len = line->d.var[i].n;
- int end_pos = len<0 ? line->pos - len : line->pos;
+ // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+ // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+ // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+ // for an ALT, missing value is appended instead.
+ int end_pos = line->pos + line->rlen - 1;
buffer_annot_lines(args, line, line->pos, end_pos);
+
+ args->nsrt_alines = 0;
+ hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+ if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+ error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+
+ // Find matching lines
for (i=0; i<args->nalines; i++)
{
if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
- if ( args->ref_idx != -1 )
+ if ( args->ref_idx != -1 ) // REF+ALT matching requested
{
- if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible
+ if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible
for (j=1; j<args->alines[i].nals; j++)
{
- if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "."
- if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+ int ialt;
+ if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "."
+ ialt = 0;
+ else
+ {
+ ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+ if ( ialt < 0 ) continue;
+ ialt++;
+ }
+ if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+ args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+ has_overlap = 1;
+ break;
}
- if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT
}
- break;
+ else // overlap, REF+ALT matching not requested
+ {
+ args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+ has_overlap = 1;
+ }
}
-
- if ( i<args->nalines )
+ // Sort lines if needed
+ if ( args->has_append_mode )
+ {
+ // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+ uint32_t tmp;
+ for (i=1; i<args->nsrt_alines; i++)
+ for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+ tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+ }
+ // Annotate
+ for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+ int ialt_exp = 1;
+ for (i=0; i<args->nsrt_alines; i++)
{
- // there is a matching line
+ int ialt = args->srt_alines[i] >> 16;
+ int ilin = args->srt_alines[i] & 0xffff;
+ if ( args->has_append_mode )
+ {
+ if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position
+ if ( ialt_exp < ialt )
+ {
+ // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+ while ( ialt_exp++ < ialt )
+ {
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+ if ( ret < 0 )
+ error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ }
+ }
+ }
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+ {
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+ if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ ialt_exp = ialt + 1;
+ }
+ if ( args->nsrt_alines )
+ {
+ // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+ // record was found. Otherwise the row will be left without annotation.
+ if ( args->has_append_mode && ialt_exp < line->n_allele )
+ {
+ while ( ialt_exp++ < line->n_allele )
+ {
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+ if ( ret < 0 )
+ error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ }
+ }
+ // Flush
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+ if ( ret < 0 )
+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
}
- has_overlap = i<args->nalines ? 1 : 0;
}
else if ( args->files->nreaders == 2 )
{
fprintf(stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
- fprintf(stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
- fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n");
- fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
- fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
- fprintf(stderr, " -l, --merge-logic <tag:type> merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
- fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
- fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
- fprintf(stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(stderr, " --collapse STR matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+ fprintf(stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+ fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n");
+ fprintf(stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n");
+ fprintf(stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n");
+ fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
+ fprintf(stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+ fprintf(stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n");
+ fprintf(stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+ fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n");
+ fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+ fprintf(stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+ fprintf(stderr, " --threads INT number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
exit(1);
}
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
+ args->match_id = -1;
int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"remove",required_argument,NULL,'x'},
+ {"columns-file",required_argument,NULL,'C'},
{"columns",required_argument,NULL,'c'},
+ {"rename-annots",required_argument,NULL,11},
{"rename-chrs",required_argument,NULL,1},
{"header-lines",required_argument,NULL,'h'},
{"samples",required_argument,NULL,'s'},
{"force",no_argument,NULL,'f'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
{
switch (c) {
case 'f': args->force = 1; break;
else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
else args->mark_sites = optarg;
break;
- case 'l': args->merge_method_str = optarg; break;
+ case 'l':
+ if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+ kputs(optarg,&args->merge_method_str);
+ break;
case 'I': args->set_ids_fmt = optarg; break;
case 's': args->sample_names = optarg; break;
case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
case 'c': args->columns = strdup(optarg); break;
+ case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
default: error("The output type \"%s\" not recognised\n", optarg);
};
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'x': args->remove_annots = optarg; break;
case 'a': args->targets_fname = optarg; break;
case 'r': args->regions_list = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->single_overlaps = 1; break;
+ case 11 : args->rename_annots = optarg; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <strings.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#define REPLACE_ALL 1 // replace both missing and existing values
#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE 4 // do not set, just match the value -c ~ID
#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
#define MM_APPEND 1 // append, possibly multiple times
#define MM_UNIQUE 2 // append, only unique values
#define MM_AVG 4
#define MM_MIN 5
#define MM_MAX 6
+#define MM_APPEND_MISSING 7 // missing values will be transferred as well
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
char *hdr_key_src, *hdr_key_dst;
- int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+ // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
+ // or 1 on (repeated partial updates) concluded with a src=NULL call
+ int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
+ int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
int merge_method; // one of the MM_* defines
khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values
kstring_t mm_kstr;
- double
+ size_t
mm_dbl_nalloc, // the allocated size --merge-logic values array
mm_dbl_nused, // the number of used elements in the mm_dbl array
- mm_dbl_ndat, // the number of merged rows (for calculating the average)
+ mm_dbl_ndat; // the number of merged rows (for calculating the average)
+ double
*mm_dbl;
+ void *ptr;
+ int mptr, done;
}
annot_col_t;
typedef struct _args_t
{
bcf_srs_t *files;
- bcf_hdr_t *hdr, *hdr_out;
+ bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
htsFile *out_fh;
int output_type, n_threads;
bcf_sr_regions_t *tgts;
- regidx_t *tgt_idx;
+ regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
regitr_t *tgt_itr;
int tgt_is_bed;
vcmp_t *vcmp; // for matching annotation and VCF lines by allele
annot_line_t *alines; // buffered annotation lines
- int nalines, malines;
+ annot_line_t *aline_missing;
+ uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline)
+ int nalines, malines, nsrt_alines, msrt_alines;
int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present
annot_col_t *cols; // column indexes and setters
int ncols;
+ int match_id; // set iff `-c ~ID` given
char *set_ids_fmt;
convert_t *set_ids;
kstring_t tmpks;
char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
- char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
- char *merge_method_str;
+ char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
+ kstring_t merge_method_str;
int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
+ int columns_is_file, has_append_mode;
}
args_t;
for (i=0; i<line->n_info; i++)
{
bcf_info_t *inf = &line->d.info[i];
+ if ( !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
+ line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
if ( inf->vptr_free )
{
free(inf->vptr - inf->vptr_off);
}
else if ( str.l )
{
+ int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
+
if ( !args->keep_sites )
{
if ( str.s[0]=='#' && str.s[1]=='#' )
if (bcf_hdr_sync(args->hdr) < 0)
error_errno("[%s] Failed to update input header", __func__);
}
+// Getter: fetch the string INFO tag col->hdr_key_src from the annotation record `rec`,
+// interpreted with the header args->tgts_hdr. The buffer *ptr and its size *mptr are passed
+// on to bcf_get_info_string() and its return value is handed back unchanged.
+static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    int nret = bcf_get_info_string(args->tgts_hdr, rec, col->hdr_key_src, ptr, mptr);
+    return nret;
+}
+// Getter: copy the ID of the annotation record `rec` into the caller's reusable buffer.
+//
+// ptr:  address of the buffer pointer (treated as char*), reallocated when too small
+// mptr: address of the buffer capacity in bytes, updated only when the buffer grows
+//
+// Returns the length of the ID without the terminating NUL, or 0 when the record has no
+// ID string; vcf_setter_id() treats a non-positive return value as nothing to set.
+static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    if ( !rec->d.id ) return 0;
+
+    char *str = *((char**)ptr);
+    int len = strlen(rec->d.id);
+    if ( !str || len >= *mptr )
+    {
+        str = realloc(str, len+1);
+        if ( !str ) error("Error: could not allocate %d bytes\n", len+1);
+        *((char**)ptr) = str;
+        *mptr = len+1;
+    }
+    strcpy(str, rec->d.id);
+    return len;
+}
+// Getter: render the FILTER column of the annotation record `rec` as text in the caller's
+// reusable buffer. The filter names are looked up in args->tgts_hdr and joined with ';',
+// a record with no filters yields ".". The buffer (*ptr with capacity *mptr) is wrapped in
+// a kstring_t, filled from the beginning, and the possibly changed pointer and capacity
+// are written back.
+//
+// Returns the length of the resulting string.
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    kstring_t buf;
+    buf.l = 0;
+    buf.m = *mptr;
+    buf.s = *((char**)ptr);
+
+    if ( !rec->d.n_flt )
+        kputc('.', &buf);
+    else
+    {
+        int iflt = 0;
+        while ( iflt < rec->d.n_flt )
+        {
+            if ( iflt > 0 ) kputc(';', &buf);
+            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[iflt]), &buf);
+            iflt++;
+        }
+    }
+
+    *mptr = buf.m;
+    *((char**)ptr) = buf.s;
+    return buf.l;
+}
static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
hts_expand(int,1,args->mtmpi,args->tmpi);
args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
- if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
- if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+ if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
if ( col->replace!=REPLACE_MISSING )
{
bcf_update_filter(args->hdr_out,line,NULL,0);
- bcf_update_filter(args->hdr_out,line,args->tmpi,1);
- return 0;
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
}
// only update missing FILTER
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
if ( !line->d.n_flt )
- bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+
return 0;
}
static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- int i;
+ int i, ret = 0;
bcf1_t *rec = (bcf1_t*) data;
if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
for (i=0; i<rec->d.n_flt; i++)
{
const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
- bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+ if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
}
- return 0;
+ return ret;
}
hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
for (i=0; i<rec->d.n_flt; i++)
args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
}
bcf_update_filter(args->hdr_out,line,NULL,0);
- bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
- return 0;
+ return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
}
static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
+ if ( col->replace==MATCH_VALUE ) return 0;
// possible cases:
// IN ANNOT OUT ACHIEVED_BY
}
static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
+ if ( col->replace==MATCH_VALUE ) return 0;
+
bcf1_t *rec = (bcf1_t*) data;
- if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+ char *id;
+ if ( col->getter )
+ {
+ int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
+ id = (char*) col->ptr;
+ if ( nret<=0 || (nret==1 && *id=='.') ) return 0; // don't replace with "."
+ }
+ else
+ {
+ if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
+ id = rec->d.id;
+ }
+ if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+ if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
- return bcf_update_id(args->hdr_out,line,rec->d.id);
+ return bcf_update_id(args->hdr_out,line,id);
return 0;
}
static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
als[0] = rec->d.allele[0];
int i;
for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
- bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
+ int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
free(als);
- return 0;
+ return ret;
}
static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
als[0] = line->d.allele[0];
for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
- bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
+ int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
free(als);
- return 0;
+ return ret;
}
static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
annot_line_t *tab = (annot_line_t*) data;
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:APPEND for integers
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
if ( !tab )
{
- if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
- error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n");
+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+ col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+ col->merge_method!=MM_APPEND &&
+ col->merge_method!=MM_APPEND_MISSING )
+ error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
}
int i,ntmpi = 0;
- if ( tab )
+ if ( tab ) // has data, not flushing yet
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
while ( *end )
{
- int val = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ntmpi++;
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
- args->tmpi[ntmpi-1] = val;
- str = end+1;
+ if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+ {
+ if ( col->merge_method==MM_APPEND_MISSING )
+ args->tmpi[ntmpi-1] = bcf_int32_missing;
+ else
+ ntmpi--;
+ if ( str[1]==0 ) end = str+1;
+ str += 2;
+ }
+ else
+ {
+ args->tmpi[ntmpi-1] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ str = end+1;
+ }
}
if ( col->merge_method!=MM_FIRST )
{
}
else
{
- if ( col->merge_method==MM_APPEND )
+ if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
int nori = col->mm_dbl_nused;
col->mm_dbl_nused += ntmpi;
}
}
col->mm_dbl_ndat++;
+ return 1;
}
}
- else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
ntmpi = col->mm_dbl_nused;
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
- return 0;
+ return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
annot_line_t *tab = (annot_line_t*) data;
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:APPEND for floats
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
if ( !tab )
{
- if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
- error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n");
+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+ col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+ col->merge_method!=MM_APPEND &&
+ col->merge_method!=MM_APPEND_MISSING )
+ error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
}
int i,ntmpf = 0;
if ( tab )
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
while ( *end )
{
- double val = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
ntmpf++;
hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
- args->tmpf[ntmpf-1] = val;
- str = end+1;
+ if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+ {
+ if ( col->merge_method==MM_APPEND_MISSING )
+ bcf_float_set_missing(args->tmpf[ntmpf-1]);
+ else
+ ntmpf--;
+ if ( str[1]==0 ) end = str+1;
+ str += 2;
+ }
+ else
+ {
+ args->tmpf[ntmpf-1] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+ str = end+1;
+ }
}
if ( col->merge_method!=MM_FIRST )
{
col->mm_dbl_nused = ntmpf;
hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
for (i=0; i<ntmpf; i++)
- col->mm_dbl[i] = args->tmpf[i];
+ {
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i]);
+ else
+ col->mm_dbl[i] = args->tmpf[i];
+ }
}
else
{
- if ( col->merge_method==MM_APPEND )
+ if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
int nori = col->mm_dbl_nused;
col->mm_dbl_nused += ntmpf;
hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
for (i=0; i<ntmpf; i++)
- col->mm_dbl[i+nori] = args->tmpf[i];
+ {
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i+nori]);
+ else
+ col->mm_dbl[i+nori] = args->tmpf[i];
+ }
}
else
{
}
}
col->mm_dbl_ndat++;
+ return 1;
}
}
- else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
{
ntmpf = col->mm_dbl_nused;
hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
- for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i];
+ for (i=0; i<ntmpf; i++)
+ {
+ if ( bcf_double_is_missing(col->mm_dbl[i]) )
+ bcf_float_set_missing(args->tmpf[i]);
+ else
+ args->tmpf[i] = col->mm_dbl[i];
+ }
col->mm_dbl_nused = col->mm_dbl_ndat = 0;
}
else if ( col->merge_method==MM_AVG )
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
- return 0;
+ return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue; // value already set
}
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
- assert( ret==0 );
+ if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__);
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
}
void khash_str2int_clear_free(void *_hash)
{
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
+ // This is a bit hacky, only to reuse existing code with minimal changes:
+ // -c =TAG will now behave as -l TAG:unique for strings
+ if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+
annot_line_t *tab = (annot_line_t*) data;
-
+
int len = 0;
if ( tab )
{
len = strlen(tab->cols[col->icol]);
if ( !len ) return 0;
- if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0;
+ if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
}
if ( col->merge_method!=MM_FIRST )
if ( data )
{
- assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE );
+ assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
if ( col->merge_method==MM_UNIQUE )
{
if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
- if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0;
+ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
}
if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
kputs(tab->cols[col->icol], &col->mm_kstr);
- return 0;
+ return 1;
}
if ( col->mm_kstr.l )
else
return 0;
- if ( !data ) // flush the line
- {
- if ( col->merge_method==MM_UNIQUE )
- khash_str2int_clear_free(col->mm_str_hash);
- col->mm_kstr.l = 0;
- }
+ // flush the line
+ if ( col->merge_method==MM_UNIQUE )
+ khash_str2int_clear_free(col->mm_str_hash);
+ col->mm_kstr.l = 0;
}
else
{
return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
- if ( ntmps < 0 ) return 0; // nothing to add
+
+ if ( col->getter )
+ col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
+ else
+ {
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
+ if ( ntmps < 0 ) return 0; // nothing to add
+ }
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
- return 0;
+ return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
}
static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
{
}
}
return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
-
}
static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
// tab annotation file, expecting that all samples are present: sample map not needed
if ( !src ) return 0;
- int nmatch = 0, order_ok = 1;
+ int nmatch = 0;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
- if ( id!=-1 )
- {
- nmatch++;
- if ( i!=id ) order_ok = 0;
- }
+ if ( id!=-1 ) nmatch++;
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
int need_sample_map = 0;
int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+ kstring_t tmp = {0,0,0};
+ if ( args->columns_is_file )
+ {
+ int i,n;
+ char **str = hts_readlist(args->columns, args->columns_is_file, &n);
+ if ( !str ) error("Could not parse %s\n", args->columns);
+ for (i=0; i<n; i++)
+ {
+ char *ptr = str[i];
+ while ( *ptr && !isspace(*ptr) ) ptr++;
+ if ( *ptr )
+ {
+ *ptr = 0;
+ ptr++;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ if ( *ptr )
+ {
+ if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+ kputs(str[i],&args->merge_method_str);
+ kputc(':',&args->merge_method_str);
+ kputs(ptr,&args->merge_method_str);
+ }
+ }
+ if ( tmp.l ) kputc(',',&tmp);
+ kputs(str[i],&tmp);
+ free(str[i]);
+ }
+ free(str);
+ free(args->columns);
+ args->columns = tmp.s;
+ tmp.l = tmp.m = 0;
+ tmp.s = NULL;
+ }
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
- kstring_t str = {0,0,0}, tmp = {0,0,0};
+ kstring_t str = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
int icol = -1, has_fmt_str = 0;
{
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->setter = vcf_setter_ref;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
{
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->setter = vcf_setter_alt;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
}
else args->alt_idx = icol;
}
- else if ( !strcasecmp("ID",str.s) )
+ else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( str.s[0]=='~' ) replace = MATCH_VALUE;
+ if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
+ if ( replace==MATCH_VALUE ) args->match_id = icol;
+ }
+ else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column
+ {
+ if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = vcf_setter_id;
+ col->getter = vcf_getter_info_str2str;
+ str.s[2] = 0;
+ col->hdr_key_dst = strdup(str.s);
+ col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
+ int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
+ if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) )
+ error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
+ if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
+ error("Only Type=String tags can be used to annotate the ID column\n");
}
else if ( !strcasecmp("FILTER",str.s) )
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
bcf_hrec_t *hrec = tgts_hdr->hrec[j];
if ( hrec->type!=BCF_HL_FLT ) continue;
int k = bcf_hrec_find_key(hrec,"ID");
- assert( k>=0 ); // this should always be true for valid VCFs
+ if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
tmp.l = 0;
bcf_hrec_format(hrec, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = -1;
col->replace = replace;
col->hdr_key_src = strdup(hrec->vals[k]);
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
col->icol = -1;
col->replace = replace;
col->hdr_key_src = strdup(hrec->vals[k]);
col->hdr_key_dst = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
+ if ( !strcasecmp("GT",col->hdr_key_src) )
+ {
+ if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+ col->setter = vcf_setter_format_gt;
+ }
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
if ( !args->tgts_is_vcf )
{
col->icol = icol;
col->replace = replace;
col->hdr_key_src = strdup(key_src);
col->hdr_key_dst = strdup(key_dst);
- if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
+ if ( !strcasecmp("GT",key_src) )
+ {
+ if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+ col->setter = vcf_setter_format_gt;
+ }
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
else
{
if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- int explicit_info = 0;
+ if ( replace==SET_OR_APPEND )
+ {
+ if ( args->tgts_is_vcf )
+ error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
+ " (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
+ fprintf(bcftools_stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
+ }
+ int explicit_src_info = 0;
+ int explicit_dst_info = 0;
char *key_dst;
if ( !strncasecmp("INFO/",str.s,5) )
{
key_dst = str.s + 5;
- explicit_info = 1;
+ explicit_dst_info = 1;
}
else
key_dst = str.s;
if ( !strncasecmp("INFO/",key_src,5) )
{
key_src += 5;
- explicit_info = 1;
+ explicit_src_info = 1;
}
else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
{
}
else
key_src = key_dst;
+
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
- if ( !hrec )
+ if ( !strcasecmp("ID",key_src) && !explicit_src_info )
{
- if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
- error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
- fprintf(bcftools_stderr,"[%s] %d\n",key_src,explicit_info);
- error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname);
+ // transferring ID column into a new INFO tag
+ tmp.l = 0;
+ ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
+ }
+ else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
+ {
+ // transferring FILTER column into a new INFO tag
+ tmp.l = 0;
+ ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
+ }
+ else
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
+ if ( !hrec )
+ {
+ if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
+ char *ptr = strchr(key_src,'=');
+ if ( ptr )
+ {
+ *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
+ error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+ }
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+ }
+ tmp.l = 0;
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
}
- tmp.l = 0;
- bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
if (bcf_hdr_sync(args->hdr_out) < 0)
error_errno("[%s] Failed to update header", __func__);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
-
- args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
- annot_col_t *col = &args->cols[args->ncols-1];
- col->icol = icol;
- col->replace = replace;
- col->hdr_key_src = strdup(key_src);
- col->hdr_key_dst = strdup(key_dst);
- col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+ if ( args->tgts_is_vcf )
+ {
+ if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
+ else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
+ }
+ col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
case BCF_HT_FLAG: col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
}
+ if ( replace==SET_OR_APPEND ) // change to Number=.
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
+ if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
+ hrec = bcf_hrec_dup(hrec);
+ int j = bcf_hrec_find_key(hrec, "Number");
+ if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
+ free(hrec->vals[j]);
+ hrec->vals[j] = strdup(".");
+ bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
+ bcf_hdr_add_hrec(args->hdr_out, hrec);
+ }
}
if ( !*se ) break;
ss = ++se;
args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
}
- if ( !args->merge_method_str ) return;
+ if ( !args->merge_method_str.l ) return;
if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
- if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n");
- char *sb = args->merge_method_str;
+ if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
+ char *sb = args->merge_method_str.s;
while ( *sb )
{
char *se = sb;
char *mm_type_str = args->tmpks.s + args->tmpks.l;
while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
if ( *mm_type_str!=':' )
- error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str);
+ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
*mm_type_str = 0;
mm_type_str++;
int mm_type = MM_FIRST;
if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
+ else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
+ else if ( !strcasecmp("append-missing",mm_type_str) )
+ {
+ mm_type = MM_APPEND_MISSING;
+ if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
+ }
else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
- else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str);
+ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
for (i=0; i<args->ncols; i++)
{
if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
- if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR )
+ if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
args->cols[i].merge_method = mm_type;
break;
if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
sb = *se ? se + 1 : se;
}
+ if ( args->has_append_mode )
+ {
+ // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
+ args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
+ int ncol = 0;
+ for (i=0; i<args->ncols; i++)
+ if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
+ if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
+ args->aline_missing->mcols = ncol;
+ args->aline_missing->ncols = ncol;
+ args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
+ for (i=0; i<ncol; i++)
+ args->aline_missing->cols[i] = strdup(".");
+ }
}
static void rename_chrs(args_t *args, char *fname)
free(map);
}
+// Rename INFO, FORMAT and FILTER annotations in the output header.
+//
+//  args  .. program state; the header args->hdr_out is modified in place
+//  fname .. file with one whitespace-separated "old_name new_name" pair per line.
+//           The old name must carry one of the prefixes INFO/, FORMAT/, FMT/ or
+//           FILTER/ (matched case-insensitively)
+//
+// Lines naming a tag which is not defined in the header are skipped. A line with
+// no second column, an unknown prefix or an empty new name is a fatal error.
+static void rename_annots(args_t *args, char *fname)
+{
+    int n, i;
+    char **map = hts_readlist(fname, 1, &n);
+    if ( !map ) error("Could not read: %s\n", fname);
+    for (i=0; i<n; i++)
+    {
+        // Terminate the first column. isspace() is only defined for values
+        // representable as unsigned char (or EOF), plain char may be negative.
+        char *sb = NULL, *ss = map[i];
+        while ( *ss && !isspace((unsigned char)*ss) ) ss++;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+        *ss = 0;
+
+        // The prefix selects the header line type, sb points at the bare tag name
+        int type;
+        if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
+        else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
+        else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
+        else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
+        else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
+
+        int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
+        if ( id<0 ) continue;
+        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
+        if ( !hrec ) continue;  // the tag is not present in the header
+        int j = bcf_hrec_find_key(hrec, "ID");
+        assert( j>=0 );
+
+        // Isolate the second column (the new name) and refuse to install an
+        // empty ID, which would leave the header line without a usable name
+        ss++;
+        while ( *ss && isspace((unsigned char)*ss) ) ss++;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+        char *se = ss;
+        while ( *se && !isspace((unsigned char)*se) ) se++;
+        *se = 0;
+
+        // Replace the ID value of the header record and point the id table at it.
+        // NOTE(review): only id[BCF_DT_ID][id].key is updated here; whether the
+        // header's name dictionary needs updating as well is not visible from
+        // this block - confirm
+        free(hrec->vals[j]);
+        hrec->vals[j] = strdup(ss);
+        args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
+    }
+    for (i=0; i<n; i++) free(map[i]);
+    free(map);
+}
+
+
static void init_data(args_t *args)
{
args->hdr = args->files->readers[0].header;
// reading annots from a VCF
if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+ args->tgts_hdr = args->files->readers[1].header;
}
if ( args->columns ) init_columns(args);
if ( args->targets_fname && !args->tgts_is_vcf )
if ( !args->columns ) error("The -c option not given\n");
if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
- if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
- if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) )
+ if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
+ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
{
args->end_idx = -args->beg_idx - 1;
args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+ if ( args->rename_annots ) rename_annots(args, args->rename_annots);
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
free(args->cols[i].mm_kstr.s);
if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
free(args->cols[i].mm_dbl);
+ free(args->cols[i].ptr);
}
free(args->cols);
+ if ( args->aline_missing )
+ {
+ for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
+ free(args->aline_missing->cols);
+ free(args->aline_missing);
+ }
for (i=0; i<args->malines; i++)
{
free(args->alines[i].cols);
free(args->alines[i].line.s);
}
free(args->alines);
+ free(args->srt_alines);
if ( args->tgt_idx )
{
regidx_destroy(args->tgt_idx);
filter_destroy(args->filter);
if (args->out_fh) hts_close(args->out_fh);
free(args->sample_map);
+ free(args->merge_method_str.s);
}
static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
}
else i++;
}
-
if ( args->ref_idx==-1 && args->nalines ) return;
while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
}
}
+// Search for a string in a list of semicolon-separated strings (xx vs aa;bb).
+//
+//  needle   .. the item to look for, expected to contain no semicolon
+//  haystack .. semicolon-separated list of items
+//
+// Returns 1 if `needle` equals one complete item of `haystack` and 0 otherwise.
+// An occurrence which is only a prefix or a suffix of a longer item is not a match.
+static int str_match(char *needle, char *haystack)
+{
+    int len = strlen(needle);
+    char *ptr = haystack;
+    while ( *ptr && (ptr=strstr(ptr,needle)) )
+    {
+        int ends_item   = ptr[len]==0 || ptr[len]==';';
+        int starts_item = ptr==haystack || ptr[-1]==';';    // ptr[-1] is read only when ptr>haystack
+        if ( ends_item && starts_item ) return 1;           // a complete item, a match
+
+        // A prefix or a suffix of a longer item, not a match. Advance by exactly one
+        // character: skipping two could step over the terminating zero when `needle`
+        // is empty and make the loop condition read past the end of `haystack`.
+        ptr++;
+    }
+    return 0;
+}
+// Check whether two semicolon-separated lists share an item (xx;yy;zz vs aa;bb).
+// Each item of `a` is looked up in `b` with str_match(). While an item is being looked
+// up, its trailing ';' in `a` is temporarily overwritten with a terminating zero and
+// restored afterwards, therefore `a` must be writable. Returns the result of the first
+// successful str_match() call, or 0 if no item of `a` was found in `b`.
+static int strstr_match(char *a, char *b)
+{
+    char *item, *stop;
+    for (item=a; *item; item=stop+1)
+    {
+        // locate the end of the current item
+        for (stop=item; *stop && *stop!=';'; stop++) ;
+
+        // terminate the item in place for the lookup, then put the delimiter back
+        char saved = *stop;
+        if ( saved==';' ) *stop = 0;
+        int found = str_match(item,b);
+        *stop = saved;
+
+        // done on a hit or when this was the last item of the list
+        if ( found || !saved ) return found;
+    }
+    return 0;
+}
static void annotate(args_t *args, bcf1_t *line)
{
int i, j;
args->rm[i].handler(args, line, &args->rm[i]);
int has_overlap = 0;
-
if ( args->tgt_idx )
{
+ for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
{
while ( regitr_overlap(args->tgt_itr) )
tmp->end = args->tgt_itr->end;
parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],tmp) )
+ {
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+ if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
}
has_overlap = 1;
}
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].merge_method != MM_FIRST )
- args->cols[j].setter(args,line,&args->cols[j],NULL);
+ {
+ if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
}
else if ( args->tgts )
{
- // Buffer annotation lines. When multiple ALT alleles are present in the
- // annotation file, at least one must match one of the VCF alleles.
- int len = 0;
- bcf_get_variant_types(line);
- for (i=1; i<line->n_allele; i++)
- if ( len > line->d.var[i].n ) len = line->d.var[i].n;
- int end_pos = len<0 ? line->pos - len : line->pos;
+ // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+ // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+ // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+ // for an ALT, missing value is appended instead.
+ int end_pos = line->pos + line->rlen - 1;
buffer_annot_lines(args, line, line->pos, end_pos);
+
+ args->nsrt_alines = 0;
+ hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+ if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+ error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+
+ // Find matching lines
for (i=0; i<args->nalines; i++)
{
if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
- if ( args->ref_idx != -1 )
+ if ( args->ref_idx != -1 ) // REF+ALT matching requested
{
- if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible
+ if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible
for (j=1; j<args->alines[i].nals; j++)
{
- if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "."
- if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+ int ialt;
+ if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "."
+ ialt = 0;
+ else
+ {
+ ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+ if ( ialt < 0 ) continue;
+ ialt++;
+ }
+ if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+ args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+ has_overlap = 1;
+ break;
}
- if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT
}
- break;
+ else // overlap, REF+ALT matching not requested
+ {
+ args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+ has_overlap = 1;
+ }
}
-
- if ( i<args->nalines )
+ // Sort lines if needed
+ if ( args->has_append_mode )
+ {
+ // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+ uint32_t tmp;
+ for (i=1; i<args->nsrt_alines; i++)
+ for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+ tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+ }
+ // Annotate
+ for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+ int ialt_exp = 1;
+ for (i=0; i<args->nsrt_alines; i++)
{
- // there is a matching line
+ int ialt = args->srt_alines[i] >> 16;
+ int ilin = args->srt_alines[i] & 0xffff;
+ if ( args->has_append_mode )
+ {
+ if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position
+ if ( ialt_exp < ialt )
+ {
+ // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+ while ( ialt_exp++ < ialt )
+ {
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+ if ( ret < 0 )
+ error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ }
+ }
+ }
for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+ {
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+ if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ ialt_exp = ialt + 1;
+ }
+ if ( args->nsrt_alines )
+ {
+ // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+ // record was found. Otherwise the row will be left without annotation.
+ if ( args->has_append_mode && ialt_exp < line->n_allele )
+ {
+ while ( ialt_exp++ < line->n_allele )
+ {
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+ if ( args->cols[j].done==1 ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+ if ( ret < 0 )
+ error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ if ( ret==0 )
+ args->cols[j].done = 1;
+ }
+ }
+ }
+ // Flush
+ for (j=0; j<args->ncols; j++)
+ {
+ if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+ if ( ret < 0 )
+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
}
- has_overlap = i<args->nalines ? 1 : 0;
}
else if ( args->files->nreaders == 2 )
{
fprintf(bcftools_stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
- fprintf(bcftools_stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
- fprintf(bcftools_stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n");
- fprintf(bcftools_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
- fprintf(bcftools_stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
- fprintf(bcftools_stderr, " -l, --merge-logic <tag:type> merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
- fprintf(bcftools_stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
- fprintf(bcftools_stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
- fprintf(bcftools_stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
- fprintf(bcftools_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(bcftools_stderr, " --collapse STR matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+ fprintf(bcftools_stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(bcftools_stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n");
+ fprintf(bcftools_stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n");
+ fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
+ fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+ fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n");
+ fprintf(bcftools_stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+ fprintf(bcftools_stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n");
+ fprintf(bcftools_stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+ fprintf(bcftools_stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+ fprintf(bcftools_stderr, " --threads INT number of extra output compression threads [0]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfannotate(int argc, char *argv[])
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
+ args->match_id = -1;
int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"remove",required_argument,NULL,'x'},
+ {"columns-file",required_argument,NULL,'C'},
{"columns",required_argument,NULL,'c'},
+ {"rename-annots",required_argument,NULL,11},
{"rename-chrs",required_argument,NULL,1},
{"header-lines",required_argument,NULL,'h'},
{"samples",required_argument,NULL,'s'},
{"force",no_argument,NULL,'f'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
{
switch (c) {
case 'f': args->force = 1; break;
else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
else args->mark_sites = optarg;
break;
- case 'l': args->merge_method_str = optarg; break;
+ case 'l':
+ if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+ kputs(optarg,&args->merge_method_str);
+ break;
case 'I': args->set_ids_fmt = optarg; break;
case 's': args->sample_names = optarg; break;
case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
case 'c': args->columns = strdup(optarg); break;
+ case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
default: error("The output type \"%s\" not recognised\n", optarg);
};
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'x': args->remove_annots = optarg; break;
case 'a': args->targets_fname = optarg; break;
case 'r': args->regions_list = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->single_overlaps = 1; break;
+ case 11 : args->rename_annots = optarg; break;
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
/* The MIT License
- Copyright (c) 2016-2019 Genome Research Ltd.
+ Copyright (c) 2016-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
*/
+#include <assert.h>
+#include <strings.h>
#include <htslib/vcf.h>
#include <htslib/vcfutils.h>
+#include <htslib/hts_os.h>
#include "bcftools.h"
#include "vcfbuf.h"
#include "rbuf.h"
typedef struct
{
- double max;
- int rand_missing, skip_filter;
+ double max[VCFBUF_LD_N]; // one upper threshold per LD statistic, indexed by VCFBUF_LD_IDX_*; vcfbuf_ld() stops scanning once a value exceeds it
+ int rand_missing, filter1; // rand_missing: draw missing genotypes at random instead of skipping the sample; filter1: one-shot flag copied to the next record pushed by vcfbuf_push()
}
ld_t;
{
bcf1_t *rec;
double af;
- int af_set:1, idx:31;
+ int af_set:1, filter:1, idx:30;
}
vcfrec_t;
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST 2
+#define PRUNE_MODE_RAND 3
typedef struct
{
- int max_sites, mvrec, mac, mfarr;
+ int max_sites, mvrec, mac, mfarr, mode;
int *ac, *idx;
float *farr;
char *af_tag;
buf->hdr = hdr;
buf->win = win;
buf->overlap.rid = -1;
+ int i;
+ for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
rbuf_init(&buf->rbuf, 0);
return buf;
}
void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
{
- if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }
- if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
- if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }
- if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }
+ if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; } // LD options: `value` always points to the setting and is dereferenced as the type the key expects (int here)
+ if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
+ if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; } // the three LD_MAX_* keys take a pointer to double
+ if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
+ if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+ // Pruning: maximum number of sites to keep
+ if ( key==VCFBUF_NSITES )
+ {
+ buf->prune.max_sites = *((int*)value);
+ if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; // fall back to maxAF only if no mode was selected via VCFBUF_NSITES_MODE
+ return;
+ }
if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+ // Pruning mode given as a case-insensitive string: "maxAF", "1st" or "rand"
+ if ( key==VCFBUF_NSITES_MODE )
+ {
+ char *mode = *((char**)value); // `value` is a pointer to the C string, i.e. char**
+ if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+ else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+ else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+ else error("The mode \"%s\" is not recognised\n",mode);
+ }
}
int vcfbuf_nsites(vcfbuf_t *buf)
return buf->rbuf.n;
}
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) // appends `rec` to the ring buffer and returns the record pointer previously held by the slot it was stored in
{
- if ( !swap ) error("todo: swap=%d\n", swap);
-
rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
int i = rbuf_append(&buf->rbuf);
bcf1_t *ret = buf->vcf[i].rec;
buf->vcf[i].rec = rec;
buf->vcf[i].af_set = 0;
+ buf->vcf[i].filter = buf->ld.filter1; // tag the record with the pending LD_FILTER1 flag; vcfbuf_ld() skips records which have it set
+ buf->ld.filter1 = 0; // the flag applies to a single pushed record only
return ret;
}
{
int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
+ int nprune = nbuf - buf->prune.max_sites;
+ int i,k,irec = 0;
+ if ( buf->prune.mode==PRUNE_MODE_1ST )
+ {
+ int eoff = flush_all ? 1 : 2;
+ for (i=0; i<nprune; i++)
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf);
+ return;
+ }
+ if ( buf->prune.mode==PRUNE_MODE_RAND )
+ {
+ int eoff = flush_all ? 0 : 1;
+ for (i=0; i<nprune; i++)
+ {
+ int j = (buf->rbuf.n - eoff) * hts_drand48();
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf);
+ }
+ return;
+ }
+
if ( nbuf > buf->prune.mvrec )
{
buf->prune.idx = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
}
// set allele frequency and prepare buffer for sorting
- int i,k,irec = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
{
bcf1_t *line = buf->vcf[i].rec;
// sort the rbuf indexes to be pruned descendently so that j-th rbuf index
// is removed before i-th index if i<j
- int nprune = nbuf - buf->prune.max_sites;
for (i=0; i<nprune; i++)
buf->prune.idx[i] = buf->prune.vrec[i]->idx;
}
/*
- For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
+ The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+
+ and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
+ disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
+
+ \hat{D} = 1/[n*(n+1)]*[
+ (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
+ -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2)
+ ]
+ where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes.
+
+ Returns 0 on success, -1 if the values could not be determined (missing genotypes)
*/
-static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld)
{
if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
assert( arec->n_sample );
baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
}
- // Calculate correlation
+ // Calculate r2, ld, hd
+ double nhd[] = {0,0,0,0,0,0,0,0,0};
double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
- int nab = 0, na = 0, nb = 0, ndiff = 0;
+ int nab = 0, ndiff = 0;
+ int an_tot = 0, bn_tot = 0;
for (i=0; i<arec->n_sample; i++)
{
int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
- int adsg = 0, bdsg = 0, an = 0, bn = 0;
+ int adsg = 0, bdsg = 0; // dosages (0,1,2) at sites (a,b)
+ int an = 0, bn = 0; // number of alleles at sites (a,b)
for (j=0; j<afmt->n; j++)
{
if ( aptr[j]==bcf_int8_vector_end ) break;
if ( aptr[j]==bcf_gt_missing )
{
if ( !buf->ld.rand_missing ) break;
- if ( rand()/RAND_MAX >= aaf ) adsg += 1;
+ if ( hts_drand48() >= aaf ) adsg += 1;
}
else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
an++;
if ( bptr[j]==bcf_gt_missing )
{
if ( !buf->ld.rand_missing ) break;
- if ( rand()/RAND_MAX >= baf ) bdsg += 1;
+ if ( hts_drand48() >= baf ) bdsg += 1;
}
else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
bn++;
}
- if ( an )
+ if ( an && bn )
{
+ an_tot += an;
aa += adsg*adsg;
a += adsg;
- na++;
- }
- if ( bn )
- {
+
+ bn_tot += bn;
bb += bdsg*bdsg;
b += bdsg;
- nb++;
- }
- if ( an && bn )
- {
+
if ( adsg!=bdsg ) ndiff++;
ab += adsg*bdsg;
nab++;
}
+ if ( an==2 && bn==2 ) // for now only diploid genotypes
+ {
+ assert( adsg<=2 && bdsg<=2 );
+ nhd[ bdsg*3 + adsg ]++;
+ }
}
- if ( !nab ) return -1;
+ if ( !nab ) return -1; // no data in common for the two sites
+ double pa = a/an_tot;
+ double pb = b/bn_tot;
double cor;
if ( !ndiff ) cor = 1;
else
{
- // Don't know how to deal with zero variance. Since this the purpose is filtering,
- // it is not enough to say the value is undefined. Therefore an artificial noise is
- // added to make the denominator non-zero.
- if ( aa == a*a/na || bb == b*b/nb )
+ if ( aa == a*a/nab || bb == b*b/nab ) // zero variance, add small noise
{
- aa += 3*3;
- bb += 3*3;
- ab += 3*3;
- a += 3;
- b += 3;
- na++;
- nb++;
+ aa += 1e-4;
+ bb += 1e-4;
+ ab += 1e-4;
+ a += 1e-2;
+ b += 1e-2;
nab++;
}
- cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+ cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab);
}
- return cor*cor;
+
+ ld->val[VCFBUF_LD_IDX_R2] = cor * cor;
+
+ // Lewontin's normalization of D. Also we cap at 1 as the calculation
+ // can result in values bigger than 1 for high AFs.
+ ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb));
+ double norm;
+ if ( ld->val[VCFBUF_LD_IDX_LD] < 0 )
+ norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb);
+ else
+ norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb;
+ if ( norm )
+ ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1;
+ if ( !ld->val[VCFBUF_LD_IDX_LD] )
+ ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output
+
+ ld->val[VCFBUF_LD_IDX_HD] =
+ (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
+ - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
+ ld->val[VCFBUF_LD_IDX_HD] /= nab;
+ ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
+
+ return 0;
}
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld) // compares `rec` against the buffered sites; returns 0 if values were computed for at least one site, -1 otherwise
{
- *ld = -1;
- if ( !buf->rbuf.n ) return NULL;
+ int ret = -1; // stays -1 until _calc_r2_ld() succeeds for some buffered site
+ if ( !buf->rbuf.n ) return ret; // empty buffer, `ld` is left untouched
- int i = buf->rbuf.f;
+ int j, i = buf->rbuf.f;
// Relying on vcfbuf being properly flushed - all sites in the buffer
// must come from the same chromosome
- if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+ if ( buf->vcf[i].rec->rid != rec->rid ) return ret; // different chromosome, `ld` is left untouched
+ // Initialize the output: no value and no record for any of the statistics
+ vcfbuf_ld_t tmp;
+ for (j=0; j<VCFBUF_LD_N; j++)
+ {
+ ld->val[j] = -HUGE_VAL;
+ ld->rec[j] = NULL;
+ }
- int imax = 0;
- double max = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i); )
{
- if ( buf->ld.skip_filter )
- {
- if ( buf->vcf[i].rec->d.n_flt > 1 ) continue; // multiple filters are set
- if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue; // not PASS
- }
- double val = _calc_ld(buf, buf->vcf[i].rec, rec);
- if ( buf->ld.max && buf->ld.max < val )
- {
- *ld = val;
- return buf->vcf[i].rec;
- }
- if ( val > max )
+ if ( buf->vcf[i].filter ) continue; // record was flagged via LD_FILTER1 when it was pushed
+ if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes
+ // Keep the largest value seen so far for each statistic together with the record that produced it
+ int done = 0;
+ for (j=0; j<VCFBUF_LD_N; j++)
{
- max = val;
- imax = i;
+ if ( ld->val[j] < tmp.val[j] )
+ {
+ ld->val[j] = tmp.val[j];
+ ld->rec[j] = buf->vcf[i].rec;
+ }
+ if ( buf->ld.max[j] < tmp.val[j] ) done = 1; // a threshold set with LD_MAX_* was exceeded
+ ret = 0;
}
+ if ( done ) return ret; // stop early, the remaining buffered sites are not examined
}
- *ld = max;
- return buf->vcf[imax].rec;
+ return ret;
}
/* The MIT License
- Copyright (c) 2016-2019 Genome Research Ltd.
+ Copyright (c) 2016-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
*/
+#include <assert.h>
+#include <strings.h>
#include <htslib/vcf.h>
#include <htslib/vcfutils.h>
+#include <htslib/hts_os.h>
#include "bcftools.h"
#include "vcfbuf.h"
#include "rbuf.h"
typedef struct
{
- double max;
- int rand_missing, skip_filter;
+ double max[VCFBUF_LD_N]; // one upper threshold per LD statistic, indexed by VCFBUF_LD_IDX_*; set through vcfbuf_set() with the LD_MAX_* keys
+ int rand_missing, filter1; // rand_missing: draw missing genotypes at random instead of skipping the sample; filter1: one-shot flag copied to the next record pushed by vcfbuf_push()
}
ld_t;
{
bcf1_t *rec;
double af;
- int af_set:1, idx:31;
+ int af_set:1, filter:1, idx:30;
}
vcfrec_t;
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST 2
+#define PRUNE_MODE_RAND 3
typedef struct
{
- int max_sites, mvrec, mac, mfarr;
+ int max_sites, mvrec, mac, mfarr, mode;
int *ac, *idx;
float *farr;
char *af_tag;
buf->hdr = hdr;
buf->win = win;
buf->overlap.rid = -1;
+ int i;
+ for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
rbuf_init(&buf->rbuf, 0);
return buf;
}
void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
{
- if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }
- if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
- if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }
- if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }
+ if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; } // LD options: `value` always points to the setting and is dereferenced as the type the key expects (int here)
+ if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
+ if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; } // the three LD_MAX_* keys take a pointer to double
+ if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
+ if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+ // Pruning: maximum number of sites to keep
+ if ( key==VCFBUF_NSITES )
+ {
+ buf->prune.max_sites = *((int*)value);
+ if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; // fall back to maxAF only if no mode was selected via VCFBUF_NSITES_MODE
+ return;
+ }
if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+ // Pruning mode given as a case-insensitive string: "maxAF", "1st" or "rand"
+ if ( key==VCFBUF_NSITES_MODE )
+ {
+ char *mode = *((char**)value); // `value` is a pointer to the C string, i.e. char**
+ if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+ else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+ else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+ else error("The mode \"%s\" is not recognised\n",mode);
+ }
}
int vcfbuf_nsites(vcfbuf_t *buf)
return buf->rbuf.n;
}
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) // appends `rec` to the ring buffer and returns the record pointer previously held by the slot it was stored in
{
- if ( !swap ) error("todo: swap=%d\n", swap);
-
rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
int i = rbuf_append(&buf->rbuf);
bcf1_t *ret = buf->vcf[i].rec;
buf->vcf[i].rec = rec;
buf->vcf[i].af_set = 0;
+ buf->vcf[i].filter = buf->ld.filter1; // tag the record with the pending LD_FILTER1 flag set through vcfbuf_set()
+ buf->ld.filter1 = 0; // the flag applies to a single pushed record only
return ret;
}
{
int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
+ int nprune = nbuf - buf->prune.max_sites;
+ int i,k,irec = 0;
+ if ( buf->prune.mode==PRUNE_MODE_1ST )
+ {
+ int eoff = flush_all ? 1 : 2;
+ for (i=0; i<nprune; i++)
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf);
+ return;
+ }
+ if ( buf->prune.mode==PRUNE_MODE_RAND )
+ {
+ int eoff = flush_all ? 0 : 1;
+ for (i=0; i<nprune; i++)
+ {
+ int j = (buf->rbuf.n - eoff) * hts_drand48();
+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf);
+ }
+ return;
+ }
+
if ( nbuf > buf->prune.mvrec )
{
buf->prune.idx = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
}
// set allele frequency and prepare buffer for sorting
- int i,k,irec = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
{
bcf1_t *line = buf->vcf[i].rec;
// sort the rbuf indexes to be pruned descendently so that j-th rbuf index
// is removed before i-th index if i<j
- int nprune = nbuf - buf->prune.max_sites;
for (i=0; i<nprune; i++)
buf->prune.idx[i] = buf->prune.vrec[i]->idx;
}
/*
- For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
+ The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+
+ and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
+ disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
+
+ \hat{D} = 1/[n*(n+1)]*[
+ (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
+ -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2)
+ ]
+ where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes.
+
+ Returns 0 on success, -1 if the values could not be determined (missing genotypes)
*/
-static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld)
{
if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
assert( arec->n_sample );
baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
}
- // Calculate correlation
+ // Calculate r2, ld, hd
+ double nhd[] = {0,0,0,0,0,0,0,0,0};
double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
- int nab = 0, na = 0, nb = 0, ndiff = 0;
+ int nab = 0, ndiff = 0;
+ int an_tot = 0, bn_tot = 0;
for (i=0; i<arec->n_sample; i++)
{
int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
- int adsg = 0, bdsg = 0, an = 0, bn = 0;
+ int adsg = 0, bdsg = 0; // dosages (0,1,2) at sites (a,b)
+ int an = 0, bn = 0; // number of alleles at sites (a,b)
for (j=0; j<afmt->n; j++)
{
if ( aptr[j]==bcf_int8_vector_end ) break;
if ( aptr[j]==bcf_gt_missing )
{
if ( !buf->ld.rand_missing ) break;
- if ( rand()/RAND_MAX >= aaf ) adsg += 1;
+ if ( hts_drand48() >= aaf ) adsg += 1;
}
else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
an++;
if ( bptr[j]==bcf_gt_missing )
{
if ( !buf->ld.rand_missing ) break;
- if ( rand()/RAND_MAX >= baf ) bdsg += 1;
+ if ( hts_drand48() >= baf ) bdsg += 1;
}
else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
bn++;
}
- if ( an )
+ if ( an && bn )
{
+ an_tot += an;
aa += adsg*adsg;
a += adsg;
- na++;
- }
- if ( bn )
- {
+
+ bn_tot += bn;
bb += bdsg*bdsg;
b += bdsg;
- nb++;
- }
- if ( an && bn )
- {
+
if ( adsg!=bdsg ) ndiff++;
ab += adsg*bdsg;
nab++;
}
+ if ( an==2 && bn==2 ) // for now only diploid genotypes
+ {
+ assert( adsg<=2 && bdsg<=2 );
+ nhd[ bdsg*3 + adsg ]++;
+ }
}
- if ( !nab ) return -1;
+ if ( !nab ) return -1; // no data in common for the two sites
+ double pa = a/an_tot;
+ double pb = b/bn_tot;
double cor;
if ( !ndiff ) cor = 1;
else
{
- // Don't know how to deal with zero variance. Since this the purpose is filtering,
- // it is not enough to say the value is undefined. Therefore an artificial noise is
- // added to make the denominator non-zero.
- if ( aa == a*a/na || bb == b*b/nb )
+ if ( aa == a*a/nab || bb == b*b/nab ) // zero variance, add small noise
{
- aa += 3*3;
- bb += 3*3;
- ab += 3*3;
- a += 3;
- b += 3;
- na++;
- nb++;
+ aa += 1e-4;
+ bb += 1e-4;
+ ab += 1e-4;
+ a += 1e-2;
+ b += 1e-2;
nab++;
}
- cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+ cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab);
}
- return cor*cor;
+
+ ld->val[VCFBUF_LD_IDX_R2] = cor * cor;
+
+ // Lewontin's normalization of D. Also we cap at 1 as the calculation
+ // can result in values bigger than 1 for high AFs.
+ ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb));
+ double norm;
+ if ( ld->val[VCFBUF_LD_IDX_LD] < 0 )
+ norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb);
+ else
+ norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb;
+ if ( norm )
+ ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1;
+ if ( !ld->val[VCFBUF_LD_IDX_LD] )
+ ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output
+
+ ld->val[VCFBUF_LD_IDX_HD] =
+ (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
+ - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
+ ld->val[VCFBUF_LD_IDX_HD] /= nab;
+ ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
+
+ return 0;
}
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld)
{
- *ld = -1;
- if ( !buf->rbuf.n ) return NULL;
+ int ret = -1;
+ if ( !buf->rbuf.n ) return ret;
- int i = buf->rbuf.f;
+ int j, i = buf->rbuf.f;
// Relying on vcfbuf being properly flushed - all sites in the buffer
// must come from the same chromosome
- if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+ if ( buf->vcf[i].rec->rid != rec->rid ) return ret;
+
+ vcfbuf_ld_t tmp;
+ for (j=0; j<VCFBUF_LD_N; j++)
+ {
+ ld->val[j] = -HUGE_VAL;
+ ld->rec[j] = NULL;
+ }
- int imax = 0;
- double max = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i); )
{
- if ( buf->ld.skip_filter )
- {
- if ( buf->vcf[i].rec->d.n_flt > 1 ) continue; // multiple filters are set
- if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue; // not PASS
- }
- double val = _calc_ld(buf, buf->vcf[i].rec, rec);
- if ( buf->ld.max && buf->ld.max < val )
- {
- *ld = val;
- return buf->vcf[i].rec;
- }
- if ( val > max )
+ if ( buf->vcf[i].filter ) continue;
+ if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes
+
+ int done = 0;
+ for (j=0; j<VCFBUF_LD_N; j++)
{
- max = val;
- imax = i;
+ if ( ld->val[j] < tmp.val[j] )
+ {
+ ld->val[j] = tmp.val[j];
+ ld->rec[j] = buf->vcf[i].rec;
+ }
+ if ( buf->ld.max[j] < tmp.val[j] ) done = 1;
+ ret = 0;
}
+ if ( done ) return ret;
}
- *ld = max;
- return buf->vcf[imax].rec;
+ return ret;
}
/* The MIT License
- Copyright (c) 2017-2019 Genome Research Ltd.
+ Copyright (c) 2017-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
// Modes of operation
typedef enum
{
- VCFBUF_LD_MAX, // vcfbuf_max_ld() stops at the first record that exceeds the threshold
- VCFBUF_RAND_MISSING, // randomize rather than ignore missing genotypes
- VCFBUF_SKIP_FILTER, // skip sites with FILTER diferent from "PASS" or "."
- VCFBUF_NSITES, // leave at max this many sites in the window
- VCFBUF_AF_TAG, // use this INFO tag with LD_NSITES
VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window
VCFBUF_RMDUP, // remove duplicate sites (completely)
+ VCFBUF_NSITES, // leave at max this many sites in the window
+ VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly)
+ VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES
+
+ // LD related options
+ LD_RAND_MISSING, // randomize rather than ignore missing genotypes
+ LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis
+ LD_MAX_R2, // If set, vcfbuf_ld() will stop at the first record that exceeds the R2,
+ LD_MAX_LD, // LD, or HD threshold. When multiple are set, the OR logic is applied
+ LD_MAX_HD, //
}
vcfbuf_opt_t;
/*
* vcfbuf_push() - push a new site for analysis
- * @swap: if set, do not create a copy, but return a substitute
*/
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap);
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec);
/*
* vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer
int vcfbuf_nsites(vcfbuf_t *buf);
/*
- * vcfbuf_max_ld() - return a record that has maximum D or first record exceeding the threshold
- * @ld: will be filled with the maximum D found
+ * vcfbuf_ld() - find the records with maximum LD values, or the values of the first record that
+ * exceeds the thresholds set by vcfbuf_set_opt(..,LD_MAX*,..)
+ *
+ * Returns 0 on success or -1 if no values were filled.
+ *
+ * @val: will be filled with the values
+ * .. correlation coefficient r-squared
+ * .. Lewontin's D' (PMID: 19433632)
+ * .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265)
+ * @rec: corresponding records, or NULL if the value has not been set
*/
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld);
+#define VCFBUF_LD_N 3
+#define VCFBUF_LD_IDX_R2 0
+#define VCFBUF_LD_IDX_LD 1
+#define VCFBUF_LD_IDX_HD 2
+typedef struct
+{
+ double val[VCFBUF_LD_N]; // r2, ld, hd
+ bcf1_t *rec[VCFBUF_LD_N]; // record with max r2, ld, hd
+}
+vcfbuf_ld_t;
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld);
#endif
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
.ploidy =
"* * * * 1\n"
},
+ { .alias = "2",
+ .about = "Treat all samples as diploid",
+ .ploidy =
+ "* * * * 2\n"
+ },
{
.alias = NULL,
.about = NULL,
bcf_unpack(rec, BCF_UN_STR);
if ( !rec0 ) rec0 = rec;
recN = rec;
- args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);
+ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
}
}
// Open files for input and output, initialize structures
if ( args->targets )
{
- args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
+ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
args->tgt_itr = regitr_init(args->tgt_idx);
args->tgt_itr_tmp = regitr_init(args->tgt_idx);
}
if ( args->aux.flag & CALL_CONSTR_ALLELES )
args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
else error("Could not parse --novel-rate %s\n", str);
}
-static int parse_format_flag(const char *str)
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+ "\n"
+ "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
+ " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
+ "\n"
+ "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
+ " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
+ " FORMAT/GP .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
+ "\n");
+}
+
+static int parse_output_tags(const char *str)
{
int flag = 0;
const char *ss = str;
{
const char *se = ss;
while ( *se && *se!=',' ) se++;
- if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
- else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+ if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ;
+ else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
+ else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
else
{
fprintf(stderr,"Could not parse \"%s\"\n", str);
fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "File format options:\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
- fprintf(stderr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --samples <list> list of samples to include [all samples]\n");
- fprintf(stderr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+ fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n");
+ fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Input/output options:\n");
- fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
- fprintf(stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
- fprintf(stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
- fprintf(stderr, " -G, --group-samples <file|-> group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n");
- fprintf(stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
- fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
- fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
- fprintf(stderr, " -V, --skip-variants <type> skip indels/snps\n");
- fprintf(stderr, " -v, --variants-only output variant sites only\n");
+ fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
+//todo?
+// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
+// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n");
+ fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
+ fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+ fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+ fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
+ fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
+ fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
+ fprintf(stderr, " -v, --variants-only Output variant sites only\n");
fprintf(stderr, "\n");
fprintf(stderr, "Consensus/variant calling options:\n");
- fprintf(stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
- fprintf(stderr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
- fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
- fprintf(stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
- fprintf(stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+ fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
+ fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
+ fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Example:\n");
fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
- fprintf(stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
+ fprintf(stderr, " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
// todo (and more)
// fprintf(stderr, "\nContrast calling and association test options:\n");
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"annotate",required_argument,NULL,'a'},
{"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"group-samples",required_argument,NULL,'G'},
+ {"group-samples-tag",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"regions",required_argument,NULL,'r'},
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
case 'G': args.aux.sample_groups = optarg; break;
- case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+ case 3 : args.aux.sample_groups_tag = optarg; break;
+ case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
+ case 'a':
+ if (optarg[0]=='?') { list_annotations(stderr); return 1; }
+ args.aux.output_tags |= parse_output_tags(optarg);
+ break;
case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
case 'A': args.aux.flag |= CALL_KEEPALT; break;
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
.ploidy =
"* * * * 1\n"
},
+ { .alias = "2",
+ .about = "Treat all samples as diploid",
+ .ploidy =
+ "* * * * 2\n"
+ },
{
.alias = NULL,
.about = NULL,
bcf_unpack(rec, BCF_UN_STR);
if ( !rec0 ) rec0 = rec;
recN = rec;
- args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);
+ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
}
}
// Open files for input and output, initialize structures
if ( args->targets )
{
- args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
+ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
args->tgt_itr = regitr_init(args->tgt_idx);
args->tgt_itr_tmp = regitr_init(args->tgt_idx);
}
if ( args->aux.flag & CALL_CONSTR_ALLELES )
args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
else error("Could not parse --novel-rate %s\n", str);
}
-static int parse_format_flag(const char *str)
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+ "\n"
+ "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
+ " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
+ "\n"
+ "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
+ " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
+ " FORMAT/GP .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
+ "\n");
+}
+
+static int parse_output_tags(const char *str)
{
int flag = 0;
const char *ss = str;
{
const char *se = ss;
while ( *se && *se!=',' ) se++;
- if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
- else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+ if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ;
+ else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
+ else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
else
{
fprintf(bcftools_stderr,"Could not parse \"%s\"\n", str);
- exit(1);
+ bcftools_exit(1);
}
if ( !*se ) break;
ss = se + 1;
fprintf(bcftools_stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
fprintf(bcftools_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
fprintf(bcftools_stderr,"\n");
- exit(-1);
+ bcftools_exit(-1);
}
else if ( detailed )
{
fprintf(bcftools_stderr,"%s", pld->ploidy);
- exit(-1);
+ bcftools_exit(-1);
}
return ploidy_init_string(pld->ploidy,2);
}
fprintf(bcftools_stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "File format options:\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " --ploidy <assembly>[?] predefined ploidy, 'list' to print available settings, append '?' for details\n");
- fprintf(bcftools_stderr, " --ploidy-file <file> space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --samples <list> list of samples to include [all samples]\n");
- fprintf(bcftools_stderr, " -S, --samples-file <file> PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+ fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Input/output options:\n");
- fprintf(bcftools_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
- fprintf(bcftools_stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
- fprintf(bcftools_stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
- fprintf(bcftools_stderr, " -G, --group-samples <file|-> group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n");
- fprintf(bcftools_stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
- fprintf(bcftools_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
- fprintf(bcftools_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
- fprintf(bcftools_stderr, " -V, --skip-variants <type> skip indels/snps\n");
- fprintf(bcftools_stderr, " -v, --variants-only output variant sites only\n");
+ fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
+ fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
+//todo?
+// fprintf(bcftools_stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
+// fprintf(bcftools_stderr, " tag removal [^I16,^QS,^FMT/QS]\n");
+ fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
+ fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+ fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+ fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
+ fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
+ fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
+ fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
- fprintf(bcftools_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n");
- fprintf(bcftools_stderr, " -C, --constrain <str> one of: alleles, trio (see manual)\n");
- fprintf(bcftools_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
- fprintf(bcftools_stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
- fprintf(bcftools_stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(bcftools_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+ fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
+ fprintf(bcftools_stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
+ fprintf(bcftools_stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(bcftools_stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(bcftools_stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(bcftools_stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Example:\n");
fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
- fprintf(bcftools_stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
+ fprintf(bcftools_stderr, " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
// todo (and more)
// fprintf(bcftools_stderr, "\nContrast calling and association test options:\n");
// fprintf(bcftools_stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
// fprintf(bcftools_stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
fprintf(bcftools_stderr, "\n");
- exit(-1);
+ bcftools_exit(-1);
}
int main_vcfcall(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"annotate",required_argument,NULL,'a'},
{"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"group-samples",required_argument,NULL,'G'},
+ {"group-samples-tag",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"regions",required_argument,NULL,'r'},
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
case 'G': args.aux.sample_groups = optarg; break;
- case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+ case 3 : args.aux.sample_groups_tag = optarg; break;
+ case 'f': fprintf(bcftools_stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
+ case 'a':
+ if (optarg[0]=='?') { list_annotations(bcftools_stderr); return 1; }
+ args.aux.output_tags |= parse_output_tags(optarg);
+ break;
case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
case 'A': args.aux.flag |= CALL_KEEPALT; break;
#include <stdio.h>
#include <unistd.h>
+#include <assert.h>
#include <getopt.h>
#include <math.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>
+#include <assert.h>
#include <getopt.h>
#include <math.h>
#include <inttypes.h>
fprintf(bcftools_stderr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
fprintf(bcftools_stderr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfcnv(int argc, char *argv[])
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
+#include <assert.h>
#include <errno.h>
#include <math.h>
#include <inttypes.h>
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->allow_overlaps || args->phased_concat )
{
else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE;
else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
}
for (i=0; i<args->nfnames; i++)
if ( !args->swap_phase[i] ) continue;
int *gt = &args->GTa[i*2];
if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+ if ( !bcf_gt_is_phased(gt[1]) ) continue;
SWAP(int, gt[0], gt[1]);
gt[1] |= 1;
}
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
fprintf(stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
- fprintf(stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
- fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
+ fprintf(stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+ fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 'd': args->remove_dups = optarg; break;
- case 'D': args->remove_dups = "none"; break;
+ case 'D': args->remove_dups = "exact"; break;
case 'q':
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <unistd.h>
#include <getopt.h>
#include <string.h>
+#include <assert.h>
#include <errno.h>
#include <math.h>
#include <inttypes.h>
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->allow_overlaps || args->phased_concat )
{
else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE;
else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
}
for (i=0; i<args->nfnames; i++)
if ( !args->swap_phase[i] ) continue;
int *gt = &args->GTa[i*2];
if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+ if ( !bcf_gt_is_phased(gt[1]) ) continue;
SWAP(int, gt[0], gt[1]);
gt[1] |= 1;
}
fprintf(bcftools_stderr, "Options:\n");
fprintf(bcftools_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
fprintf(bcftools_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
- fprintf(bcftools_stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
- fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d none\n");
+ fprintf(bcftools_stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+ fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n");
fprintf(bcftools_stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " --threads <int> Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfconcat(int argc, char *argv[])
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 'd': args->remove_dups = optarg; break;
- case 'D': args->remove_dups = "none"; break;
+ case 'D': args->remove_dups = "exact"; break;
case 'q':
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
- int argc, n_threads, record_cmd_line;
+ int argc, n_threads, record_cmd_line, keep_duplicates;
};
static void destroy_data(args_t *args)
if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
rec->pos--;
+ // ID
+ if ( args->output_vcf_ids )
+ {
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ bcf_update_id(args->header, rec, tsv->ss);
+ *tsv->se = tmp;
+ }
+
// REF,ALT
args->str.l = 0;
se = ++ss;
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nrows; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
}
// skip duplicate lines, or otherwise shapeit complains
- if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+ if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
prev_rid = line->rid;
prev_pos = line->pos;
if ( args->output_vcf_ids )
kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
else
- kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
if ( args->hap2dip )
kputs("%_GT_TO_HAP2\n", &str);
if ( n_files==1 )
{
int l = str.l;
- kputs(".sample",&str);
+ kputs(".samples",&str);
sample_fname = strdup(str.s);
str.l = l;
kputs(".hap.gz",&str);
bcf_hdr_add_sample(args->header, NULL);
args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
static void vcf_to_vcf(args_t *args)
{
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
fprintf(stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --keep-duplicates keep duplicate positions\n");
fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
{"columns",required_argument,NULL,'c'},
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
+ {"keep-duplicates",no_argument,NULL,12},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
switch (c) {
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
case 11 : args->sex_fname = optarg; break;
+ case 12 : args->keep_duplicates = 1; break;
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
- int argc, n_threads, record_cmd_line;
+ int argc, n_threads, record_cmd_line, keep_duplicates;
};
static void destroy_data(args_t *args)
if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
rec->pos--;
+ // ID
+ if ( args->output_vcf_ids )
+ {
+ char tmp = *tsv->se;
+ *tsv->se = 0;
+ bcf_update_id(args->header, rec, tsv->ss);
+ *tsv->se = tmp;
+ }
+
// REF,ALT
args->str.l = 0;
se = ++ss;
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nrows; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
}
// skip duplicate lines, or otherwise shapeit complains
- if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+ if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
prev_rid = line->rid;
prev_pos = line->pos;
if ( args->output_vcf_ids )
kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
else
- kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+ kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
if ( args->hap2dip )
kputs("%_GT_TO_HAP2\n", &str);
if ( n_files==1 )
{
int l = str.l;
- kputs(".sample",&str);
+ kputs(".samples",&str);
sample_fname = strdup(str.s);
str.l = l;
kputs(".hap.gz",&str);
bcf_hdr_add_sample(args->header, NULL);
args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
static void vcf_to_vcf(args_t *args)
{
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+ htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
fprintf(bcftools_stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(bcftools_stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(bcftools_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(bcftools_stderr, " --keep-duplicates keep duplicate positions\n");
fprintf(bcftools_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(bcftools_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(bcftools_stderr, "\n");
// fprintf(bcftools_stderr, "PBWT options:\n");
// fprintf(bcftools_stderr, " -b, --pbwt <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
// fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfconvert(int argc, char *argv[])
{"columns",required_argument,NULL,'c'},
{"fasta-ref",required_argument,NULL,'f'},
{"no-version",no_argument,NULL,10},
+ {"keep-duplicates",no_argument,NULL,12},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
switch (c) {
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
case 11 : args->sex_fname = optarg; break;
+ case 12 : args->keep_duplicates = 1; break;
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
char *soft_filter; // drop failed sites or annotate FILTER column?
int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
int flt_fail, flt_pass; // BCF ids of fail and pass filters
- int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+ int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id;
+ char *snp_gap_str;
int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
rbuf_t rbuf;
bcf1_t **rbuf_lines;
static void init_data(args_t *args)
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
if ( args->snp_gap )
{
- bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of %s\">", args->snp_gap,args->snp_gap_str);
args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
assert( args->SnpGap_id>=0 );
}
*/
// To avoid additional data structure, we abuse bcf1_t's var and var_type records.
- const int SnpGap_set = VCF_OTHER<<1;
- const int IndelGap_set = VCF_OTHER<<2;
- const int IndelGap_flush = VCF_OTHER<<3;
+ const int SnpGap_set = 1 << (8*sizeof(int)/2); // marker bit at half the bit-width of int (bit 16 when int is 32 bits); stored in d.var_type, presumably clear of the VCF_* type flags - confirm against htslib/vcf.h
+ const int IndelGap_set = 1 << (8*sizeof(int)/2-1); // the bit directly below SnpGap_set
+ const int IndelGap_flush = 1 << (8*sizeof(int)/2-2); // two bits below SnpGap_set
int var_type = 0, i;
if ( line )
// output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
// used. This filter is therefore more strict and may remove some valid
// SNPs.
- int len = 1;
- if ( var_type & VCF_INDEL )
- {
- for (i=1; i<line->n_allele; i++)
- if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
- }
-
// Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
- line->d.var[0].n = len;
+ line->d.var[0].n = line->rlen;
}
int k_flush = 1;
int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant
if ( rec_to + args->snp_gap < last_from )
j_flush++;
- else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+ else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
{
// this SNP has not been SnpGap-filtered yet
rec->d.var_type |= SnpGap_set;
bcf_add_filter(args->hdr, rec, args->SnpGap_id);
}
- else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+ else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) )
{
// the line which we are adding is a SNP and needs to be filtered
line->d.var_type |= SnpGap_set;
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
+ fprintf(stderr, " -g, --SnpGap <int>[:type] filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
fprintf(stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
char *tmp;
while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
switch (c) {
- case 'g':
+ case 'g':
args->snp_gap = strtol(optarg,&tmp,10);
- if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ if ( *tmp==':' )
+ {
+ // Optional ":type[,type...]" suffix: the variant classes whose proximity filters a SNP.
+ // The raw suffix is kept as given because it is printed in the SnpGap FILTER header description.
+ args->snp_gap_str = tmp+1;
+ // Start from an empty mask so that a repeated -g replaces the earlier selection
+ // instead of extending it, keeping the mask in sync with snp_gap_str.
+ args->snp_gap_type = 0;
+ int i, n = 0;
+ char **keys = hts_readlist(tmp+1,0,&n);
+ // hts_readlist() may return NULL on failure, in which case n is not guaranteed to be set
+ if ( !keys ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ for(i=0; i<n; i++)
+ {
+ // Type names are matched case-insensitively; anything unknown is a fatal error
+ if ( !strcasecmp(keys[i],"indel") ) args->snp_gap_type |= VCF_INDEL;
+ else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP;
+ else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND;
+ else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER;
+ else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP;
+ else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg);
+ free(keys[i]);
+ }
+ free(keys);
+ }
+ else
+ {
+ args->snp_gap_type = VCF_INDEL;
+ args->snp_gap_str = "indel";
+ }
break;
case 'G':
args->indel_gap = strtol(optarg,&tmp,10);
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'S':
if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
/* vcffilter.c -- Apply fixed-threshold filters.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
char *soft_filter; // drop failed sites or annotate FILTER column?
int annot_mode; // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
int flt_fail, flt_pass; // BCF ids of fail and pass filters
- int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+ int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id;
+ char *snp_gap_str;
int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
rbuf_t rbuf;
bcf1_t **rbuf_lines;
static void init_data(args_t *args)
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
if ( args->snp_gap )
{
- bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+ bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of %s\">", args->snp_gap,args->snp_gap_str);
args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
assert( args->SnpGap_id>=0 );
}
*/
// To avoid additional data structure, we abuse bcf1_t's var and var_type records.
- const int SnpGap_set = VCF_OTHER<<1;
- const int IndelGap_set = VCF_OTHER<<2;
- const int IndelGap_flush = VCF_OTHER<<3;
+ const int SnpGap_set = 1 << (8*sizeof(int)/2); // marker bit at half the bit-width of int (bit 16 when int is 32 bits); stored in d.var_type, presumably clear of the VCF_* type flags - confirm against htslib/vcf.h
+ const int IndelGap_set = 1 << (8*sizeof(int)/2-1); // the bit directly below SnpGap_set
+ const int IndelGap_flush = 1 << (8*sizeof(int)/2-2); // two bits below SnpGap_set
int var_type = 0, i;
if ( line )
// output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
// used. This filter is therefore more strict and may remove some valid
// SNPs.
- int len = 1;
- if ( var_type & VCF_INDEL )
- {
- for (i=1; i<line->n_allele; i++)
- if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
- }
-
// Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
- line->d.var[0].n = len;
+ line->d.var[0].n = line->rlen;
}
int k_flush = 1;
int rec_to = rec->pos + rec->d.var[0].n - 1; // last position affected by the variant
if ( rec_to + args->snp_gap < last_from )
j_flush++;
- else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+ else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
{
// this SNP has not been SnpGap-filtered yet
rec->d.var_type |= SnpGap_set;
bcf_add_filter(args->hdr, rec, args->SnpGap_id);
}
- else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+ else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) )
{
// the line which we are adding is a SNP and needs to be filtered
line->d.var_type |= SnpGap_set;
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -g, --SnpGap <int> filter SNPs within <int> base pairs of an indel\n");
+ fprintf(bcftools_stderr, " -g, --SnpGap <int>[:type] filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
fprintf(bcftools_stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
fprintf(bcftools_stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcffilter(int argc, char *argv[])
char *tmp;
while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
switch (c) {
- case 'g':
+ case 'g':
args->snp_gap = strtol(optarg,&tmp,10);
- if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ if ( *tmp==':' )
+ {
+ // Optional ":type[,type...]" suffix: the variant classes whose proximity filters a SNP.
+ // The raw suffix is kept as given because it is printed in the SnpGap FILTER header description.
+ args->snp_gap_str = tmp+1;
+ // Start from an empty mask so that a repeated -g replaces the earlier selection
+ // instead of extending it, keeping the mask in sync with snp_gap_str.
+ args->snp_gap_type = 0;
+ int i, n = 0;
+ char **keys = hts_readlist(tmp+1,0,&n);
+ // hts_readlist() may return NULL on failure, in which case n is not guaranteed to be set
+ if ( !keys ) error("Could not parse argument: --SnpGap %s\n", optarg);
+ for(i=0; i<n; i++)
+ {
+ // Type names are matched case-insensitively; anything unknown is a fatal error
+ if ( !strcasecmp(keys[i],"indel") ) args->snp_gap_type |= VCF_INDEL;
+ else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP;
+ else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND;
+ else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER;
+ else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP;
+ else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg);
+ free(keys[i]);
+ }
+ free(keys);
+ }
+ else
+ {
+ args->snp_gap_type = VCF_INDEL;
+ args->snp_gap_str = "indel";
+ }
break;
case 'G':
args->indel_gap = strtol(optarg,&tmp,10);
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'S':
if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_os.h>
#include <inttypes.h>
+#include <sys/time.h>
#include "bcftools.h"
-#include "hclust.h"
+#include "extsort.h"
+//#include "hclust.h"
typedef struct
{
- bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
- bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
- int ntmp_arr, npl_arr;
- int32_t *tmp_arr, *pl_arr;
- double *lks, *sites, min_inter_err, max_intra_err;
- int *cnts, *dps, hom_only, cross_check, all_sites;
- char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs, narr, nsmpl;
-}
-args_t;
-
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-char *msprintf(const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
-
-void py_plot(char *script)
-{
- mkdir_p(script);
- int len = strlen(script);
- char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
- int ret = system(cmd);
- if ( ret ) fprintf(stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
- free(cmd);
-}
-
-static void plot_check(args_t *args, char *target_sample, char *query_sample)
-{
- char *fname;
- FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
- fprintf(fp,
- "import matplotlib as mpl\n"
- "mpl.use('Agg')\n"
- "import matplotlib.pyplot as plt\n"
- "import matplotlib.gridspec as gridspec\n"
- "import csv\n"
- "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
- "\n"
- "sample_ids = False\n"
- "\n"
- "dat = []\n"
- "with open('%s.tab', 'r') as f:\n"
- " reader = csv.reader(f, 'tab')\n"
- " for row in reader:\n"
- " if row[0][0]=='#': continue\n"
- " if row[0]!='CN': continue\n"
- " tgt = 0\n"
- " if row[4]=='%s': tgt = 1\n"
- " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
- "\n"
- "dat = sorted(dat)\n"
- "\n"
- "iq = -1; dp = 0\n"
- "for i in range(len(dat)):\n"
- " if iq==-1 and dat[i][3]==1: iq = i\n"
- " dp += dat[i][2]\n"
- "dp /= len(dat)\n"
- "\n"
- "fig,ax1 = plt.subplots(figsize=(8,5))\n"
- "ax2 = ax1.twinx()\n"
- "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
- "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
- "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
- "if iq!=-1:\n"
- " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
- " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
- " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
- "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
- "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
- "min_dp = min([x[2] for x in dat])\n"
- "max_dp = max([x[2] for x in dat])\n"
- "ax2.set_ylim(min_dp-1,max_dp+1)\n"
- "ax1.set_title('Discordance with %s')\n"
- "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
- "ax1.set_xlabel('Sample ID')\n"
- "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
- "if sample_ids:\n"
- " ax1.set_xticks(range(len(dat)))\n"
- " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
- " plt.subplots_adjust(bottom=0.2)\n"
- "ax1.set_ylabel('Discordance',color='g')\n"
- "ax2.set_ylabel('Number of sites',color='k')\n"
- "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
- "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
- "labels = [l.get_label() for l in plots]\n"
- "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
- "plt.savefig('%s.png')\n"
- "plt.close()\n"
- "\n", args->plot, target_sample, target_sample, query_sample, args->plot
- );
- fclose(fp);
- py_plot(fname);
- free(fname);
-}
-
-#if 0
-static void plot_cross_check(args_t *args)
-{
- char *fname;
- FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
- fprintf(fp,
- "import matplotlib as mpl\n"
- "mpl.use('Agg')\n"
- "import matplotlib.pyplot as plt\n"
- "import matplotlib.gridspec as gridspec\n"
- "import csv\n"
- "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
- "avg = []\n"
- "dp = []\n"
- "sm2id = {}\n"
- "dat = None\n"
- "min = None\n"
- "max = None\n"
- "with open('%s.tab', 'r') as f:\n"
- " reader = csv.reader(f, 'tab')\n"
- " i = 0\n"
- " for row in reader:\n"
- " if row[0]=='SM':\n"
- " sm2id[row[4]] = i\n"
- " avg.append([i,float(row[1])])\n"
- " dp.append([i,float(row[2])])\n"
- " i += 1\n"
- " elif row[0]=='CN':\n"
- " val = 0\n"
- " if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
- " if not dat:\n"
- " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
- " min = val\n"
- " max = val\n"
- " id_i = sm2id[row[4]]\n"
- " id_j = sm2id[row[5]]\n"
- " dat[id_i][id_j] = val\n"
- " dat[id_j][id_i] = val\n"
- " if min > val: min = val\n"
- " if max < val: max = val\n"
- "\n"
- "if len(sm2id)<=1: exit(1)\n"
- "if min==max: exit(1)\n"
- "\n"
- "fig = plt.figure(figsize=(6,7))\n"
- "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
- "ax1 = plt.subplot(gs[0])\n"
- "ax2 = plt.subplot(gs[1])\n"
- "\n"
- "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
- "ax3 = ax1.twinx()\n"
- "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
- "for tl in ax3.get_yticklabels():\n"
- " tl.set_color('r')\n"
- " tl.set_fontsize(9)\n"
- "\n"
- "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
- "cb1 = plt.colorbar(im,ax=ax2)\n"
- "cb1.set_label('Pairwise discordance')\n"
- "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
- "\n"
- "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
- "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
- "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
- "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
- "\n"
- "ax1.set_title('Sample Discordance Score')\n"
- "ax2.set_ylabel('Sample ID')\n"
- "ax2.set_xlabel('Sample ID')\n"
- "ax3.set_ylabel('Average Depth',color='r')\n"
- "ax1.set_xlabel('Sample ID')\n"
- "ax1.set_ylabel('Average discordance')\n"
- "\n"
- "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
- "plt.savefig('%s.png')\n"
- "plt.close()\n"
- "\n", args->plot,args->plot
- );
- fclose(fp);
- py_plot(fname);
- free(fname);
-}
-#endif
-
-static void init_data(args_t *args)
-{
- args->sm_hdr = args->files->readers[0].header;
- if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
-
- if ( !args->cross_check )
- {
- args->gt_hdr = args->files->readers[1].header;
- int nsamples = bcf_hdr_nsamples(args->gt_hdr);
- if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
- args->lks = (double*) calloc(nsamples,sizeof(double));
- args->cnts = (int*) calloc(nsamples,sizeof(int));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- args->dps = (int*) calloc(nsamples,sizeof(int));
- }
+ int iqry, igt;
}
+pair_t;
-static void destroy_data(args_t *args)
-{
- free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
-}
-
-static int allele_to_int(bcf1_t *line, char *allele)
+typedef struct
{
- int i;
- for (i=0; i<line->n_allele; i++)
- if ( !strcmp(allele,line->d.allele[i]) ) return i;
- if ( strcmp(line->d.allele[i-1],"X") ) return -1;
- return i-1;
-}
+ bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
+ bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
+ char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
+ int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+ int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+ double *pdiff, *qry_prob, *gt_prob;
+ uint32_t *ndiff,*ncnt,ncmp, npairs;
+ int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
+ uint8_t *qry_dsg, *gt_dsg;
+ pair_t *pairs;
+ double *hwe_prob, dsg2prob[8][3], pl2prob[256];
+ double min_inter_err, max_intra_err;
+ int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
+ FILE *fp;
+ unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
-static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
-{
- int i, j;
- for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
- for (i=0; i<gt_line->n_allele; i++)
- {
- // find which of the sm_alleles (k) corresponds to the gt_allele (i)
- int k = allele_to_int(sm_line, gt_line->d.allele[i]);
- if ( k<0 ) return 0;
- for (j=0; j<=i; j++)
- {
- int l = allele_to_int(sm_line, gt_line->d.allele[j]);
- if ( l<0 ) return 0;
- gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
- }
- }
- //for (i=0; i<n_gt2ipl; i++) printf("%d .. %d\n", i,gt2ipl[i]);
- return 1;
+ // for --distinctive-sites
+ double distinctive_sites;
+ kbitset_t *kbs_diff;
+ size_t diff_sites_size;
+ extsort_t *es;
+ char *es_tmp_prefix, *es_max_mem;
}
+args_t;
static void set_cwd(args_t *args)
{
}
assert(buf);
}
-
static void print_header(args_t *args, FILE *fp)
{
fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
fprintf(fp, "# \t %s\n#\n", args->cwd);
}
-static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+static int cmp_int(const void *_a, const void *_b)
{
- // PLs not present, use GTs instead.
- int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs
- int nsm_gt, i;
- if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
- error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
- nsm_gt /= bcf_hdr_nsamples(hdr);
- int npl = line->n_allele*(line->n_allele+1)/2;
- hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
- for (i=0; i<bcf_hdr_nsamples(hdr); i++)
- {
- int *gt_ptr = args->tmp_arr + i*nsm_gt;
- int j, *pl_ptr = args->pl_arr + i*npl;
- if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
- {
- for (j=0; j<npl; j++) pl_ptr[j] = -1;
- }
- else
- {
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
- int idx = bcf_alleles2gt(a,b);
- pl_ptr[idx] = 0;
- }
- }
- return npl;
+ int a = *((int*)_a);
+ int b = *((int*)_b);
+ if ( a < b ) return -1;
+ if ( a > b ) return 1;
+ return 0;
+}
+static int cmp_pair(const void *_a, const void *_b) // qsort() comparator for pair_t records: ascending by iqry, ties broken by ascending igt
+{
+ pair_t *a = (pair_t*)_a;
+ pair_t *b = (pair_t*)_b;
+ if ( a->iqry < b->iqry ) return -1; // primary key: sample index in the query header
+ if ( a->iqry > b->iqry ) return 1;
+ if ( a->igt < b->igt ) return -1; // secondary key: sample index in the genotype header
+ if ( a->igt > b->igt ) return 1;
+ return 0; // both indices equal
}
-static int cmp_doubleptr(const void *_a, const void *_b)
+typedef struct
+{
+ uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes
+ unsigned long kbs_dat[1]; // first word of a variable-length copy of the kbs_diff bitset; records are allocated with args->diff_sites_size bytes
+}
+diff_sites_t;
+#if DBG // debugging aid, compiled only when DBG evaluates to non-zero
+static void diff_sites_debug_print(args_t *args, diff_sites_t *ds) // dump one record to stderr as "chr:pos<TAB>ndiff<TAB>" followed by one 0/1 digit per pair
+{
+ int i;
+ memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); // side effect: overwrites the shared kbs_diff bitset with this record's bits
+ fprintf(stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff); // NOTE(review): pos+1 suggests pos is stored 0-based - confirm at the caller
+ for (i=0; i<args->npairs; i++) fprintf(stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0);
+ fprintf(stderr,"\n");
+}
+#endif
+static int diff_sites_cmp(const void *aptr, const void *bptr) // comparator registered with extsort via FUNC_CMP: larger ndiff first, ties by ascending rand
+{
+ diff_sites_t *a = *((diff_sites_t**)aptr); // arguments point to record pointers, hence the extra dereference
+ diff_sites_t *b = *((diff_sites_t**)bptr);
+ if ( a->ndiff < b->ndiff ) return 1; // descending order
+ if ( a->ndiff > b->ndiff ) return -1;
+ if ( a->rand < b->rand ) return -1; // equal ndiff: ascending by the random tag
+ if ( a->rand > b->rand ) return 1;
+ return 0;
+}
+static void diff_sites_init(args_t *args) // validate --distinctive-sites, allocate the per-site pair bitset and configure the external sorter
+{
+ int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites; // values <=1 are a fraction of npairs, larger values an absolute count (truncated to int)
+ if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
+ if ( nsites > args->npairs )
+ {
+ fprintf(stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
+ nsites = args->npairs;
+ args->distinctive_sites = args->npairs + 1; // NOTE(review): presumably a sentinel meaning "print everything"; the consumer is outside this chunk - confirm
+ }
+ else
+ args->distinctive_sites = nsites; // from here on the option holds the resolved integer count
+ args->kbs_diff = kbs_init(args->npairs); // one bit per sample pair
+ size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS; // number of unsigned long words needed for npairs bits (rounded up)
+ assert( n==args->kbs_diff->n ); // must agree with kbitset's own word count because records memcpy exactly kbs_diff->n words
+ args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long); // n-1: the struct already contains the first bitset word (kbs_dat[1])
+ args->es = extsort_alloc();
+ extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size); // fixed record size
+ extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
+ extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
+ extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp); // sort order: see diff_sites_cmp
+ extsort_init(args->es);
+}
+static void diff_sites_destroy(args_t *args)
{
- double *a = *((double**)_a);
- double *b = *((double**)_b);
- if ( *a < *b ) return -1;
- else if ( *a == *b ) return 0;
+ kbs_destroy(args->kbs_diff);
+ extsort_destroy(args->es);
+}
+static inline void diff_sites_reset(args_t *args) // reset the shared kbs_diff bitset via kbs_clear()
+{
+ kbs_clear(args->kbs_diff);
+}
+static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) // snapshot the current kbs_diff bitset plus site info into a new record and hand it to the external sorter
+{
+ diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); // NOTE(review): the malloc result is not checked before use
+ memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
+ dat->ndiff = ndiff;
+ dat->rid = rid;
+ dat->pos = pos;
+ dat->rand = hts_lrand48(); // random tie-breaker used by diff_sites_cmp
+ memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long)); // copy all bitset words into the variable-length tail
+ extsort_push(args->es,dat); // dat is not freed here; NOTE(review): presumably extsort takes ownership - confirm in extsort.c
+}
+static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos) // fetch the next record from the sorter; returns 1 and fills the outputs, or 0 when extsort_shift() yields NULL
+{
+ diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
+ if ( !dat ) return 0; // nothing returned by the sorter; outputs are left untouched
+ *ndiff = dat->ndiff;
+ *rid = dat->rid;
+ *pos = dat->pos;
+ memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); // restore the record's bits into the shared kbs_diff bitset
return 1;
}
-static void check_gt(args_t *args)
+static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
{
- int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
- int fake_pls = args->no_PLs;
+ int i;
+ if ( !strcmp(list,"-") )
+ {
+ *nsmpl = bcf_hdr_nsamples(hdr);
+ *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+ for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
+ return;
+ }
- // Initialize things: check which tags are defined in the header, sample names etc.
- if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ char **tmp = hts_readlist(list, list_is_file, nsmpl);
+ if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
+ *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+ for (i=0; i<*nsmpl; i++)
{
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
- error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
- fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
+ if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
+ (*smpl)[i] = idx;
+ free(tmp[i]);
}
+ free(tmp);
+ qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
+ // check for duplicates
+ for (i=1; i<*nsmpl; i++)
+ if ( (*smpl)[i-1]==(*smpl)[i] )
+ error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
+}
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
- print_header(args, fp);
+static void init_data(args_t *args)
+{
+ hts_srand48(0);
- int tgt_isample = -1, query_isample = 0;
- if ( args->target_sample )
+ args->files = bcf_sr_init();
+ if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+ if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+
+ if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
+ if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
+ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
+
+ args->qry_hdr = bcf_sr_get_header(args->files,0);
+ if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
+ if ( args->gt_fname )
{
- tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
- if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+ args->gt_hdr = bcf_sr_get_header(args->files,1);
+ if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
}
- if ( args->all_sites )
+
+ // Determine whether GT or PL will be used
+ if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
{
- if ( tgt_isample==-1 )
- {
- fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
- tgt_isample = 0;
- }
+ if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
+ args->qry_use_GT = 0;
+ else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
+ args->qry_use_GT = 1;
+ else
+ error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
}
- if ( args->query_sample )
+ else if ( args->qry_use_GT==1 )
{
- query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
- if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+ if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
+ error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
}
- if ( args->all_sites )
- fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
- args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+ else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
+ error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
- // Main loop
- float prev_lk = 0;
- while ( (ret=bcf_sr_next_line(args->files)) )
+ if ( args->gt_hdr )
{
- if ( ret!=2 ) continue;
- bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file
- bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file
- bcf_unpack(sm_line, BCF_UN_FMT);
- bcf_unpack(gt_line, BCF_UN_FMT);
-
- // Init mapping from target genotype index to the sample's PL fields
- int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
- if ( n_gt2ipl > m_gt2ipl )
+ if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
+ {
+ if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
+ args->gt_use_GT = 1;
+ else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
+ args->gt_use_GT = 0;
+ else
+ error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
+ }
+ else if ( args->gt_use_GT==1 )
{
- m_gt2ipl = n_gt2ipl;
- gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+ if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
+ error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
}
- if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
-
- // Target genotypes
- int ngt, npl;
- if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
- error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
- ngt /= bcf_hdr_nsamples(args->gt_hdr);
- if ( ngt!=2 ) continue; // checking only diploid genotypes
+ else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
+ error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
+ }
+ else
+ args->gt_use_GT = args->qry_use_GT;
- // Sample PLs
- if ( !fake_pls )
+ // Prepare samples
+ int i,j;
+ args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
+ if ( args->qry_samples )
+ {
+ init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
+ }
+ if ( args->gt_samples )
+ {
+ init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
+ args->gt_hdr ? args->gt_hdr : args->qry_hdr,
+ args->gt_fname ? args->gt_fname : args->qry_fname);
+ }
+ else if ( args->pair_samples )
+ {
+ int npairs;
+ char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
+ if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
+ if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
+ args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
+ args->pairs = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
+ if ( !args->pair_samples_is_file )
{
- if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+ for (i=0; i<args->npairs; i++)
{
- if ( sm_line->n_allele==1 )
- {
- // PL values may not be present when ALT=. (mpileup/bcftools output), in that case
- // switch automatically to GT at these sites
- npl = fake_PLs(args, args->sm_hdr, sm_line);
- }
- else
- error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1);
+ args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
+ args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
+ if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
+ if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
+ free(tmp[2*i]);
+ free(tmp[2*i+1]);
}
- else
- npl /= bcf_hdr_nsamples(args->sm_hdr);
}
else
- npl = fake_PLs(args, args->sm_hdr, sm_line);
+ {
+ for (i=0; i<args->npairs; i++)
+ {
+ char *ptr = tmp[i];
+ while ( *ptr && !isspace(*ptr) ) ptr++;
+ if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
+ *ptr = 0;
+ args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
+ if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
+ ptr++;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
+ if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
+ free(tmp[i]);
+ }
+ }
+ free(tmp);
+ qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
+ }
+ else if ( args->gt_hdr )
+ args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
+ if ( !args->ngt_smpl )
+ {
+ args->ngt_smpl = args->nqry_smpl;
+ args->gt_smpl = args->qry_smpl;
+ args->cross_check = 1;
+ }
+
+ // The data arrays
+ if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
+ if ( !args->pair_samples )
+ {
+ args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
+ args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
+ }
+ if ( args->use_PLs )
+ {
+ args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same
+ args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
+ args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
+
+ // dsg2prob: the first index is a bitmask of 8 possible dsg combinations (only 1<<0, 1<<1 and 1<<2 are set; accessing
+ // anything else indicates an error, this is just to reuse gt_to_dsg()); the second index selects the corresponding
+ // -log probabilities of the 0/0, 0/1, and 1/1 genotypes
+ for (i=0; i<8; i++)
+ for (j=0; j<3; j++)
+ args->dsg2prob[i][j] = HUGE_VAL;
+ args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
- // Calculate likelihoods for all samples, assuming diploid genotypes
+ // lookup table to avoid exponentiation
+ for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
+ }
+ else
+ args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff)); // number of differing genotypes for each pair of samples
+ args->ncnt = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt)); // number of comparisons performed (non-missing data)
+ if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
+ if ( args->calc_hwe_prob )
+ {
+ // prob of the observed sequence of matches given site AFs and HWE
+ args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
+ if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+ }
+
+ if ( args->distinctive_sites ) diff_sites_init(args);
+
+ args->fp = stdout;
+ print_header(args, args->fp);
+}
+
+static void destroy_data(args_t *args)
+{
+ if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
+ free(args->qry_dsg);
+ if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
+ free(args->qry_prob);
+ free(args->es_max_mem);
+ fclose(args->fp);
+ if ( args->distinctive_sites ) diff_sites_destroy(args);
+ free(args->hwe_prob);
+ free(args->cwd);
+ free(args->qry_arr);
+ if ( args->gt_hdr ) free(args->gt_arr);
+ free(args->pdiff);
+ free(args->ndiff);
+ free(args->ncnt);
+ free(args->qry_smpl);
+ if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
+ free(args->pairs);
+ bcf_sr_destroy(args->files);
+}
- // For faster access to genotype likelihoods (PLs) of the query sample
- int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
- double sum_pl = 0; // for converting PLs to probs
- for (max_ipl=0; max_ipl<npl; max_ipl++)
+static inline uint8_t gt_to_dsg(int32_t *ptr) // encode the two GT values at ptr[0..1] as a one-hot dosage mask: 1<<0, 1<<1 or 1<<2; 0 means unusable
+{
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0; // missing allele, or second slot is vector_end (no second allele)
+ uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0); // count of alleles with a non-zero allele index: 0, 1 or 2
+ return 1<<dsg;
+}
+static inline uint8_t pl_to_dsg(int32_t *ptr) // bitmask of which of the three PL values at ptr[0..2] equal the minimum (bit 0,1,2 respectively); 0 means unusable
+{
+ if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0; // any missing value disqualifies the sample
+ if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0; // fewer than three values present
+ int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]); // minimum of the three values
+ uint8_t dsg = 0;
+ if ( ptr[0]==min_pl ) dsg |= 1; // ties set more than one bit
+ if ( ptr[1]==min_pl ) dsg |= 2;
+ if ( ptr[2]==min_pl ) dsg |= 4;
+ return dsg;
+}
+static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob) // fill prob[0..2] from the args->dsg2prob row selected by gt_to_dsg(ptr); returns that mask
+{
+ uint8_t dsg = gt_to_dsg(ptr);
+ if ( dsg ) // dsg==0: prob[] is left untouched
+ {
+ prob[0] = args->dsg2prob[dsg][0];
+ prob[1] = args->dsg2prob[dsg][1];
+ prob[2] = args->dsg2prob[dsg][2];
+ }
+ return dsg;
+}
+static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob) // convert three PL values into normalized -log probabilities in prob[0..2]; returns the pl_to_dsg(ptr) mask
+{
+ uint8_t dsg = pl_to_dsg(ptr);
+ if ( dsg ) // dsg==0: prob[] is left untouched
+ {
+ prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255]; // table lookup; values outside 0..254 use the last entry
+ prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
+ prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
+ double sum = prob[0] + prob[1] + prob[2];
+ prob[0] /= sum; // normalize so the three values sum to 1
+ prob[1] /= sum;
+ prob[2] /= sum;
+ prob[0] = -log(prob[0]); // store as negative natural log
+ prob[1] = -log(prob[1]);
+ prob[2] = -log(prob[2]);
+ }
+ return dsg;
+}
+static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)
+{
+ static int warn_dip_GT = 1;
+ static int warn_dip_PL = 1;
+ int i;
+ for (i=0; i<2; i++)
+ {
+ if ( *use_GT )
{
- if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
- if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
- sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+ int ret = bcf_get_genotypes(hdr,rec,arr,narr);
+ if ( ret < 0 )
+ {
+ if ( !i ) { *use_GT = 0; continue; }
+ args->nskip_no_data++;
+ return -1;
+ }
+ if ( ret != 2*bcf_hdr_nsamples(hdr) )
+ {
+ if ( warn_dip_GT )
+ {
+ fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+ warn_dip_GT = 0;
+ }
+ args->nskip_dip_GT++;
+ return -1;
+ }
+ *narr1 = 2;
+ return 0;
}
- if ( sum_pl==0 ) continue; // no PLs present
- if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
- // The main stats: concordance of the query sample with the target -g samples
- for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+ int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
+ if ( ret < 0 )
{
- int *gt_ptr = gt_arr + i*ngt;
- if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
- if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- if ( args->hom_only && a!=b ) continue; // heterozygous genotype
- int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
- int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file
- if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing
- args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
- args->sites[i]++;
+ if ( !i ) { *use_GT = 1; continue; }
+ args->nskip_no_data++;
+ return -1;
}
- if ( args->all_sites )
+ if ( ret != 3*bcf_hdr_nsamples(hdr) )
{
- // Print LKs at all sites for debugging
- int *gt_ptr = gt_arr + tgt_isample*ngt;
- if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- if ( args->hom_only && a!=b ) continue; // heterozygous genotype
- fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
- for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
- fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
- fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
- prev_lk = args->lks[query_isample];
-
- int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
- for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
- for (igt=0; igt<npl; igt++)
- if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
- else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
- else fprintf(fp, "\t%d", pl_ptr[igt]);
- fprintf(fp, "\n");
+ if ( warn_dip_PL )
+ {
+ fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+ warn_dip_PL = 0;
+ }
+ args->nskip_dip_PL++;
+ return -1;
}
+ *narr1 = 3;
+ return 0;
}
- free(gt2ipl);
- free(gt_arr);
- free(args->pl_arr);
- free(args->tmp_arr);
+ return -1; // should never reach
+}
+// Process the current site: load FORMAT data for the query file (and for the -g file when
+// args->gt_hdr is set) via set_data() and accumulate per-pair statistics in args->ndiff or
+// args->pdiff, args->hwe_prob and args->ncnt. Returns early, without incrementing args->ncmp,
+// when set_data() returns a negative value.
+static void process_line(args_t *args)
+{
+ int i,j,k, nqry1, ngt1, ret;
+
+ bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0); // the query file
+ // local copies: set_data() receives pointers to these, so a per-site change does not alter args
+ int qry_use_GT = args->qry_use_GT;
+ int gt_use_GT = args->gt_use_GT;
+
+ ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
+ if ( ret<0 ) return;
- // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
- // plot as discordance per site, the latter must be scaled to the same range
- int nsamples = bcf_hdr_nsamples(args->gt_hdr);
- double extreme_lk = 0, extreme_lk_per_site = 0;
- for (i=0; i<nsamples; i++)
+ if ( args->gt_hdr )
{
- if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
- if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+ gt_rec = bcf_sr_get_line(args->files,1);
+ // the last argument must be the address of the local gt_use_GT flag, same as for the query above
+ ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
+ if ( ret<0 ) return;
+ }
+ else
+ {
+ // no gt_hdr: the genotype side aliases the query data
+ ngt1 = nqry1;
+ args->gt_arr = args->qry_arr;
}
- // Sorted output
- double **p = (double**) malloc(sizeof(double*)*nsamples);
- for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+ args->ncmp++;
- fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
- for (i=0; i<nsamples; i++)
+ // hwe_dsg is filled in and read only when calc_hwe_prob is set
+ double af,hwe_dsg[8];
+ if ( args->calc_hwe_prob )
{
- int idx = p[i] - args->lks;
- double per_site = 0;
- if ( args->sites[idx] )
+ int ac[2];
+ if ( args->gt_hdr )
{
- if ( args->sites[idx] && extreme_lk_per_site )
+ if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+ }
+ else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+
+ // hwe indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty
+ // for in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of e.g.
+ // hwe[1,2,4] .. dsg=0,1,2
+ // hwe[3] .. dsg=0 or 1
+ // hwe[6] .. dsg=1 or 2
+
+ double hwe[3];
+ const double min_af = 1e-5; // cap the AF in case we get unrealistic values
+ af = (double)ac[1]/(ac[0]+ac[1]);
+ hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
+ hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
+ hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+ hwe_dsg[0] = 0;
+ // for every non-empty bitmask take the smallest hwe[] value among the bits that are set
+ for (i=1; i<8; i++)
+ {
+ hwe_dsg[i] = HUGE_VAL;
+ for (k=0; k<3; k++)
{
- per_site = args->lks[idx]/args->sites[idx];
- per_site *= extreme_lk / extreme_lk_per_site;
+ if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];
}
- else
- per_site = 0;
}
- fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
}
- if ( args->plot )
+ // The sample pairs were given explicitly via -p/-P options
+ if ( args->pairs )
{
- if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
- plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
- }
-}
+ if ( !args->use_PLs )
+ {
+ // ndiff counts the pairs found discordant at this site; it is recorded only when kbs_diff is in use
+ int ndiff = 0;
+ if ( args->kbs_diff ) diff_sites_reset(args);
-// static inline int is_hom_most_likely(int nals, int *pls)
-// {
-// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
-// for (ia=1; ia<nals; ia++)
-// {
-// for (ib=0; ib<ia; ib++)
-// {
-// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
-// idx++;
-// }
-// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
-// idx++;
-// }
-// return min_is_hom;
-// }
-
-int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
- int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+ for (i=0; i<args->npairs; i++)
+ {
+ int32_t *ptr;
+ uint8_t qry_dsg, gt_dsg;
- if ( ngt<=0 ) return 1; // GT not present
- if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
- ngt /= args->nsmpl;
-
- int i,j, idx = 0;
- for (i=1; i<args->nsmpl; i++)
- {
- int32_t *a = args->tmp_arr + i*ngt;
- if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
- int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+ ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+ gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( !gt_dsg ) continue; // missing value
+ if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
+
+ ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+ qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( !qry_dsg ) continue; // missing value
+
+ // the two dosage bitmasks are compatible when they share at least one bit
+ int match = qry_dsg & gt_dsg;
+ if ( !match )
+ {
+ args->ndiff[i]++;
+ if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
+ }
+ else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+ args->ncnt[i]++;
+ }
- for (j=0; j<i; j++)
+ if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
+ }
+ else // use_PLs set
{
- int32_t *b = args->tmp_arr + j*ngt;
- if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
- int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+ for (i=0; i<args->npairs; i++)
+ {
+ int32_t *ptr;
+ double qry_prob[3], gt_prob[3];
+ uint8_t qry_dsg, gt_dsg;
+
+ ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+ gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
+ if ( !gt_dsg ) continue; // missing value
+ if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
+
+ ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+ qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
+ if ( !qry_dsg ) continue; // missing value
- ntot[idx]++;
- if ( agt!=bgt ) ndif[idx]++;
- idx++;
+ // the site score is the smallest of the three element-wise sums qry_prob[k]+gt_prob[k]
+ double min = qry_prob[0] + gt_prob[0];
+ qry_prob[1] += gt_prob[1];
+ if ( min > qry_prob[1] ) min = qry_prob[1];
+ qry_prob[2] += gt_prob[2];
+ if ( min > qry_prob[2] ) min = qry_prob[2];
+ args->pdiff[i] += min;
+
+ if ( args->calc_hwe_prob )
+ {
+ int match = qry_dsg & gt_dsg;
+ args->hwe_prob[i] += hwe_dsg[match];
+ }
+ args->ncnt[i]++;
+ }
}
+ return;
}
- return 0;
-}
-int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
- int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
- if ( npl<=0 ) return 1; // PL not present
- npl /= args->nsmpl;
-
- int i,j,k, idx = 0;
- for (i=1; i<args->nsmpl; i++)
+ // No explicit pairs: idx walks the flattened per-pair arrays in the order of the nested loops below,
+ // nqry_smpl x ngt_smpl entries, or only the j<i entries in cross-check mode
+ int idx=0;
+ if ( !args->use_PLs )
{
- int32_t *a = args->tmp_arr + i*npl;
- int imin = -1;
- for (k=0; k<npl; k++)
+ for (i=0; i<args->nqry_smpl; i++)
{
- if ( a[k]==bcf_int32_vector_end ) break;
- if ( a[k]==bcf_int32_missing ) continue;
- if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int32_t *ptr = args->qry_arr + nqry1*iqry;
+ args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
}
- if ( imin<0 ) { idx+=i; continue; }
-
- for (j=0; j<i; j++)
+ if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
{
- int32_t *b = args->tmp_arr + j*npl;
- int jmin = -1;
- for (k=0; k<npl; k++)
+ for (i=0; i<args->ngt_smpl; i++)
{
- if ( b[k]==bcf_int32_vector_end ) break;
- if ( b[k]==bcf_int32_missing ) continue;
- if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+ int32_t *ptr = args->gt_arr + ngt1*igt;
+ args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
+ }
+ }
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
+ if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
+ for (j=0; j<ngt; j++)
+ {
+ if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
+ int match = args->qry_dsg[i] & args->gt_dsg[j];
+ if ( !match ) args->ndiff[idx]++;
+ else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+ args->ncnt[idx]++;
+ idx++;
}
- if ( jmin<0 ) { idx++; continue; }
-
- ntot[idx]++;
- if ( imin!=jmin ) ndif[idx]++;
- idx++;
}
}
- return 0;
-}
+ else // use_PLs set
+ {
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int32_t *ptr = args->qry_arr + nqry1*iqry;
+ args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);
+ }
+ if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
+ {
+ for (i=0; i<args->ngt_smpl; i++)
+ {
+ int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+ int32_t *ptr = args->gt_arr + ngt1*igt;
+ args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
+ if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
+ }
+ }
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
+ if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
+ for (j=0; j<ngt; j++)
+ {
+ if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
-static void cross_check_gts(args_t *args)
-{
- // Initialize things: check which tags are defined in the header, sample names etc.
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
- {
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
- error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs ) {
- fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- args->no_PLs = 99;
+ double min = args->qry_prob[i*3] + args->gt_prob[j*3];
+ if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
+ if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
+ args->pdiff[idx] += min;
+
+ if ( args->calc_hwe_prob )
+ {
+ int match = args->qry_dsg[i] & args->gt_dsg[j];
+ args->hwe_prob[idx] += hwe_dsg[match];
+ }
+ args->ncnt[idx]++;
+ idx++;
+ }
}
}
+}
- args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
- args->narr = (args->nsmpl-1)*args->nsmpl/2;
- uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
- uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
+// Sortable record used by report() to rank the candidate matches of one query sample.
+typedef struct
+{
+ // ism: index of the compared sample within the sample list being ranked
+ // idx: index of the pair in the per-pair arrays (ndiff, pdiff, hwe_prob, ncnt)
+ int ism, idx;
+ // sort key; cmp_idbl() orders records by ascending val
+ double val;
+}
+idbl_t;
+// qsort() comparator: orders idbl_t records by ascending val; returns -1, 0 or 1.
+static int cmp_idbl(const void *_a, const void *_b)
+{
+ const double lhs = ((const idbl_t*)_a)->val;
+ const double rhs = ((const idbl_t*)_b)->val;
+ // each comparison evaluates to 0 or 1, so the difference is the sign of (lhs - rhs)
+ return (lhs > rhs) - (lhs < rhs);
+}
+// Print the DS section: after sorting the recorded sites with extsort_sort(), walk them with
+// diff_sites_shift() and group them into blocks. A site is printed only if it sets at least one pair
+// not yet seen in the current block; a block is closed once the cumulative number of newly
+// distinguished pairs reaches ndiff_min, the smaller of args->distinctive_sites and args->npairs.
+static void report_distinctive_sites(args_t *args)
+{
+ extsort_sort(args->es);
+
+ fprintf(args->fp,"# DS, distinctive sites:\n");
+ fprintf(args->fp,"# - chromosome\n");
+ fprintf(args->fp,"# - position\n");
+ fprintf(args->fp,"# - cumulative number of pairs distinguished by this block\n");
+ fprintf(args->fp,"# - block id\n");
+ fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
- while ( bcf_sr_next_line(args->files) )
+ // kbs_blk: pairs already distinguished within the current block
+ kbitset_t *kbs_blk = kbs_init(args->npairs);
+ kbitset_iter_t itr;
+ int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
+ int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;
+ // NOTE(review): diff_sites_shift() presumably refills args->kbs_diff for each site - confirm
+ while ( diff_sites_shift(args,&ndiff,&rid,&pos) )
{
- bcf1_t *line = bcf_sr_get_line(args->files,0);
-
- // use PLs unless no_PLs is set and GT exists
- if ( args->no_PLs )
+ // ndiff_dbg counts all bits set at this site and is checked against the stored ndiff below
+ int ndiff_new = 0, ndiff_dbg = 0;
+ kbs_start(&itr);
+ while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
{
- if ( process_GT(args,line,ntot,ndif)==0 ) continue;
+ ndiff_dbg++;
+ if ( kbs_exists(kbs_blk,i) ) continue; // already set
+ kbs_insert(kbs_blk,i);
+ ndiff_new++;
}
- process_PL(args,line,ntot,ndif);
+ if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
+ if ( !ndiff_new ) continue; // no new pair distinguished by this site
+ ndiff_tot += ndiff_new;
+ fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+ if ( ndiff_tot < ndiff_min ) continue; // fewer than the requested number of pairs can be distinguished at this point
+ // the block is complete: start a new one with an empty set of distinguished pairs
+ iblock++;
+ ndiff_tot = 0;
+ kbs_clear(kbs_blk);
}
-
- FILE *fp = stdout;
- print_header(args, fp);
+ kbs_destroy(kbs_blk);
+}
+// Write the summary to args->fp: the INFO counters of compared and skipped sites, the DC header and
+// one DC line per reported sample pair. Four output modes are handled below: explicitly given pairs,
+// all pairs (no trimming), the top args->ntop matches per query sample against the genotyped
+// samples, and the top args->ntop matches per sample in cross-check mode. In every mode the
+// discordance column is the integer args->ndiff[] when args->ndiff is set, otherwise args->pdiff[].
+static void report(args_t *args)
+{
+ fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
+ fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+ fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+ fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+ fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+ fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+ fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+ fprintf(args->fp,"# DC, discordance:\n");
+ fprintf(args->fp,"# - query sample\n");
+ fprintf(args->fp,"# - genotyped sample\n");
+ fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n");
+ fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+ fprintf(args->fp,"# - number of sites compared (bigger is better)\n");
+ fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
- float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
+ // trim stays non-zero only when ntop is set and is smaller than the number of candidate samples
+ int trim = args->ntop;
+ if ( !args->pairs )
+ {
+ if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;
+ if ( args->ngt_smpl && args->ngt_smpl <= args->ntop ) trim = 0;
+ }
- // Output pairwise distances
- fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
- int i,j, idx = 0;
- for (i=0; i<args->nsmpl; i++)
+ // Mode 1: explicitly given pairs, one line per pair in the given order
+ if ( args->pairs )
{
- for (j=0; j<i; j++)
+ int i;
+ for (i=0; i<args->npairs; i++)
{
- float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
- fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
- PDIST(tmp,i,j) = err;
- idx++;
+ int iqry = args->pairs[i].iqry;
+ int igt = args->pairs[i].igt;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[i],
+ args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+ args->ncnt[i]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[i],
+ args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+ args->ncnt[i]);
+ }
}
}
-
- // Cluster samples
- int nlist;
- float clust_max_err = args->max_intra_err;
- hclust_t *clust = hclust_init(args->nsmpl,tmp);
- cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
- fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
- for (i=0; i<nlist; i++)
- {
- fprintf(fp,"CLUSTER\t%f", list[i].dist);
- for (j=0; j<list[i].nmemb; j++)
- fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
- fprintf(fp,"\n");
- }
- hclust_destroy_list(list,nlist);
- // Debugging output: the cluster graph and data used for deciding
- char **dbg = hclust_explain(clust,&nlist);
- for (i=0; i<nlist; i++)
- fprintf(fp,"DBG\t%s\n", dbg[i]);
- fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
- fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
- fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
- hclust_destroy(clust);
- free(tmp);
-
-
- // Deprecated output for temporary backward compatibility
- fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
- fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
- idx = 0;
- for (i=0; i<args->nsmpl; i++)
+ // Mode 2: no trimming, print every pair; idx advances in the same nested-loop order as the counters
+ else if ( !trim )
{
- for (j=0; j<i; j++)
+ int i,j,idx=0;
+ for (i=0; i<args->nqry_smpl; i++)
{
- fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
- idx++;
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int ngt = args->cross_check ? i : args->ngt_smpl;
+ for (j=0; j<ngt; j++)
+ {
+ int igt = args->gt_smpl ? args->gt_smpl[j] : j;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ idx++;
+ }
}
}
-
- free(ndif);
- free(ntot);
- free(args->tmp_arr);
+ // Mode 3: top ntop genotyped samples for each query sample. The sort key is -hwe_prob with
+ // sort_by_hwe, otherwise the per-site average of ndiff or pdiff (0 when no sites were counted)
+ else if ( !args->cross_check )
+ {
+ idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
+ int i,j;
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int idx = i*args->ngt_smpl;
+ for (j=0; j<args->ngt_smpl; j++)
+ {
+ if ( args->sort_by_hwe )
+ arr[j].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[j].ism = j;
+ arr[j].idx = idx;
+ idx++;
+ }
+ qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ for (j=0; j<args->ntop; j++)
+ {
+ int idx = arr[j].idx;
+ int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ }
+ }
+ free(arr);
+ }
+ // Mode 4: top ntop matches in cross-check mode. For sample i the candidates are collected from the
+ // sub-diagonal entries j<i (first loop) and from entries indexed j*(j+1)/2+i (second loop)
+ else
+ {
+ int narr = args->nqry_smpl-1;
+ idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
+ int i,j,k,idx;
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ k = 0, idx = i*(i-1)/2;
+ for (j=0; j<i; j++)
+ {
+ if ( args->sort_by_hwe )
+ arr[k].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[k].ism = j;
+ arr[k].idx = idx;
+ idx++;
+ k++;
+ }
+ for (; j<narr; j++)
+ {
+ idx = j*(j+1)/2 + i;
+ if ( args->sort_by_hwe )
+ arr[k].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[k].ism = j + 1;
+ arr[k].idx = idx;
+ k++;
+ }
+ qsort(arr, narr, sizeof(*arr), cmp_idbl);
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ for (j=0; j<args->ntop; j++)
+ {
+ // only candidates with a smaller sample index than i are printed
+ if ( i <= arr[j].ism ) continue;
+ int idx = arr[j].idx;
+ int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ }
+ }
+ free(arr);
+ }
}
-static char *init_prefix(char *prefix)
+// Decide whether the current site can be compared. Returns 1 when it can and 0 when it must be
+// skipped. Each skip reason has its own counter in args (nskip_no_match, nskip_not_ba, nskip_mono),
+// and the explanatory message goes to stderr only the first time that reason occurs.
+// nmatch is compared against 2, i.e. both readers must have a record when args->gt_hdr is set;
+// main() passes the return value of bcf_sr_next_line().
+static int is_input_okay(args_t *args, int nmatch)
{
- int len = strlen(prefix);
- if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
- return msprintf("%sgtcheck", prefix);
- return strdup(prefix);
+ int i;
+ const char *msg;
+ bcf_hdr_t *hdr;
+ bcf1_t *rec;
+ if ( args->gt_hdr && nmatch!=2 )
+ {
+ // post-increment: the message below is printed only while the counter was still zero
+ if ( args->nskip_no_match++ ) return 0;
+ // find the reader that does have a record at this position, for the message
+ for (i=0; i<2; i++)
+ {
+ rec = bcf_sr_get_line(args->files,i);
+ if ( rec ) break;
+ }
+ hdr = bcf_sr_get_header(args->files,i);
+ fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
+ bcf_seqname(hdr,rec),rec->pos+1);
+ return 0;
+ }
+ // check the query record and, when args->gt_hdr is set, also the record of the second reader
+ for (i=0; i<2; i++)
+ {
+ hdr = bcf_sr_get_header(args->files,i);
+ rec = bcf_sr_get_line(args->files,i);
+ if ( rec->n_allele>2 )
+ {
+ if ( args->nskip_not_ba++ ) return 0;
+ msg = "not a biallelic site, run `bcftools norm -m -` first";
+ goto not_okay;
+ }
+ if ( bcf_get_variant_types(rec)==VCF_REF )
+ {
+ if ( args->nskip_mono++ ) return 0;
+ msg = "monoallelic site";
+ goto not_okay;
+ }
+ if ( !args->gt_hdr ) break;
+ }
+ return 1;
+
+not_okay:
+ fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
+ bcf_seqname(hdr,rec),rec->pos+1,msg);
+ return 0;
}
static void usage(void)
fprintf(stderr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -a, --all-sites output comparison for all sites\n");
- fprintf(stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
- fprintf(stderr, " -g, --genotypes <file> genotypes to compare against\n");
- fprintf(stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
- fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
- fprintf(stderr, " -p, --plot <prefix> plot\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
- fprintf(stderr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ //fprintf(stderr, " -a, --all-sites Output comparison for all sites\n");
+ //fprintf(stderr, " -c, --cluster MIN,MAX Min inter- and max intra-sample error [0.23,-0.3]\n");
+ fprintf(stderr, " --distinctive-sites Find sites that can distinguish between at least NUM sample pairs.\n");
+ fprintf(stderr, " NUM[,MEM[,TMP]] If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
+ fprintf(stderr, " The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
+#ifdef _WIN32
+ fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
+#else
+ fprintf(stderr, " and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
+#endif
+ fprintf(stderr, " --dry-run Stop after first record to estimate required time\n");
+ fprintf(stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+ fprintf(stderr, " -g, --genotypes FILE Genotypes to compare against\n");
+ fprintf(stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+ fprintf(stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
+ fprintf(stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n");
+ fprintf(stderr, " --no-HWE-prob Disable calculation of HWE probability\n");
+ fprintf(stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
+ fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
+ fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+ fprintf(stderr, "Examples:\n");
+ fprintf(stderr, " # Check discordance of all samples from B against all sample in A\n");
+ fprintf(stderr, " bcftools gtcheck -g A.bcf B.bcf\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " # Limit comparisons to the given list of samples\n");
+ fprintf(stderr, " bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " # Compare only two pairs a1,b1 and a1,b2\n");
+ fprintf(stderr, " bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
fprintf(stderr, "\n");
exit(1);
}
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
- args->files = bcf_sr_init();
args->argc = argc; args->argv = argv; set_cwd(args);
- char *regions = NULL, *targets = NULL;
- int regions_is_file = 0, targets_is_file = 0;
+ args->qry_use_GT = -1;
+ args->gt_use_GT = -1;
+ args->calc_hwe_prob = 1;
+ args->use_PLs = 40;
+
+ // external sort for --distinctive-sites
+#ifdef _WIN32
+ args->es_tmp_prefix = NULL;
+#else
+ args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
+#endif
+ args->es_max_mem = strdup("500M");
// In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
// - min_inter: pairs with smaller err value will be considered identical
static struct option loptions[] =
{
+ {"error-probability",1,0,'e'},
+ {"use",1,0,'u'},
{"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"help",0,0,'h'},
{"genotypes",1,0,'g'},
{"plot",1,0,'p'},
- {"target-sample",1,0,'S'},
- {"query-sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"n-matches",1,0,2},
+ {"no-HWE-prob",0,0,3},
+ {"target-sample",1,0,4},
+ {"dry-run",0,0,5},
+ {"distinctive-sites",1,0,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"pairs",1,0,'p'},
+ {"pairs-file",1,0,'P'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'e':
+ args->use_PLs = strtol(optarg,&tmp,10);
+ if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
+ break;
+ case 'u':
+ {
+ int i,nlist;
+ char **list = hts_readlist(optarg, 0, &nlist);
+ if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
+ if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
+ else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
+ else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+ if ( nlist==2 )
+ {
+ if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
+ else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
+ else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+ }
+ else args->gt_use_GT = args->qry_use_GT;
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+ }
+ break;
+ case 2 :
+ args->ntop = strtol(optarg,&tmp,10);
+ if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
+ if ( args->ntop < 0 )
+ {
+ args->sort_by_hwe = 1;
+ args->ntop *= -1;
+ }
+ break;
+ case 3 : args->calc_hwe_prob = 0; break;
+ case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
+ case 5 : args->dry_run = 1; break;
+ case 6 :
+ args->distinctive_sites = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
+ tmp++;
+ free(args->es_max_mem);
+ args->es_max_mem = strdup(tmp);
+ while ( *tmp && *tmp!=',' ) tmp++;
+ if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
+ }
+ args->use_PLs = 0;
+ break;
case 'c':
+ error("The -c option is to be implemented, please open an issue on github\n");
args->min_inter_err = strtod(optarg,&tmp);
if ( *tmp )
{
if ( *tmp ) error("Could not parse: -c %s\n", optarg);
}
break;
- case 'G':
- args->no_PLs = strtol(optarg,&tmp,10);
- if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
- break;
- case 'a': args->all_sites = 1; break;
+ case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
+ case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
case 'H': args->hom_only = 1; break;
case 'g': args->gt_fname = optarg; break;
- case 'p': args->plot = optarg; break;
- case 'S': args->target_sample = optarg; break;
- case 's': args->query_sample = optarg; break;
- case 'r': regions = optarg; break;
- case 'R': regions = optarg; regions_is_file = 1; break;
- case 't': targets = optarg; break;
- case 'T': targets = optarg; targets_is_file = 1; break;
+// case 'p': args->plot = optarg; break;
+ case 's':
+ if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
+ else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
+ else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+ break;
+ case 'S':
+ if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
+ else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
+ else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+ break;
+ case 'p': args->pair_samples = optarg; break;
+ case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
+ case 'r': args->regions = optarg; break;
+ case 'R': args->regions = optarg; args->regions_is_file = 1; break;
+ case 't': args->targets = optarg; break;
+ case 'T': args->targets = optarg; args->targets_is_file = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
}
- char *fname = NULL;
if ( optind==argc )
{
- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-"; // reading from stdin
else usage(); // no files given
}
- else fname = argv[optind];
- if ( argc>optind+1 ) usage(); // too many files given
- if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode
- else args->files->require_index = 1;
- if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
- if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
- error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
- args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
- if ( args->plot ) args->plot = init_prefix(args->plot);
+ else args->qry_fname = argv[optind];
+ if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n"); // too many files given
+ if ( args->pair_samples )
+ {
+ if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
+ if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
+ }
+ if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
+ if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
+ if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+
init_data(args);
- if ( args->cross_check )
- cross_check_gts(args);
- else
- check_gt(args);
+
+ int ret;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ if ( !is_input_okay(args,ret) ) continue;
+
+ // time one record to give the user an estimate with very big files
+ struct timeval t0, t1;
+ if ( !args->ncmp ) gettimeofday(&t0, NULL);
+
+ process_line(args);
+
+ if ( args->ncmp==1 )
+ {
+ gettimeofday(&t1, NULL);
+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
+ fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
+ fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+ if ( args->dry_run ) break;
+ }
+ }
+ if ( !args->dry_run )
+ {
+ report(args);
+ if ( args->distinctive_sites ) report_distinctive_sites(args);
+ }
+
destroy_data(args);
- bcf_sr_destroy(args->files);
- if (args->plot) free(args->plot);
free(args);
return 0;
}
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_os.h>
#include <inttypes.h>
+#include <sys/time.h>
#include "bcftools.h"
-#include "hclust.h"
+#include "extsort.h"
+//#include "hclust.h"
typedef struct
{
- bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
- bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
- int ntmp_arr, npl_arr;
- int32_t *tmp_arr, *pl_arr;
- double *lks, *sites, min_inter_err, max_intra_err;
- int *cnts, *dps, hom_only, cross_check, all_sites;
- char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs, narr, nsmpl;
-}
-args_t;
-
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-char *msprintf(const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
-
-void py_plot(char *script)
-{
- mkdir_p(script);
- int len = strlen(script);
- char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
- int ret = system(cmd);
- if ( ret ) fprintf(bcftools_stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
- free(cmd);
-}
-
-static void plot_check(args_t *args, char *target_sample, char *query_sample)
-{
- char *fname;
- FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
- fprintf(fp,
- "import matplotlib as mpl\n"
- "mpl.use('Agg')\n"
- "import matplotlib.pyplot as plt\n"
- "import matplotlib.gridspec as gridspec\n"
- "import csv\n"
- "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
- "\n"
- "sample_ids = False\n"
- "\n"
- "dat = []\n"
- "with open('%s.tab', 'r') as f:\n"
- " reader = csv.reader(f, 'tab')\n"
- " for row in reader:\n"
- " if row[0][0]=='#': continue\n"
- " if row[0]!='CN': continue\n"
- " tgt = 0\n"
- " if row[4]=='%s': tgt = 1\n"
- " dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
- "\n"
- "dat = sorted(dat)\n"
- "\n"
- "iq = -1; dp = 0\n"
- "for i in range(len(dat)):\n"
- " if iq==-1 and dat[i][3]==1: iq = i\n"
- " dp += dat[i][2]\n"
- "dp /= len(dat)\n"
- "\n"
- "fig,ax1 = plt.subplots(figsize=(8,5))\n"
- "ax2 = ax1.twinx()\n"
- "plots = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
- "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
- "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
- "if iq!=-1:\n"
- " ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
- " ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
- " ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
- "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
- "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
- "min_dp = min([x[2] for x in dat])\n"
- "max_dp = max([x[2] for x in dat])\n"
- "ax2.set_ylim(min_dp-1,max_dp+1)\n"
- "ax1.set_title('Discordance with %s')\n"
- "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
- "ax1.set_xlabel('Sample ID')\n"
- "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
- "if sample_ids:\n"
- " ax1.set_xticks(range(len(dat)))\n"
- " ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
- " plt.subplots_adjust(bottom=0.2)\n"
- "ax1.set_ylabel('Discordance',color='g')\n"
- "ax2.set_ylabel('Number of sites',color='k')\n"
- "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
- "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
- "labels = [l.get_label() for l in plots]\n"
- "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
- "plt.savefig('%s.png')\n"
- "plt.close()\n"
- "\n", args->plot, target_sample, target_sample, query_sample, args->plot
- );
- fclose(fp);
- py_plot(fname);
- free(fname);
-}
-
-#if 0
-static void plot_cross_check(args_t *args)
-{
- char *fname;
- FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
- fprintf(fp,
- "import matplotlib as mpl\n"
- "mpl.use('Agg')\n"
- "import matplotlib.pyplot as plt\n"
- "import matplotlib.gridspec as gridspec\n"
- "import csv\n"
- "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
- "avg = []\n"
- "dp = []\n"
- "sm2id = {}\n"
- "dat = None\n"
- "min = None\n"
- "max = None\n"
- "with open('%s.tab', 'r') as f:\n"
- " reader = csv.reader(f, 'tab')\n"
- " i = 0\n"
- " for row in reader:\n"
- " if row[0]=='SM':\n"
- " sm2id[row[4]] = i\n"
- " avg.append([i,float(row[1])])\n"
- " dp.append([i,float(row[2])])\n"
- " i += 1\n"
- " elif row[0]=='CN':\n"
- " val = 0\n"
- " if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
- " if not dat:\n"
- " dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
- " min = val\n"
- " max = val\n"
- " id_i = sm2id[row[4]]\n"
- " id_j = sm2id[row[5]]\n"
- " dat[id_i][id_j] = val\n"
- " dat[id_j][id_i] = val\n"
- " if min > val: min = val\n"
- " if max < val: max = val\n"
- "\n"
- "if len(sm2id)<=1: exit(1)\n"
- "if min==max: exit(1)\n"
- "\n"
- "fig = plt.figure(figsize=(6,7))\n"
- "gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
- "ax1 = plt.subplot(gs[0])\n"
- "ax2 = plt.subplot(gs[1])\n"
- "\n"
- "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
- "ax3 = ax1.twinx()\n"
- "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
- "for tl in ax3.get_yticklabels():\n"
- " tl.set_color('r')\n"
- " tl.set_fontsize(9)\n"
- "\n"
- "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
- "cb1 = plt.colorbar(im,ax=ax2)\n"
- "cb1.set_label('Pairwise discordance')\n"
- "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
- "\n"
- "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
- "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
- "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
- "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
- "\n"
- "ax1.set_title('Sample Discordance Score')\n"
- "ax2.set_ylabel('Sample ID')\n"
- "ax2.set_xlabel('Sample ID')\n"
- "ax3.set_ylabel('Average Depth',color='r')\n"
- "ax1.set_xlabel('Sample ID')\n"
- "ax1.set_ylabel('Average discordance')\n"
- "\n"
- "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
- "plt.savefig('%s.png')\n"
- "plt.close()\n"
- "\n", args->plot,args->plot
- );
- fclose(fp);
- py_plot(fname);
- free(fname);
-}
-#endif
-
-static void init_data(args_t *args)
-{
- args->sm_hdr = args->files->readers[0].header;
- if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
-
- if ( !args->cross_check )
- {
- args->gt_hdr = args->files->readers[1].header;
- int nsamples = bcf_hdr_nsamples(args->gt_hdr);
- if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
- args->lks = (double*) calloc(nsamples,sizeof(double));
- args->cnts = (int*) calloc(nsamples,sizeof(int));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- args->dps = (int*) calloc(nsamples,sizeof(int));
- }
+ int iqry, igt;
}
+pair_t;
-static void destroy_data(args_t *args)
-{
- free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
-}
-
-static int allele_to_int(bcf1_t *line, char *allele)
+typedef struct
{
- int i;
- for (i=0; i<line->n_allele; i++)
- if ( !strcmp(allele,line->d.allele[i]) ) return i;
- if ( strcmp(line->d.allele[i-1],"X") ) return -1;
- return i-1;
-}
+ bcf_srs_t *files; // first reader is the query VCF - single sample normally or multi-sample for cross-check
+ bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
+ char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
+ int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+ int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+ double *pdiff, *qry_prob, *gt_prob;
+ uint32_t *ndiff,*ncnt,ncmp, npairs;
+ int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
+ uint8_t *qry_dsg, *gt_dsg;
+ pair_t *pairs;
+ double *hwe_prob, dsg2prob[8][3], pl2prob[256];
+ double min_inter_err, max_intra_err;
+ int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
+ FILE *fp;
+ unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
-static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
-{
- int i, j;
- for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
- for (i=0; i<gt_line->n_allele; i++)
- {
- // find which of the sm_alleles (k) corresponds to the gt_allele (i)
- int k = allele_to_int(sm_line, gt_line->d.allele[i]);
- if ( k<0 ) return 0;
- for (j=0; j<=i; j++)
- {
- int l = allele_to_int(sm_line, gt_line->d.allele[j]);
- if ( l<0 ) return 0;
- gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
- }
- }
- //for (i=0; i<n_gt2ipl; i++) fprintf(bcftools_stdout, "%d .. %d\n", i,gt2ipl[i]);
- return 1;
+ // for --distinctive-sites
+ double distinctive_sites;
+ kbitset_t *kbs_diff;
+ size_t diff_sites_size;
+ extsort_t *es;
+ char *es_tmp_prefix, *es_max_mem;
}
+args_t;
static void set_cwd(args_t *args)
{
}
assert(buf);
}
-
static void print_header(args_t *args, FILE *fp)
{
fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
fprintf(fp, "# \t %s\n#\n", args->cwd);
}
-static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+static int cmp_int(const void *_a, const void *_b)
{
- // PLs not present, use GTs instead.
- int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs
- int nsm_gt, i;
- if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
- error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
- nsm_gt /= bcf_hdr_nsamples(hdr);
- int npl = line->n_allele*(line->n_allele+1)/2;
- hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
- for (i=0; i<bcf_hdr_nsamples(hdr); i++)
- {
- int *gt_ptr = args->tmp_arr + i*nsm_gt;
- int j, *pl_ptr = args->pl_arr + i*npl;
- if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
- {
- for (j=0; j<npl; j++) pl_ptr[j] = -1;
- }
- else
- {
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
- int idx = bcf_alleles2gt(a,b);
- pl_ptr[idx] = 0;
- }
- }
- return npl;
+ int a = *((int*)_a);
+ int b = *((int*)_b);
+ if ( a < b ) return -1;
+ if ( a > b ) return 1;
+ return 0;
+}
+static int cmp_pair(const void *_a, const void *_b) // qsort() comparator for pair_t elements: ascending iqry, ties broken by ascending igt
+{
+ pair_t *a = (pair_t*)_a;
+ pair_t *b = (pair_t*)_b;
+ if ( a->iqry < b->iqry ) return -1; // primary key: sample index in the query header
+ if ( a->iqry > b->iqry ) return 1;
+ if ( a->igt < b->igt ) return -1; // secondary key: sample index in the genotypes header
+ if ( a->igt > b->igt ) return 1;
+ return 0; // both indexes are equal
}
-static int cmp_doubleptr(const void *_a, const void *_b)
+typedef struct // one site record passed through the external sorter for --distinctive-sites
+{
+ uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes
+ unsigned long kbs_dat[1]; // first word of the per-pair bitset; diff_sites_init() sizes each record so that all kbs_diff->n words fit here
+}
+diff_sites_t;
+#if DBG
+static void diff_sites_debug_print(args_t *args, diff_sites_t *ds) // debug helper, compiled only when DBG is set: dumps one site record to bcftools_stderr
+{
+ int i;
+ memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); // overwrites args->kbs_diff with the record's bitset words
+ fprintf(bcftools_stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff); // sequence name, pos+1, then ndiff
+ for (i=0; i<args->npairs; i++) fprintf(bcftools_stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0); // one 0/1 digit per pair index
+ fprintf(bcftools_stderr,"\n");
+}
+#endif
+static int diff_sites_cmp(const void *aptr, const void *bptr) // comparator registered with extsort (FUNC_CMP); each argument points to a diff_sites_t pointer
+{
+ diff_sites_t *a = *((diff_sites_t**)aptr);
+ diff_sites_t *b = *((diff_sites_t**)bptr);
+ if ( a->ndiff < b->ndiff ) return 1; // descending order
+ if ( a->ndiff > b->ndiff ) return -1;
+ if ( a->rand < b->rand ) return -1; // equal ndiff: ascending order of the random key
+ if ( a->rand > b->rand ) return 1;
+ return 0;
+}
+static void diff_sites_init(args_t *args) // set up the per-pair bitset and the external sorter used by --distinctive-sites
+{
+ int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites; // values <=1 are taken as a fraction of npairs, larger values as an absolute count
+ if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
+ if ( nsites > args->npairs )
+ {
+ fprintf(bcftools_stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
+ nsites = args->npairs;
+ args->distinctive_sites = args->npairs + 1; // NOTE(review): npairs+1 looks like a "print everything" marker -- confirm where distinctive_sites is consumed
+ }
+ else
+ args->distinctive_sites = nsites; // store the resolved absolute count
+ args->kbs_diff = kbs_init(args->npairs); // bitset indexed by pair
+ size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS; // expected number of bitset words
+ assert( n==args->kbs_diff->n ); // the record size below relies on this word count
+ args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long); // the struct already holds one word (kbs_dat[1])
+ args->es = extsort_alloc();
+ extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size);
+ extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
+ extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
+ extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp);
+ extsort_init(args->es);
+}
+static void diff_sites_destroy(args_t *args)
{
- double *a = *((double**)_a);
- double *b = *((double**)_b);
- if ( *a < *b ) return -1;
- else if ( *a == *b ) return 0;
+ kbs_destroy(args->kbs_diff);
+ extsort_destroy(args->es);
+}
+static inline void diff_sites_reset(args_t *args) // calls kbs_clear() on the per-pair bitset args->kbs_diff
+{
+ kbs_clear(args->kbs_diff);
+}
+static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) // build a site record from the arguments plus the current args->kbs_diff and push it to the sorter
+{
+ diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); // NOTE(review): malloc result is not checked before use
+ memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
+ dat->ndiff = ndiff;
+ dat->rid = rid;
+ dat->pos = pos;
+ dat->rand = hts_lrand48(); // random tie-break key compared in diff_sites_cmp()
+ memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long)); // snapshot of all bitset words
+ extsort_push(args->es,dat); // dat is not freed here -- presumably extsort takes ownership; confirm in extsort.c
+}
+static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos) // fetch the next record from the sorter; returns 0 when extsort_shift() yields NULL, 1 otherwise
+{
+ diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
+ if ( !dat ) return 0; // outputs and args->kbs_diff are left untouched
+ *ndiff = dat->ndiff;
+ *rid = dat->rid;
+ *pos = dat->pos;
+ memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long)); // load the record's bitset words into args->kbs_diff
 return 1;
}
-static void check_gt(args_t *args)
+static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
{
- int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
- int fake_pls = args->no_PLs;
+ int i;
+ if ( !strcmp(list,"-") )
+ {
+ *nsmpl = bcf_hdr_nsamples(hdr);
+ *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+ for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
+ return;
+ }
- // Initialize things: check which tags are defined in the header, sample names etc.
- if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+ char **tmp = hts_readlist(list, list_is_file, nsmpl);
+ if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
+ *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+ for (i=0; i<*nsmpl; i++)
{
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
- error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
- fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
+ if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
+ (*smpl)[i] = idx;
+ free(tmp[i]);
}
+ free(tmp);
+ qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
+ // check for duplicates
+ for (i=1; i<*nsmpl; i++)
+ if ( (*smpl)[i-1]==(*smpl)[i] )
+ error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
+}
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : bcftools_stdout;
- print_header(args, fp);
+static void init_data(args_t *args)
+{
+ hts_srand48(0);
- int tgt_isample = -1, query_isample = 0;
- if ( args->target_sample )
+ args->files = bcf_sr_init();
+ if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+ if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+
+ if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
+ if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
+ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
+
+ args->qry_hdr = bcf_sr_get_header(args->files,0);
+ if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
+ if ( args->gt_fname )
{
- tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
- if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+ args->gt_hdr = bcf_sr_get_header(args->files,1);
+ if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
}
- if ( args->all_sites )
+
+ // Determine whether GT or PL will be used
+ if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
{
- if ( tgt_isample==-1 )
- {
- fprintf(bcftools_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
- tgt_isample = 0;
- }
+ if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
+ args->qry_use_GT = 0;
+ else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
+ args->qry_use_GT = 1;
+ else
+ error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
}
- if ( args->query_sample )
+ else if ( args->qry_use_GT==1 )
{
- query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
- if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+ if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
+ error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
}
- if ( args->all_sites )
- fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
- args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+ else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
+ error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
- // Main loop
- float prev_lk = 0;
- while ( (ret=bcf_sr_next_line(args->files)) )
+ if ( args->gt_hdr )
{
- if ( ret!=2 ) continue;
- bcf1_t *sm_line = args->files->readers[0].buffer[0]; // the query file
- bcf1_t *gt_line = args->files->readers[1].buffer[0]; // the -g target file
- bcf_unpack(sm_line, BCF_UN_FMT);
- bcf_unpack(gt_line, BCF_UN_FMT);
-
- // Init mapping from target genotype index to the sample's PL fields
- int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
- if ( n_gt2ipl > m_gt2ipl )
+ if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
+ {
+ if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
+ args->gt_use_GT = 1;
+ else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
+ args->gt_use_GT = 0;
+ else
+ error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
+ }
+ else if ( args->gt_use_GT==1 )
{
- m_gt2ipl = n_gt2ipl;
- gt2ipl = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+ if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
+ error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
}
- if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
-
- // Target genotypes
- int ngt, npl;
- if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 )
- error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
- ngt /= bcf_hdr_nsamples(args->gt_hdr);
- if ( ngt!=2 ) continue; // checking only diploid genotypes
+ else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
+ error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
+ }
+ else
+ args->gt_use_GT = args->qry_use_GT;
- // Sample PLs
- if ( !fake_pls )
+ // Prepare samples
+ int i,j;
+ args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
+ if ( args->qry_samples )
+ {
+ init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
+ }
+ if ( args->gt_samples )
+ {
+ init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
+ args->gt_hdr ? args->gt_hdr : args->qry_hdr,
+ args->gt_fname ? args->gt_fname : args->qry_fname);
+ }
+ else if ( args->pair_samples )
+ {
+ int npairs;
+ char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
+ if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
+ if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
+ args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
+ args->pairs = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
+ if ( !args->pair_samples_is_file )
{
- if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+ for (i=0; i<args->npairs; i++)
{
- if ( sm_line->n_allele==1 )
- {
- // PL values may not be present when ALT=. (mpileup/bcftools output), in that case
- // switch automatically to GT at these sites
- npl = fake_PLs(args, args->sm_hdr, sm_line);
- }
- else
- error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1);
+ args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
+ args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
+ if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
+ if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
+ free(tmp[2*i]);
+ free(tmp[2*i+1]);
}
- else
- npl /= bcf_hdr_nsamples(args->sm_hdr);
}
else
- npl = fake_PLs(args, args->sm_hdr, sm_line);
+ {
+ for (i=0; i<args->npairs; i++)
+ {
+ char *ptr = tmp[i];
+ while ( *ptr && !isspace(*ptr) ) ptr++;
+ if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
+ *ptr = 0;
+ args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
+ if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
+ ptr++;
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
+ if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
+ free(tmp[i]);
+ }
+ }
+ free(tmp);
+ qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
+ }
+ else if ( args->gt_hdr )
+ args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
+ if ( !args->ngt_smpl )
+ {
+ args->ngt_smpl = args->nqry_smpl;
+ args->gt_smpl = args->qry_smpl;
+ args->cross_check = 1;
+ }
+
+ // The data arrays
+ if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
+ if ( !args->pair_samples )
+ {
+ args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
+ args->gt_dsg = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
+ }
+ if ( args->use_PLs )
+ {
+ args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff)); // log probability of pair samples being the same
+ args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
+ args->gt_prob = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
+
+ // dsg2prob: the first index is a bitmask of 8 possible dsg combinations (only 1<<0,1<<1,1<<2 are set, accessing
+ // anything else indicates an error, this is just to reuse gt_to_dsg()); the second index selects the corresponding
+ // -log probabilities of 0/0, 0/1, and 1/1 genotypes
+ for (i=0; i<8; i++)
+ for (j=0; j<3; j++)
+ args->dsg2prob[i][j] = HUGE_VAL;
+ args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+ args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
- // Calculate likelihoods for all samples, assuming diploid genotypes
+ // lookup table to avoid exponentiation
+ for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
+ }
+ else
+ args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff)); // number of differing genotypes for each pair of samples
+ args->ncnt = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt)); // number of comparisons performed (non-missing data)
+ if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
+ if ( args->calc_hwe_prob )
+ {
+ // prob of the observed sequence of matches given site AFs and HWE
+ args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
+ if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+ }
+
+ if ( args->distinctive_sites ) diff_sites_init(args);
+
+ args->fp = bcftools_stdout;
+ print_header(args, args->fp);
+}
+
+static void destroy_data(args_t *args) // release everything allocated in init_data() and during processing; args itself is not freed here
+{
+ if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg); // init_data() makes gt_dsg alias qry_dsg in cross-check mode, avoid a double free
+ free(args->qry_dsg);
+ if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob); // same aliasing as for gt_dsg
+ free(args->qry_prob);
+ free(args->es_max_mem);
+ fclose(args->fp); // init_data() sets fp to bcftools_stdout, so this closes that stream
+ if ( args->distinctive_sites ) diff_sites_destroy(args);
+ free(args->hwe_prob);
+ free(args->cwd);
+ free(args->qry_arr);
+ if ( args->gt_hdr ) free(args->gt_arr); // without a genotypes file process_line() points gt_arr at qry_arr, already freed above
+ free(args->pdiff);
+ free(args->ndiff);
+ free(args->ncnt);
+ free(args->qry_smpl);
+ if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl); // init_data() may set gt_smpl to qry_smpl
+ free(args->pairs);
+ bcf_sr_destroy(args->files);
+}
- // For faster access to genotype likelihoods (PLs) of the query sample
- int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
- double sum_pl = 0; // for converting PLs to probs
- for (max_ipl=0; max_ipl<npl; max_ipl++)
+static inline uint8_t gt_to_dsg(int32_t *ptr) // map a two-value GT (ptr[0],ptr[1]) to a one-hot mask: 1, 2 or 4 for 0, 1 or 2 non-zero allele indexes; 0 if unusable
+{
+ if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0; // missing allele, or ptr[1] is the vector-end marker
+ uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0); // count of alleles whose index is not 0
+ return 1<<dsg;
+}
+static inline uint8_t pl_to_dsg(int32_t *ptr) // bitmask of which of the three PL values ptr[0..2] equal the minimum (bit 0, 1, 2 respectively); 0 if unusable
+{
+ if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0; // any value missing
+ if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0; // fewer than three values present
+ int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]); // smallest of the three
+ uint8_t dsg = 0;
+ if ( ptr[0]==min_pl ) dsg |= 1; // several bits can be set when the minimum is tied
+ if ( ptr[1]==min_pl ) dsg |= 2;
+ if ( ptr[2]==min_pl ) dsg |= 4;
+ return dsg;
+}
+static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob) // fill prob[0..2] from the args->dsg2prob row selected by gt_to_dsg(ptr); returns that mask
+{
+ uint8_t dsg = gt_to_dsg(ptr);
+ if ( dsg ) // prob is left untouched when the genotype is unusable (dsg==0)
+ {
+ prob[0] = args->dsg2prob[dsg][0];
+ prob[1] = args->dsg2prob[dsg][1];
+ prob[2] = args->dsg2prob[dsg][2];
+ }
+ return dsg;
+}
+static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob) // convert three PL values into normalized -log probabilities in prob[0..2]; returns pl_to_dsg(ptr)
+{
+ uint8_t dsg = pl_to_dsg(ptr);
+ if ( dsg ) // prob is left untouched when the PLs are unusable (dsg==0)
+ {
+ prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255]; // table lookup; values outside [0,255) use the last entry
+ prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
+ prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
+ double sum = prob[0] + prob[1] + prob[2];
+ prob[0] /= sum; // normalize so that the three values sum to 1
+ prob[1] /= sum;
+ prob[2] /= sum;
+ prob[0] = -log(prob[0]); // store as negative natural log
+ prob[1] = -log(prob[1]);
+ prob[2] = -log(prob[2]);
+ }
+ return dsg;
+}
+static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)
+{
+ static int warn_dip_GT = 1;
+ static int warn_dip_PL = 1;
+ int i;
+ for (i=0; i<2; i++)
+ {
+ if ( *use_GT )
{
- if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
- if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
- sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+ int ret = bcf_get_genotypes(hdr,rec,arr,narr);
+ if ( ret < 0 )
+ {
+ if ( !i ) { *use_GT = 0; continue; }
+ args->nskip_no_data++;
+ return -1;
+ }
+ if ( ret != 2*bcf_hdr_nsamples(hdr) )
+ {
+ if ( warn_dip_GT )
+ {
+ fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+ warn_dip_GT = 0;
+ }
+ args->nskip_dip_GT++;
+ return -1;
+ }
+ *narr1 = 2;
+ return 0;
}
- if ( sum_pl==0 ) continue; // no PLs present
- if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
- // The main stats: concordance of the query sample with the target -g samples
- for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+ int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
+ if ( ret < 0 )
{
- int *gt_ptr = gt_arr + i*ngt;
- if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
- if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- if ( args->hom_only && a!=b ) continue; // heterozygous genotype
- int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
- int igt_qry = gt2ipl[igt_tgt]; // corresponding genotype in query file
- if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue; // genotype not present in query sample: haploid or missing
- args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
- args->sites[i]++;
+ if ( !i ) { *use_GT = 1; continue; }
+ args->nskip_no_data++;
+ return -1;
}
- if ( args->all_sites )
+ if ( ret != 3*bcf_hdr_nsamples(hdr) )
{
- // Print LKs at all sites for debugging
- int *gt_ptr = gt_arr + tgt_isample*ngt;
- if ( gt_ptr[1]==bcf_int32_vector_end ) continue; // skip haploid genotypes
- int a = bcf_gt_allele(gt_ptr[0]);
- int b = bcf_gt_allele(gt_ptr[1]);
- if ( args->hom_only && a!=b ) continue; // heterozygous genotype
- fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
- for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
- fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
- fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
- prev_lk = args->lks[query_isample];
-
- int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
- for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
- for (igt=0; igt<npl; igt++)
- if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
- else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
- else fprintf(fp, "\t%d", pl_ptr[igt]);
- fprintf(fp, "\n");
+ if ( warn_dip_PL )
+ {
+ fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+ warn_dip_PL = 0;
+ }
+ args->nskip_dip_PL++;
+ return -1;
}
+ *narr1 = 3;
+ return 0;
}
- free(gt2ipl);
- free(gt_arr);
- free(args->pl_arr);
- free(args->tmp_arr);
+ return -1; // should never reach
+}
+static void process_line(args_t *args)
+{
+ int i,j,k, nqry1, ngt1, ret;
+
+ bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0); // the query file
+ int qry_use_GT = args->qry_use_GT;
+ int gt_use_GT = args->gt_use_GT;
+
+ ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
+ if ( ret<0 ) return;
- // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
- // plot as discordance per site, the latter must be scaled to the same range
- int nsamples = bcf_hdr_nsamples(args->gt_hdr);
- double extreme_lk = 0, extreme_lk_per_site = 0;
- for (i=0; i<nsamples; i++)
+ if ( args->gt_hdr )
{
- if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
- if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+ gt_rec = bcf_sr_get_line(args->files,1);
+ ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
+ if ( ret<0 ) return;
+ }
+ else
+ {
+ ngt1 = nqry1;
+ args->gt_arr = args->qry_arr;
}
- // Sorted output
- double **p = (double**) malloc(sizeof(double*)*nsamples);
- for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+ args->ncmp++;
- fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
- for (i=0; i<nsamples; i++)
+ double af,hwe_dsg[8];
+ if ( args->calc_hwe_prob )
{
- int idx = p[i] - args->lks;
- double per_site = 0;
- if ( args->sites[idx] )
+ int ac[2];
+ if ( args->gt_hdr )
{
- if ( args->sites[idx] && extreme_lk_per_site )
+ if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+ }
+ else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+
+ // hwe_dsg is indexed by the bitmask of the eight dsg combinations. This accounts for PL uncertainty:
+ // in the extreme case we can have uninformative PL=0,0,0. Each value is the minimum of hwe[k] over
+ // the bits k set in the index, e.g.
+ // hwe_dsg[1,2,4] .. dsg=0,1,2
+ // hwe_dsg[3] .. dsg=0 or 1
+ // hwe_dsg[6] .. dsg=1 or 2
+
+ double hwe[3];
+ const double min_af = 1e-5; // cap the AF in case we get unrealistic values
+ af = (double)ac[1]/(ac[0]+ac[1]);
+ hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
+ hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
+ hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+ hwe_dsg[0] = 0;
+ for (i=1; i<8; i++)
+ {
+ hwe_dsg[i] = HUGE_VAL;
+ for (k=0; k<3; k++)
{
- per_site = args->lks[idx]/args->sites[idx];
- per_site *= extreme_lk / extreme_lk_per_site;
+ if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];
}
- else
- per_site = 0;
}
- fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
}
- if ( args->plot )
+ // The sample pairs were given explicitly via -p/-P options
+ if ( args->pairs )
{
- if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
- plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
- }
-}
+ if ( !args->use_PLs )
+ {
+ int ndiff = 0;
+ if ( args->kbs_diff ) diff_sites_reset(args);
-// static inline int is_hom_most_likely(int nals, int *pls)
-// {
-// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
-// for (ia=1; ia<nals; ia++)
-// {
-// for (ib=0; ib<ia; ib++)
-// {
-// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
-// idx++;
-// }
-// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
-// idx++;
-// }
-// return min_is_hom;
-// }
-
-int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
- int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+ for (i=0; i<args->npairs; i++)
+ {
+ int32_t *ptr;
+ uint8_t qry_dsg, gt_dsg;
- if ( ngt<=0 ) return 1; // GT not present
- if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
- ngt /= args->nsmpl;
-
- int i,j, idx = 0;
- for (i=1; i<args->nsmpl; i++)
- {
- int32_t *a = args->tmp_arr + i*ngt;
- if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
- int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+ ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+ gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( !gt_dsg ) continue; // missing value
+ if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
+
+ ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+ qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( !qry_dsg ) continue; // missing value
+
+ int match = qry_dsg & gt_dsg;
+ if ( !match )
+ {
+ args->ndiff[i]++;
+ if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
+ }
+ else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+ args->ncnt[i]++;
+ }
- for (j=0; j<i; j++)
+ if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
+ }
+ else // use_PLs set
{
- int32_t *b = args->tmp_arr + j*ngt;
- if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
- int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+ for (i=0; i<args->npairs; i++)
+ {
+ int32_t *ptr;
+ double qry_prob[3], gt_prob[3];
+ uint8_t qry_dsg, gt_dsg;
+
+ ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+ gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
+ if ( !gt_dsg ) continue; // missing value
+ if ( args->hom_only && !(gt_dsg&5) ) continue; // not a hom
+
+ ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+ qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
+ if ( !qry_dsg ) continue; // missing value
- ntot[idx]++;
- if ( agt!=bgt ) ndif[idx]++;
- idx++;
+ double min = qry_prob[0] + gt_prob[0];
+ qry_prob[1] += gt_prob[1];
+ if ( min > qry_prob[1] ) min = qry_prob[1];
+ qry_prob[2] += gt_prob[2];
+ if ( min > qry_prob[2] ) min = qry_prob[2];
+ args->pdiff[i] += min;
+
+ if ( args->calc_hwe_prob )
+ {
+ int match = qry_dsg & gt_dsg;
+ args->hwe_prob[i] += hwe_dsg[match];
+ }
+ args->ncnt[i]++;
+ }
}
+ return;
}
- return 0;
-}
-int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
- int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
- if ( npl<=0 ) return 1; // PL not present
- npl /= args->nsmpl;
-
- int i,j,k, idx = 0;
- for (i=1; i<args->nsmpl; i++)
+ int idx=0;
+ if ( !args->use_PLs )
{
- int32_t *a = args->tmp_arr + i*npl;
- int imin = -1;
- for (k=0; k<npl; k++)
+ for (i=0; i<args->nqry_smpl; i++)
{
- if ( a[k]==bcf_int32_vector_end ) break;
- if ( a[k]==bcf_int32_missing ) continue;
- if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int32_t *ptr = args->qry_arr + nqry1*iqry;
+ args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
}
- if ( imin<0 ) { idx+=i; continue; }
-
- for (j=0; j<i; j++)
+ if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
{
- int32_t *b = args->tmp_arr + j*npl;
- int jmin = -1;
- for (k=0; k<npl; k++)
+ for (i=0; i<args->ngt_smpl; i++)
{
- if ( b[k]==bcf_int32_vector_end ) break;
- if ( b[k]==bcf_int32_missing ) continue;
- if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+ int32_t *ptr = args->gt_arr + ngt1*igt;
+ args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+ if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
+ }
+ }
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
+ if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
+ for (j=0; j<ngt; j++)
+ {
+ if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
+ int match = args->qry_dsg[i] & args->gt_dsg[j];
+ if ( !match ) args->ndiff[idx]++;
+ else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+ args->ncnt[idx]++;
+ idx++;
}
- if ( jmin<0 ) { idx++; continue; }
-
- ntot[idx]++;
- if ( imin!=jmin ) ndif[idx]++;
- idx++;
}
}
- return 0;
-}
+ else // use_PLs set
+ {
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int32_t *ptr = args->qry_arr + nqry1*iqry;
+ args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);
+ }
+ if ( !args->cross_check ) // in this case gt_dsg points to qry_dsg
+ {
+ for (i=0; i<args->ngt_smpl; i++)
+ {
+ int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+ int32_t *ptr = args->gt_arr + ngt1*igt;
+ args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
+ if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0; // not a hom, set to a missing value
+ }
+ }
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int ngt = args->cross_check ? i : args->ngt_smpl; // two files or a sub-diagonal cross-check mode?
+ if ( !args->qry_dsg[i] ) { idx += ngt; continue; } // missing value
+ for (j=0; j<ngt; j++)
+ {
+ if ( !args->gt_dsg[j] ) { idx++; continue; } // missing value
-static void cross_check_gts(args_t *args)
-{
- // Initialize things: check which tags are defined in the header, sample names etc.
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
- {
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
- error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs ) {
- fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- args->no_PLs = 99;
+ double min = args->qry_prob[i*3] + args->gt_prob[j*3];
+ if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
+ if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
+ args->pdiff[idx] += min;
+
+ if ( args->calc_hwe_prob )
+ {
+ int match = args->qry_dsg[i] & args->gt_dsg[j];
+ args->hwe_prob[idx] += hwe_dsg[match];
+ }
+ args->ncnt[idx]++;
+ idx++;
+ }
}
}
+}
- args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
- args->narr = (args->nsmpl-1)*args->nsmpl/2;
- uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
- uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
+typedef struct // sortable record used by report() to rank the candidate matches of one query sample
+{
+ int ism, idx; // ism: position of the compared sample in the sample list; idx: offset into the per-pair arrays (ndiff/pdiff/ncnt/hwe_prob)
+ double val; // sort key filled in by report(); smaller values sort first
+}
+idbl_t;
+static int cmp_idbl(const void *_a, const void *_b) // qsort() comparator: orders idbl_t records by ascending val
+{
+ idbl_t *a = (idbl_t*)_a;
+ idbl_t *b = (idbl_t*)_b;
+ if ( a->val < b->val ) return -1; // a sorts before b
+ if ( a->val > b->val ) return 1; // a sorts after b
+ return 0; // neither is smaller: treated as equal
+}
+static void report_distinctive_sites(args_t *args)
+{
+ extsort_sort(args->es);
+
+ fprintf(args->fp,"# DS, distinctive sites:\n");
+ fprintf(args->fp,"# - chromosome\n");
+ fprintf(args->fp,"# - position\n");
+ fprintf(args->fp,"# - cumulative number of pairs distinguished by this block\n");
+ fprintf(args->fp,"# - block id\n");
+ fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
- while ( bcf_sr_next_line(args->files) )
+ kbitset_t *kbs_blk = kbs_init(args->npairs);
+ kbitset_iter_t itr;
+ int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
+ int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;
+ while ( diff_sites_shift(args,&ndiff,&rid,&pos) )
{
- bcf1_t *line = bcf_sr_get_line(args->files,0);
-
- // use PLs unless no_PLs is set and GT exists
- if ( args->no_PLs )
+ int ndiff_new = 0, ndiff_dbg = 0;
+ kbs_start(&itr);
+ while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
{
- if ( process_GT(args,line,ntot,ndif)==0 ) continue;
+ ndiff_dbg++;
+ if ( kbs_exists(kbs_blk,i) ) continue; // already set
+ kbs_insert(kbs_blk,i);
+ ndiff_new++;
}
- process_PL(args,line,ntot,ndif);
+ if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
+ if ( !ndiff_new ) continue; // no new pair distinguished by this site
+ ndiff_tot += ndiff_new;
+ fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+ if ( ndiff_tot < ndiff_min ) continue; // fewer than the requested number of pairs can be distinguished at this point
+ iblock++;
+ ndiff_tot = 0;
+ kbs_clear(kbs_blk);
}
-
- FILE *fp = bcftools_stdout;
- print_header(args, fp);
+ kbs_destroy(kbs_blk);
+}
+static void report(args_t *args)
+{
+ fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
+ fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+ fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+ fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+ fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+ fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+ fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+ fprintf(args->fp,"# DC, discordance:\n");
+ fprintf(args->fp,"# - query sample\n");
+ fprintf(args->fp,"# - genotyped sample\n");
+ fprintf(args->fp,"# - discordance (number of mismatches; smaller is better)\n");
+ fprintf(args->fp,"# - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+ fprintf(args->fp,"# - number of sites compared (bigger is better)\n");
+ fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
- float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
+ int trim = args->ntop;
+ if ( !args->pairs )
+ {
+ if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;
+ if ( args->ngt_smpl && args->ngt_smpl <= args->ntop ) trim = 0;
+ }
- // Output pairwise distances
- fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
- int i,j, idx = 0;
- for (i=0; i<args->nsmpl; i++)
+ if ( args->pairs )
{
- for (j=0; j<i; j++)
+ int i;
+ for (i=0; i<args->npairs; i++)
{
- float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
- fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
- PDIST(tmp,i,j) = err;
- idx++;
+ int iqry = args->pairs[i].iqry;
+ int igt = args->pairs[i].igt;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[i],
+ args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+ args->ncnt[i]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[i],
+ args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+ args->ncnt[i]);
+ }
}
}
-
- // Cluster samples
- int nlist;
- float clust_max_err = args->max_intra_err;
- hclust_t *clust = hclust_init(args->nsmpl,tmp);
- cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
- fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
- for (i=0; i<nlist; i++)
- {
- fprintf(fp,"CLUSTER\t%f", list[i].dist);
- for (j=0; j<list[i].nmemb; j++)
- fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
- fprintf(fp,"\n");
- }
- hclust_destroy_list(list,nlist);
- // Debugging output: the cluster graph and data used for deciding
- char **dbg = hclust_explain(clust,&nlist);
- for (i=0; i<nlist; i++)
- fprintf(fp,"DBG\t%s\n", dbg[i]);
- fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
- fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
- fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
- hclust_destroy(clust);
- free(tmp);
-
-
- // Deprecated output for temporary backward compatibility
- fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
- fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
- idx = 0;
- for (i=0; i<args->nsmpl; i++)
+ else if ( !trim )
{
- for (j=0; j<i; j++)
+ int i,j,idx=0;
+ for (i=0; i<args->nqry_smpl; i++)
{
- fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
- idx++;
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ int ngt = args->cross_check ? i : args->ngt_smpl;
+ for (j=0; j<ngt; j++)
+ {
+ int igt = args->gt_smpl ? args->gt_smpl[j] : j;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ idx++;
+ }
}
}
-
- free(ndif);
- free(ntot);
- free(args->tmp_arr);
+ else if ( !args->cross_check )
+ {
+ idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
+ int i,j;
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ int idx = i*args->ngt_smpl;
+ for (j=0; j<args->ngt_smpl; j++)
+ {
+ if ( args->sort_by_hwe )
+ arr[j].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[j].ism = j;
+ arr[j].idx = idx;
+ idx++;
+ }
+ qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ for (j=0; j<args->ntop; j++)
+ {
+ int idx = arr[j].idx;
+ int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ }
+ }
+ free(arr);
+ }
+ else
+ {
+ int narr = args->nqry_smpl-1;
+ idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
+ int i,j,k,idx;
+ for (i=0; i<args->nqry_smpl; i++)
+ {
+ k = 0, idx = i*(i-1)/2;
+ for (j=0; j<i; j++)
+ {
+ if ( args->sort_by_hwe )
+ arr[k].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[k].ism = j;
+ arr[k].idx = idx;
+ idx++;
+ k++;
+ }
+ for (; j<narr; j++)
+ {
+ idx = j*(j+1)/2 + i;
+ if ( args->sort_by_hwe )
+ arr[k].val = -args->hwe_prob[idx];
+ else if ( args->ndiff )
+ arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+ else
+ arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+ arr[k].ism = j + 1;
+ arr[k].idx = idx;
+ k++;
+ }
+ qsort(arr, narr, sizeof(*arr), cmp_idbl);
+ int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+ for (j=0; j<args->ntop; j++)
+ {
+ if ( i <= arr[j].ism ) continue;
+ int idx = arr[j].idx;
+ int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
+ if ( args->ndiff )
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->qry_hdr->samples[igt],
+ args->ndiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ else
+ {
+ fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+ args->qry_hdr->samples[iqry],
+ args->qry_hdr->samples[igt],
+ args->pdiff[idx],
+ args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+ args->ncnt[idx]);
+ }
+ }
+ }
+ free(arr);
+ }
}
-static char *init_prefix(char *prefix)
+static int is_input_okay(args_t *args, int nmatch)
{
- int len = strlen(prefix);
- if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
- return msprintf("%sgtcheck", prefix);
- return strdup(prefix);
+ int i;
+ const char *msg;
+ bcf_hdr_t *hdr;
+ bcf1_t *rec;
+ if ( args->gt_hdr && nmatch!=2 )
+ {
+ if ( args->nskip_no_match++ ) return 0;
+ for (i=0; i<2; i++)
+ {
+ rec = bcf_sr_get_line(args->files,i);
+ if ( rec ) break;
+ }
+ hdr = bcf_sr_get_header(args->files,i);
+ fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
+ bcf_seqname(hdr,rec),rec->pos+1);
+ return 0;
+ }
+ for (i=0; i<2; i++)
+ {
+ hdr = bcf_sr_get_header(args->files,i);
+ rec = bcf_sr_get_line(args->files,i);
+ if ( rec->n_allele>2 )
+ {
+ if ( args->nskip_not_ba++ ) return 0;
+ msg = "not a biallelic site, run `bcftools norm -m -` first";
+ goto not_okay;
+ }
+ if ( bcf_get_variant_types(rec)==VCF_REF )
+ {
+ if ( args->nskip_mono++ ) return 0;
+ msg = "monoallelic site";
+ goto not_okay;
+ }
+ if ( !args->gt_hdr ) break;
+ }
+ return 1;
+
+not_okay:
+ fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n",
+ bcf_seqname(hdr,rec),rec->pos+1,msg);
+ return 0;
}
static void usage(void)
fprintf(bcftools_stderr, "Usage: bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -a, --all-sites output comparison for all sites\n");
- fprintf(bcftools_stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
- fprintf(bcftools_stderr, " -g, --genotypes <file> genotypes to compare against\n");
- fprintf(bcftools_stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
- fprintf(bcftools_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
- fprintf(bcftools_stderr, " -p, --plot <prefix> plot\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --query-sample <string> query sample (by default the first sample is checked)\n");
- fprintf(bcftools_stderr, " -S, --target-sample <string> target sample in the -g file (used only for plotting)\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ //fprintf(bcftools_stderr, " -a, --all-sites Output comparison for all sites\n");
+ //fprintf(bcftools_stderr, " -c, --cluster MIN,MAX Min inter- and max intra-sample error [0.23,-0.3]\n");
+ fprintf(bcftools_stderr, " --distinctive-sites Find sites that can distinguish between at least NUM sample pairs.\n");
+ fprintf(bcftools_stderr, " NUM[,MEM[,TMP]] If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
+ fprintf(bcftools_stderr, " The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
+#ifdef _WIN32
+ fprintf(bcftools_stderr, " and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
+#else
+ fprintf(bcftools_stderr, " and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
+#endif
+ fprintf(bcftools_stderr, " --dry-run Stop after first record to estimate required time\n");
+ fprintf(bcftools_stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+ fprintf(bcftools_stderr, " -g, --genotypes FILE Genotypes to compare against\n");
+ fprintf(bcftools_stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+ fprintf(bcftools_stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
+ fprintf(bcftools_stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n");
+ fprintf(bcftools_stderr, " --no-HWE-prob Disable calculation of HWE probability\n");
+ fprintf(bcftools_stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
+ fprintf(bcftools_stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
+ fprintf(bcftools_stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+ fprintf(bcftools_stderr, "Examples:\n");
+ fprintf(bcftools_stderr, " # Check discordance of all samples from B against all sample in A\n");
+ fprintf(bcftools_stderr, " bcftools gtcheck -g A.bcf B.bcf\n");
+ fprintf(bcftools_stderr, "\n");
+ fprintf(bcftools_stderr, " # Limit comparisons to the fiven list of samples\n");
+ fprintf(bcftools_stderr, " bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ fprintf(bcftools_stderr, " # Compare only two pairs a1,b1 and a1,b2\n");
+ fprintf(bcftools_stderr, " bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
+ fprintf(bcftools_stderr, "\n");
+ bcftools_exit(1);
}
int main_vcfgtcheck(int argc, char *argv[])
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
- args->files = bcf_sr_init();
args->argc = argc; args->argv = argv; set_cwd(args);
- char *regions = NULL, *targets = NULL;
- int regions_is_file = 0, targets_is_file = 0;
+ args->qry_use_GT = -1;
+ args->gt_use_GT = -1;
+ args->calc_hwe_prob = 1;
+ args->use_PLs = 40;
+
+ // external sort for --distinctive-sites
+#ifdef _WIN32
+ args->es_tmp_prefix = NULL;
+#else
+ args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
+#endif
+ args->es_max_mem = strdup("500M");
// In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
// - min_inter: pairs with smaller err value will be considered identical
static struct option loptions[] =
{
+ {"error-probability",1,0,'e'},
+ {"use",1,0,'u'},
{"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"help",0,0,'h'},
{"genotypes",1,0,'g'},
{"plot",1,0,'p'},
- {"target-sample",1,0,'S'},
- {"query-sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"n-matches",1,0,2},
+ {"no-HWE-prob",0,0,3},
+ {"target-sample",1,0,4},
+ {"dry-run",0,0,5},
+ {"distinctive-sites",1,0,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"pairs",1,0,'p'},
+ {"pairs-file",1,0,'P'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'e':
+ args->use_PLs = strtol(optarg,&tmp,10);
+ if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
+ break;
+ case 'u':
+ {
+ int i,nlist;
+ char **list = hts_readlist(optarg, 0, &nlist);
+ if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
+ if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
+ else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
+ else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+ if ( nlist==2 )
+ {
+ if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
+ else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
+ else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+ }
+ else args->gt_use_GT = args->qry_use_GT;
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+ }
+ break;
+ case 2 :
+ args->ntop = strtol(optarg,&tmp,10);
+ if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
+ if ( args->ntop < 0 )
+ {
+ args->sort_by_hwe = 1;
+ args->ntop *= -1;
+ }
+ break;
+ case 3 : args->calc_hwe_prob = 0; break;
+ case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
+ case 5 : args->dry_run = 1; break;
+ case 6 :
+ args->distinctive_sites = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
+ tmp++;
+ free(args->es_max_mem);
+ args->es_max_mem = strdup(tmp);
+ while ( *tmp && *tmp!=',' ) tmp++;
+ if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
+ }
+ args->use_PLs = 0;
+ break;
case 'c':
+ error("The -c option is to be implemented, please open an issue on github\n");
args->min_inter_err = strtod(optarg,&tmp);
if ( *tmp )
{
if ( *tmp ) error("Could not parse: -c %s\n", optarg);
}
break;
- case 'G':
- args->no_PLs = strtol(optarg,&tmp,10);
- if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
- break;
- case 'a': args->all_sites = 1; break;
+ case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
+ case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
case 'H': args->hom_only = 1; break;
case 'g': args->gt_fname = optarg; break;
- case 'p': args->plot = optarg; break;
- case 'S': args->target_sample = optarg; break;
- case 's': args->query_sample = optarg; break;
- case 'r': regions = optarg; break;
- case 'R': regions = optarg; regions_is_file = 1; break;
- case 't': targets = optarg; break;
- case 'T': targets = optarg; targets_is_file = 1; break;
+// case 'p': args->plot = optarg; break;
+ case 's':
+ if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
+ else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
+ else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+ break;
+ case 'S':
+ if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
+ else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
+ else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+ break;
+ case 'p': args->pair_samples = optarg; break;
+ case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
+ case 'r': args->regions = optarg; break;
+ case 'R': args->regions = optarg; args->regions_is_file = 1; break;
+ case 't': args->targets = optarg; break;
+ case 'T': args->targets = optarg; args->targets_is_file = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
}
- char *fname = NULL;
if ( optind==argc )
{
- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-"; // reading from stdin
else usage(); // no files given
}
- else fname = argv[optind];
- if ( argc>optind+1 ) usage(); // too many files given
- if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode
- else args->files->require_index = 1;
- if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
- if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
- error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
- args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
- if ( args->plot ) args->plot = init_prefix(args->plot);
+ else args->qry_fname = argv[optind];
+ if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n"); // too many files given
+ if ( args->pair_samples )
+ {
+ if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
+ if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
+ }
+ if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
+ if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
+ if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+
init_data(args);
- if ( args->cross_check )
- cross_check_gts(args);
- else
- check_gt(args);
+
+ int ret;
+ while ( (ret=bcf_sr_next_line(args->files)) )
+ {
+ if ( !is_input_okay(args,ret) ) continue;
+
+ // time one record to give the user an estimate with very big files
+ struct timeval t0, t1;
+ if ( !args->ncmp ) gettimeofday(&t0, NULL);
+
+ process_line(args);
+
+ if ( args->ncmp==1 )
+ {
+ gettimeofday(&t1, NULL);
+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
+ fprintf(bcftools_stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
+ fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+ if ( args->dry_run ) break;
+ }
+ }
+ if ( !args->dry_run )
+ {
+ report(args);
+ if ( args->distinctive_sites ) report_distinctive_sites(args);
+ }
+
destroy_data(args);
- bcf_sr_destroy(args->files);
- if (args->plot) free(args->plot);
free(args);
return 0;
}
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014-2016 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
#include <stdio.h>
#include <stdlib.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#define BCF_LIDX_SHIFT 14
+enum {
+ per_contig = 1, // bit set in `stats` by -s/--stats
+ total = 2 // bit set in `stats` by -n/--nrecords
+};
+
static void usage(void)
{
fprintf(stderr, "\n");
fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
fprintf(stderr, " -f, --force overwrite index if it already exists\n");
fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(stderr, " -o, --output FILE optional output index file name\n");
fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
+/*
+ * Print record counts taken from the index of fname: one line per contig
+ * with at least one record (name, header length or ".", count) when stats
+ * has per_contig set, or a single total when it has total set. fname may
+ * be a VCF/BCF file, a "file<HTS_IDX_DELIM>index" pair, or an index file
+ * name ending in .csi/.tbi. Returns 0 on success and 1 on failure.
+ */
int vcf_index_stats(char *fname, int stats)
{
- const char **seq;
- int i, nseq;
+ const char **seq = NULL;
+ int tid, nseq = 0, ret = 0;
tbx_t *tbx = NULL;
+ bcf_hdr_t *hdr = NULL;
hts_idx_t *idx = NULL;
+ htsFile *fp = NULL;
+ uint64_t sum = 0;
+ char *fntemp = NULL, *fnidx = NULL;
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; }
- bcf_hdr_t *hdr = bcf_hdr_read(fp);
- if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; }
-
- if ( hts_get_format(fp)->format==vcf )
+ /*
+ * First, has the user provided an index file? If per contig stats
+ * are requested, open the variant file (together with the index file,
+ * if provided), since the contig names can only be retrieved from its
+ * header. Otherwise, use just the corresponding index file to count
+ * the total number of records.
+ */
+ int len = strlen(fname);
+ if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
+ fntemp = strdup(fname);
+ if ( !fntemp ) return 1;
+ fntemp[fnidx-fname] = 0;
+ fname = fntemp;
+ fnidx += strlen(HTS_IDX_DELIM);
+ }
+ else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
{
- tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+ fnidx = fname;
+ fntemp = strdup(fname);
+ if ( !fntemp ) return 1; // nothing else is allocated yet, a plain return is safe
+ fname = fntemp;
+ fname[len-4] = 0;
}
- else if ( hts_get_format(fp)->format==bcf )
+
+ if ( stats&per_contig )
{
- idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+ fp = hts_open(fname,"r");
+ if ( !fp ) {
+ fprintf(stderr,"Could not read %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ hdr = bcf_hdr_read(fp);
+ if ( !hdr ) {
+ fprintf(stderr,"Could not read the header: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+
+ // fp, hdr and possibly fntemp are live from here on: leave via cleanup, never by a bare return
+ if ( hts_get_format(fp)->format==vcf )
+ {
+ tbx = tbx_index_load2(fname, fnidx);
+ if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); ret = 1; goto cleanup; }
+ }
+ else if ( hts_get_format(fp)->format==bcf )
+ {
+ idx = bcf_index_load2(fname, fnidx);
+ if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); ret = 1; goto cleanup; }
+ }
+ else
+ {
+ fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
}
- else
+ else if ( fnidx )
{
- fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
- return 1;
+ char *ext = strrchr(fnidx, '.');
+ if ( ext && strcmp(ext, ".tbi") == 0 ) {
+ tbx = tbx_index_load2(fname, fnidx);
+ } else if ( ext && strcmp(ext, ".csi") == 0 ) {
+ idx = bcf_index_load2(fname, fnidx);
+ }
+ if ( !tbx && !idx ) {
+ fprintf(stderr,"Could not load index file '%s'\n", fnidx);
+ ret = 1; goto cleanup;
+ }
+ } else {
+ char *ext = strrchr(fname, '.');
+ if ( ext && strcmp(ext, ".bcf") == 0 ) {
+ idx = bcf_index_load(fname);
+ } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
+ tbx = tbx_index_load(fname);
+ }
}
- seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
- uint64_t sum = 0;
- for (i=0; i<nseq; i++)
+ if ( !tbx && !idx ) {
+ fprintf(stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
+ ret = 1; goto cleanup;
+ }
+
+ if ( tbx ) {
+ seq = tbx_seqnames(tbx, &nseq);
+ } else {
+ nseq = hts_idx_nseq(idx);
+ }
+
+ for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
- hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
- sum+=records;
- if (stats&2 || !records) continue;
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
- int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
- }
- if (!sum)
+ hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+ sum += records;
+ if ( (stats&total) || !records ) continue;
+ const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
+ if ( ctg_name ) {
+ bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+ }
+ }
+ if ( !sum )
{
// No counts found.
// Is this because index version has no stored count data, or no records?
bcf1_t *rec = bcf_init1();
- if (bcf_read1(fp, hdr, rec) >= 0)
- {
+ if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
- return 1;
+ ret = 1;
}
bcf_destroy1(rec);
}
- if (stats&2) printf("%" PRIu64 "\n", sum);
+ if ( (stats&total) && !ret ) {
+ printf("%" PRIu64 "\n", sum);
+ }
+
+cleanup:
free(seq);
- if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
+ free(fntemp);
+ if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
bcf_hdr_destroy(hdr);
if (tbx)
tbx_destroy(tbx);
if (idx)
hts_idx_destroy(idx);
- return 0;
+ return ret;
}
int main_vcfindex(int argc, char *argv[])
{"nrecords",no_argument,NULL,'n'},
{"threads",required_argument,NULL,9},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
- case 's': stats |= 1; break;
- case 'n': stats |= 2; break;
+ case 's': stats |= per_contig; break;
+ case 'n': stats |= total; break;
case 9:
n_threads = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
default: usage();
}
}
- if (stats>2)
+ if (stats > total)
{
fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
return 1;
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
- Copyright (C) 2014-2016 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
#include <stdio.h>
#include <stdlib.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#define BCF_LIDX_SHIFT 14
+enum {
+ per_contig = 1, // bit set in `stats` by -s/--stats
+ total = 2 // bit set in `stats` by -n/--nrecords
+};
+
static void usage(void)
{
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
fprintf(bcftools_stderr, " -f, --force overwrite index if it already exists\n");
fprintf(bcftools_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(bcftools_stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(bcftools_stderr, " -o, --output FILE optional output index file name\n");
fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n");
fprintf(bcftools_stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
+/*
+ * Print record counts taken from the index of fname: one line per contig
+ * with at least one record (name, header length or ".", count) when stats
+ * has per_contig set, or a single total when it has total set. fname may
+ * be a VCF/BCF file, a "file<HTS_IDX_DELIM>index" pair, or an index file
+ * name ending in .csi/.tbi. Returns 0 on success and 1 on failure.
+ */
int vcf_index_stats(char *fname, int stats)
{
- const char **seq;
- int i, nseq;
+ const char **seq = NULL;
+ int tid, nseq = 0, ret = 0;
tbx_t *tbx = NULL;
+ bcf_hdr_t *hdr = NULL;
hts_idx_t *idx = NULL;
+ htsFile *fp = NULL;
+ uint64_t sum = 0;
+ char *fntemp = NULL, *fnidx = NULL;
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) { fprintf(bcftools_stderr,"Could not read %s\n", fname); return 1; }
- bcf_hdr_t *hdr = bcf_hdr_read(fp);
- if ( !hdr ) { fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); return 1; }
-
- if ( hts_get_format(fp)->format==vcf )
+ /*
+ * First, has the user provided an index file? If per contig stats
+ * are requested, open the variant file (together with the index file,
+ * if provided), since the contig names can only be retrieved from its
+ * header. Otherwise, use just the corresponding index file to count
+ * the total number of records.
+ */
+ int len = strlen(fname);
+ if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
+ fntemp = strdup(fname);
+ if ( !fntemp ) return 1;
+ fntemp[fnidx-fname] = 0;
+ fname = fntemp;
+ fnidx += strlen(HTS_IDX_DELIM);
+ }
+ else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
{
- tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+ fnidx = fname;
+ fntemp = strdup(fname);
+ if ( !fntemp ) return 1; // nothing else is allocated yet, a plain return is safe
+ fname = fntemp;
+ fname[len-4] = 0;
}
- else if ( hts_get_format(fp)->format==bcf )
+
+ if ( stats&per_contig )
{
- idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+ fp = hts_open(fname,"r");
+ if ( !fp ) {
+ fprintf(bcftools_stderr,"Could not read %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ hdr = bcf_hdr_read(fp);
+ if ( !hdr ) {
+ fprintf(bcftools_stderr,"Could not read the header: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+
+ // fp, hdr and possibly fntemp are live from here on: leave via cleanup, never by a bare return
+ if ( hts_get_format(fp)->format==vcf )
+ {
+ tbx = tbx_index_load2(fname, fnidx);
+ if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); ret = 1; goto cleanup; }
+ }
+ else if ( hts_get_format(fp)->format==bcf )
+ {
+ idx = bcf_index_load2(fname, fnidx);
+ if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); ret = 1; goto cleanup; }
+ }
+ else
+ {
+ fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
}
- else
+ else if ( fnidx )
{
- fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
- return 1;
+ char *ext = strrchr(fnidx, '.');
+ if ( ext && strcmp(ext, ".tbi") == 0 ) {
+ tbx = tbx_index_load2(fname, fnidx);
+ } else if ( ext && strcmp(ext, ".csi") == 0 ) {
+ idx = bcf_index_load2(fname, fnidx);
+ }
+ if ( !tbx && !idx ) {
+ fprintf(bcftools_stderr,"Could not load index file '%s'\n", fnidx);
+ ret = 1; goto cleanup;
+ }
+ } else {
+ char *ext = strrchr(fname, '.');
+ if ( ext && strcmp(ext, ".bcf") == 0 ) {
+ idx = bcf_index_load(fname);
+ } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
+ tbx = tbx_index_load(fname);
+ }
}
- seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
- uint64_t sum = 0;
- for (i=0; i<nseq; i++)
+ if ( !tbx && !idx ) {
+ fprintf(bcftools_stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
+ ret = 1; goto cleanup;
+ }
+
+ if ( tbx ) {
+ seq = tbx_seqnames(tbx, &nseq);
+ } else {
+ nseq = hts_idx_nseq(idx);
+ }
+
+ for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
- hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
- sum+=records;
- if (stats&2 || !records) continue;
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
- int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
- }
- if (!sum)
+ hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+ sum += records;
+ if ( (stats&total) || !records ) continue;
+ const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
+ if ( ctg_name ) {
+ bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+ }
+ }
+ if ( !sum )
{
// No counts found.
// Is this because index version has no stored count data, or no records?
bcf1_t *rec = bcf_init1();
- if (bcf_read1(fp, hdr, rec) >= 0)
- {
+ if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
fprintf(bcftools_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
- return 1;
+ ret = 1;
}
bcf_destroy1(rec);
}
- if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum);
+ if ( (stats&total) && !ret ) {
+ fprintf(bcftools_stdout, "%" PRIu64 "\n", sum);
+ }
+
+cleanup:
free(seq);
- if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
+ free(fntemp);
+ if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
bcf_hdr_destroy(hdr);
if (tbx)
tbx_destroy(tbx);
if (idx)
hts_idx_destroy(idx);
- return 0;
+ return ret;
}
int main_vcfindex(int argc, char *argv[])
{"nrecords",no_argument,NULL,'n'},
{"threads",required_argument,NULL,9},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
- case 's': stats |= 1; break;
- case 'n': stats |= 2; break;
+ case 's': stats |= per_contig; break;
+ case 'n': stats |= total; break;
case 9:
n_threads = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
default: usage();
}
}
- if (stats>2)
+ if (stats > total)
{
fprintf(bcftools_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
return 1;
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( args->targets_list && files->nreaders==1 ) out_std = 1;
if ( out_std )
{
- out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
#define OPEN_FILE(i,j) { \
open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
- args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
+ args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
}
#undef OPEN_FILE
-
- args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
- if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
}
+ args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+ if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
}
else {
if (args->output_fname) {
/* vcfisec.c -- Create intersections, unions and complements of VCF files.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( args->targets_list && files->nreaders==1 ) out_std = 1;
if ( out_std )
{
- out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
#define OPEN_FILE(i,j) { \
open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
- args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \
+ args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
}
#undef OPEN_FILE
-
- args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
- if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
}
+ args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+ if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
}
else {
if (args->output_fname) {
fprintf(bcftools_stderr, " # Extract records private to A or B comparing by position only\n");
fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfisec(int argc, char *argv[])
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+#define PL2PROB_MAX 1024
+
// For merging INFO Number=A,G,R tags
typedef struct
{
gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line
int nout_smpl;
kstring_t *str;
+ int32_t *laa; // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first
+ int nlaa, laa_dirty; // number of LAA alleles actually used at this site, and was any L* added?
+ int32_t *tmpi, *k2k;
+ double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability
+ int ntmpi, ntmpd, nk2k;
}
maux_t;
maux_t *maux;
regidx_t *regs; // apply regions only after the blocks are expanded
regitr_t *regs_itr;
- int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
faidx_t *gvcf_fai;
info_rule_t *rules;
bcf_hdr_t *out_hdr;
char **argv;
int argc, n_threads, record_cmd_line;
+ int local_alleles; // the value of -L option
}
args_t;
bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
}
else
+ {
+ int isrc, idst = 0;
+ #define BRANCH(type_t,is_missing,is_vector_end) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (isrc=0; isrc<rule->nvals; isrc++) \
+ { \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \
+ idst++; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, ptr[isrc]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ rule->nvals = idst;
bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+ }
}
static int info_rules_comp_key2(const void *a, const void *b)
if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
- else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
) ? 1 : 0;
- if ( is_join && is_agr )
- error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+ if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR )
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL);
+ hrec = bcf_hrec_dup(hrec);
+ int i = bcf_hrec_find_key(hrec, "Number");
+ if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag);
+ free(hrec->vals[i]);
+ hrec->vals[i] = strdup(".");
+ bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag);
+ bcf_hdr_add_hrec(args->out_hdr, hrec);
+ }
if ( !is_join && !is_agr )
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) );
if ( args->do_gvcf )
{
- ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
for (i=0; i<ma->n; i++)
ma->gvcf[i].line = bcf_init1();
}
for (i=0; i<ma->n; i++)
ma->buf[i].rid = -1;
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
+ if ( args->local_alleles )
+ {
+ ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles));
+ ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob));
+ for (i=0; i<PL2PROB_MAX; i++)
+ ma->pl2prob[i] = pow(10,-0.1*i);
+ }
return ma;
}
void maux_destroy(maux_t *ma)
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
free(ma->chr);
+ free(ma->laa);
+ free(ma->tmpi);
+ free(ma->k2k);
+ free(ma->tmpd);
+ free(ma->pl2prob);
free(ma);
}
void maux_expand1(buffer_t *buf, int size)
return max_ploidy;
}
+// Sets ma->laa to local indexes relevant for each sample or missing/vector_end.
+// The indexes are with respect to the source indexes and must be translated as
+// the very last step.
+// ifmt_PL is the index of FORMAT/PL in ma->fmt_map, or -1 when the tag is absent.
+// On return ma->nlaa is the number of local ALT alleles used at this site, or 0
+// when alleles had to be subset but no PL was available.
+void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
+{
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ int i,j,k,l, ismpl = 0, nlaa = 0;
+ static int warned = 0;
+
+ hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd); // allele probabilities
+ hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi); // indexes of the sorted probabilities
+
+ // Let map[] be the mapping from src to output idx. Then k2k[] is mapping from src allele idxs to src allele idxs
+ // reordered so that if i<j then map[k2k[i]] < map[k2k[j]]
+ hts_expand(int,out->n_allele,ma->nk2k,ma->k2k);
+
+ // Determine local alleles: either take all that are present in the reader or use PL to determine the best
+ // subset for each sample. The alleles must be listed in the order of the alleles in the output file.
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = ifmt_PL>=0 ? ma->fmt_map[files->nreaders*ifmt_PL+i] : NULL; // never index fmt_map with the "PL absent" value -1
+ bcf1_t *line = maux_get_line(args, i);
+ int nsmpl = bcf_hdr_nsamples(hdr);
+ if ( line )
+ {
+ if ( nlaa < line->n_allele - 1 )
+ nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles;
+
+ for (j=0; j<line->n_allele; j++) ma->k2k[j] = j;
+
+ if ( line->n_allele <= args->local_alleles + 1 )
+ {
+ // sort to the output order, insertion sort, ascending
+ int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+ int *k2k = ma->k2k;
+ int tmp;
+ for (k=1; k<line->n_allele; k++)
+ for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--)
+ tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp;
+
+ // fewer than the allowed number of alleles, use all alleles from this file
+ for (j=0; j<nsmpl; j++)
+ {
+ int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+ for (k=0; k<line->n_allele; k++) ptr[k] = k2k[k];
+ for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+ ismpl++;
+ }
+ continue;
+ }
+ }
+ // when PL is absent altogether (ifmt_PL==-1) fall through to the warning below instead of filling missing values
+ if ( !line || (ifmt_PL!=-1 && !fmt_ori) )
+ {
+ // no values, fill in missing values
+ for (j=0; j<nsmpl; j++)
+ {
+ int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+ ptr[0] = bcf_int32_missing;
+ for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+ ismpl++;
+ }
+ continue;
+ }
+
+ // there are more alternate alleles in the input files than is allowed on output, need to subset
+ if ( ifmt_PL==-1 )
+ {
+ if ( !warned )
+ fprintf(stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n");
+ warned = 1;
+ ma->nlaa = 0;
+ return;
+ }
+
+ if ( !IS_VL_G(hdr, fmt_ori->id) ) error("FORMAT/PL must be defined as Number=G\n");
+ if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n");
+
+ int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+ double *allele_prob = ma->tmpd;
+ int *idx = ma->tmpi;
+ #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+ src_type_t *src = (src_type_t*) fmt_ori->p; \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
+ for (k=0; k<line->n_allele; k++) \
+ for (l=0; l<=k; l++) \
+ { \
+ if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
+ double prob = ma->pl2prob[pl2prob_idx]; \
+ allele_prob[k] += prob; \
+ allele_prob[l] += prob; \
+ src++; \
+ } \
+ /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
+ allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
+ int si,sj,tmp; \
+ for (si=0; si<line->n_allele-1; si++) idx[si] = si; \
+ for (si=1; si<line->n_allele-1; si++) \
+ for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \
+ tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \
+ /*for debugging only: test order*/ \
+ for (si=1; si<line->n_allele-1; si++) \
+ assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \
+ allele_prob--; idx--; /* this was to keep REF first */ \
+ int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \
+ ptr[0] = 0; \
+ for (k=1; k<=args->local_alleles && k<line->n_allele; k++) ptr[k] = idx[k]+1; \
+ int kmax = k; \
+ for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \
+ /* insertion sort by indexes to the output order, ascending */ \
+ for (k=1; k<kmax; k++) \
+ for (l=k; l>0 && map[ptr[l]] < map[ptr[l-1]]; l--) \
+ tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \
+ ismpl++; \
+ } \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH( int8_t, *src==bcf_int8_missing, *src==bcf_int8_vector_end, *src>=0 ? *src : PL2PROB_MAX-1); break;
+ case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+ default: error("Unexpected case: %d, PL\n", fmt_ori->type);
+ }
+ #undef BRANCH
+ }
+ ma->nlaa = nlaa;
+}
+
+// Compacts ma->laa in place from per-sample blocks of (1+args->local_alleles)
+// source allele indexes (slot 0 is skipped) into blocks of ma->nlaa output
+// allele indexes, translated through each reader's map, and writes the result
+// as FORMAT/LAA. A sample with no usable value gets a single missing value;
+// unused trailing slots are padded with vector_end.
+void update_local_alleles(args_t *args, bcf1_t *out)
+{
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr);
+ for (i=0; i<files->nreaders; i++)
+ {
+ int irec = ma->buf[i].cur;
+ bcf_sr_t *reader = &files->readers[i];
+ int nsmpl = bcf_hdr_nsamples(reader->header);
+ for (k=0; k<nsmpl; k++)
+ {
+ int32_t *src = ma->laa + ismpl*(1+args->local_alleles);
+ int32_t *dst = ma->laa + ismpl*ma->nlaa;
+ j = 0;
+ if ( irec>=0 )
+ {
+ for (; j<ma->nlaa; j++)
+ {
+ if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing;
+ else if ( src[j+1]==bcf_int32_vector_end ) break;
+ else
+ dst[j] = ma->buf[i].rec[irec].map[src[j+1]];
+ }
+ }
+ if ( j==0 ) dst[j++] = bcf_int32_missing;
+ for (; j<ma->nlaa; j++) dst[j] = bcf_int32_vector_end; // pad the compacted block, not the already-consumed source block
+ ismpl++;
+ }
+ }
+ bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa);
+}
+
void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
{
bcf_srs_t *files = args->files;
int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
static int warned = 0;
- int nsize = 0, msize = sizeof(int32_t);
+ int nsize = 0;
for (i=0; i<files->nreaders; i++)
{
bcf_fmt_t *fmt = fmt_map[i];
}
if ( nsize==0 ) nsize = 1;
- if ( ma->ntmp_arr < nsamples*nsize*msize )
+ size_t msize = sizeof(int32_t)*nsize*nsamples;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nsize*msize;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+ ma->ntmp_arr = msize;
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew);
if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret);
}
+ if ( nmax < str->l ) nmax = str->l;
src += fmt_ori->size;
}
continue;
"If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key);
}
// update the record
- if ( ma->ntmp_arr < nsamples*nmax )
+ size_t msize = nsamples*nmax;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nmax;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+ ma->ntmp_arr = msize;
}
char *tgt = (char*) ma->tmp_arr;
for (i=0; i<nsamples; i++)
bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax);
}
+/* Writes the localized counterpart of a Number=G FORMAT tag to the output record under
+ * the name "L" + original tag name. For every output sample the values are picked from
+ * the reader's vector at the genotype indexes (bcf_alleles2gt) of all pairs of that
+ * sample's local alleles stored in ma->laa; the unused tail is padded with vector_end.
+ * Samples of readers that have no value for the tag (NULL entry in fmt_map) get a single
+ * missing value followed by vector_end.
+ *
+ * args     merge state; args->maux->laa and args->maux->nlaa must be set for this site
+ * fmt_map  one entry per reader for the tag being merged, NULL where the tag is absent
+ * out      output record to update
+ * irdr     index of the reader whose fmt_map entry supplies the tag name and the output
+ *          type; fmt_map[irdr] is dereferenced unconditionally
+ *
+ * Note: only diploid Number=G tags are supported for now, any other vector length stops
+ * the program with the "Todo" error below. If the row would need more than 2147483647
+ * bytes, the tag is skipped for this record with a warning printed once per run.
+ */
+void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{
+ int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ bcf_fmt_t *fmt = fmt_map[irdr];
+ const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key;
+ size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2; // max number of Number=G localized fields
+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ msize *= nsamples*nsize; // bytes for the whole row, sized for the wider of float/int32_t
+ if ( msize > 2147483647 )
+ {
+ static int warned = 0;
+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
+ }
+ int ismpl = 0; // running sample index across all readers, selects the block in tmp_arr and laa
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int nsmpl = bcf_hdr_nsamples(hdr);
+
+ if ( !fmt_ori )
+ {
+ // the tag is absent in this reader: one missing value per sample, the rest is vector_end
+ #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ tgt_set_missing; \
+ for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+ ismpl++; \
+ } \
+ }
+ switch (fmt->type) // all integer widths are written as int32_t
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ default: error("Unexpected case: %d, %s\n", fmt->type, key);
+ }
+ #undef BRANCH
+ continue;
+ }
+ if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n"); // expects n_allele*(n_allele+1)/2 values per sample
+
+ // localize: copy the values of all pairs (ii,ij<=ii) of the sample's local alleles
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+ int ii,ij,tgt_idx = 0; \
+ for (ii=0; ii<=ma->nlaa; ii++) \
+ { \
+ if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+ for (ij=0; ij<=ii; ij++) \
+ { \
+ int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \
+ if ( src_is_missing ) tgt_set_missing; \
+ else if ( src_is_vector_end ) break; \
+ else tgt[tgt_idx] = src[src_idx]; \
+ tgt_idx++; \
+ } \
+ } \
+ if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+ for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+ ismpl++; \
+ } \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+ default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+ }
+ #undef BRANCH
+ }
+ args->tmps.l = 0; // build the output tag name: "L" followed by the original key
+ kputc('L',&args->tmps);
+ kputs(key,&args->tmps);
+ if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+ bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+ ma->laa_dirty = 1; // records that an L* tag was added for this site
+}
+void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{ /* Writes the localized counterpart of a Number=A or Number=R FORMAT tag to the output
+   * record under the name "L" + original tag name. For every output sample the values of
+   * that sample's local alleles (ma->laa) are copied from the reader's vector and the
+   * unused tail is padded with vector_end. Samples of readers without the tag (NULL entry
+   * in fmt_map) get a single missing value followed by vector_end.
+   *
+   * args     merge state; args->maux->laa and args->maux->nlaa must be set for this site
+   * fmt_map  one entry per reader for the tag being merged, NULL where the tag is absent
+   * out      output record to update
+   * irdr     index of the reader whose fmt_map entry supplies the tag name, the Number
+   *          (A vs R) and the output type; fmt_map[irdr] is dereferenced unconditionally
+   *
+   * If the row would need more than 2147483647 bytes, the tag is skipped for this record
+   * with a warning printed once per run.
+   */
+ int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ bcf_fmt_t *fmt = fmt_map[irdr];
+ const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key;
+ size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa; // Number=R carries one extra value compared to Number=A
+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ msize *= nsamples*nsize; // bytes for the whole row, sized for the wider of float/int32_t
+ if ( msize > 2147483647 )
+ {
+ static int warned = 0;
+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
+ }
+ int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 0 : 1;; // ibeg: first laa entry to use, 0 for Number=R, 1 for Number=A
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ int nsmpl = bcf_hdr_nsamples(hdr);
+
+ if ( !fmt_ori )
+ {
+ // the tag is absent in this reader: one missing value per sample, the rest is vector_end
+ #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ tgt_set_missing; \
+ for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+ ismpl++; \
+ } \
+ }
+ switch (fmt->type) // all integer widths are written as int32_t
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ default: error("Unexpected case: %d, %s\n", fmt->type, key);
+ }
+ #undef BRANCH
+ continue;
+ }
+
+ // localize: copy the value of each local allele, the source index is shifted by ibeg
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+ int ii,tgt_idx = 0; \
+ for (ii=ibeg; ii<=ma->nlaa; ii++) \
+ { \
+ if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+ int src_idx = laa[ii] - ibeg; \
+ if ( src_is_missing ) tgt_set_missing; \
+ else if ( src_is_vector_end ) break; \
+ else tgt[tgt_idx] = src[src_idx]; \
+ tgt_idx++; \
+ } \
+ if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+ for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+ ismpl++; \
+ } \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+ default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+ }
+ #undef BRANCH
+ }
+ args->tmps.l = 0; // build the output tag name: "L" followed by the original key
+ kputc('L',&args->tmps);
+ kputs(key,&args->tmps);
+ if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+ bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+ ma->laa_dirty = 1; // records that an L* tag was added for this site
+}
void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
{
bcf_srs_t *files = args->files;
}
if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
}
+ if ( ma->nlaa && length!=BCF_VL_FIXED )
+ {
+ if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i);
+ else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i);
+ return;
+ }
+
if ( type==BCF_BT_CHAR )
{
merge_format_string(args, key, fmt_map, out, length, nsize);
}
size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
- if ( ma->ntmp_arr < nsamples*nsize*msize )
+ msize *= nsamples*nsize;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nsize*msize;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
}
// Fill the temp array for all samples by collecting values from all files
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+ int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
bcf1_t *line = maux_get_line(args,i);
memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
ma->nfmt_map = max_ifmt+1;
}
+ if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; }
}
kitr = kh_put(strdict, tmph, key, &ret);
kh_value(tmph, kitr) = ifmt;
ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
+ if ( args->local_alleles )
+ {
+ ma->laa_dirty = ma->nlaa = 0;
+ if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL);
+ }
+
out->n_sample = bcf_hdr_nsamples(out_hdr);
if ( has_GT )
merge_GT(args, ma->fmt_map, out);
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+
+ if ( ma->laa_dirty )
+ update_local_alleles(args, out);
+
out->d.indiv_dirty = 1;
}
}
}
+/*
+ *  Returns 1 if the record is treated as a gVCF block, 0 otherwise. A record
+ *  qualifies when it is longer than one base (rlen>1), its REF string does not
+ *  span the whole rlen, and it either has no ALT allele or one of its ALT
+ *  alleles is the symbolic <*>, <NON_REF> or <X>.
+ *  Reads line->d.allele, so the caller must pass a record whose alleles are
+ *  already available there.
+ */
+static inline int is_gvcf_block(bcf1_t *line)
+{
+    // single-base records and records whose REF covers the full length are not blocks
+    if ( line->rlen<=1 || strlen(line->d.allele[0])==line->rlen ) return 0;
+
+    // REF-only record that is longer than its REF string
+    if ( line->n_allele==1 ) return 1;
+
+    // otherwise look for one of the symbolic reference-block alleles among the ALTs
+    int ialt = 1;
+    while ( ialt < line->n_allele )
+    {
+        const char *alt = line->d.allele[ialt++];
+        if ( strcmp(alt,"<*>")==0 || strcmp(alt,"<NON_REF>")==0 || strcmp(alt,"<X>")==0 ) return 1;
+    }
+    return 0;
+}
+static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
+
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
maux->gvcf_min = INT_MAX;
for (i=0; i<files->nreaders; i++)
{
+ if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0;
if ( gaux[i].active )
{
// gvcf block should not overlap with another record
int irec = maux->buf[i].beg;
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
bcf1_t *line = args->files->readers[i].buffer[irec];
- int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
if ( ret==1 )
{
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
fprintf(stderr,"\n");
}
-
/*
Determine which line should be merged from which reader: go through all
readers and all buffered lines, expand REF,ALT and try to match lines with
int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
}
maux->var_types = maux->nals = 0;
+ // this is only for the `-m none -g` mode, ensure that <*> lines come last
+ #define VCF_GVCF_REF 1
+
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
buf->rec[j].skip = SKIP_DIFF;
ntodo++;
+ bcf1_t *line = buf->lines[j];
if ( args->merge_by_id )
- id = buf->lines[j]->d.id;
+ id = line->d.id;
else
{
- int var_type = bcf_get_variant_types(buf->lines[j]);
- maux->var_types |= var_type ? var_type<<1 : 1;
+ int var_type = bcf_get_variant_types(line);
+ maux->var_types |= var_type ? var_type<<2 : 2;
+
+ // for the `-m none -g` mode
+ if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
+ maux->var_types |= VCF_GVCF_REF;
}
}
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
int line_type = bcf_get_variant_types(line);
- line_type = line_type ? line_type<<1 : 1;
+ line_type = line_type ? line_type<<2 : 2;
// select relevant lines
if ( args->merge_by_id )
}
else
{
+ // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
+ // records come last, otherwise infinite loop is created (#1164)
+ if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
+ {
+ if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
+ }
if ( args->collapse==COLLAPSE_NONE && maux->nals )
{
// All alleles of the tested record must be present in the
*/
void stage_line(args_t *args)
{
- int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
void merge_line(args_t *args)
{
- if ( args->regs )
- {
- if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
- }
-
bcf1_t *out = args->out_line;
merge_chrom2qual(args, out);
+ if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return;
merge_filter(args, out);
merge_info(args, out);
if ( args->do_gvcf )
error_errno("[%s] Failed to add program information to header", __func__);
}
+/*
+ *  Adds header definitions of the localized FORMAT tags. For every FORMAT tag
+ *  defined as Number=A, Number=R or Number=G a copy of its header line is
+ *  created with the ID prefixed by "L", Number set to "." and, when the
+ *  description is quoted, "Localized field: " prepended to the description;
+ *  the IDX key is dropped. If at least one such tag exists, the FORMAT/LAA
+ *  definition is added as well.
+ *
+ *  The new lines are collected first and appended only after the scan, so that
+ *  hdr->hrec is not modified while it is being iterated.
+ *
+ *  args   only args->tmps is used, as a scratch buffer for formatting
+ *  hdr    header to scan and to extend
+ *
+ *  Formatting, allocation and header update failures terminate via error().
+ */
+void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr)
+{
+    char **str = NULL;
+    int i,j, nstr = 0, mstr = 0;
+    for (i=0; i<hdr->nhrec; i++)
+    {
+        bcf_hrec_t *hrec = hdr->hrec[i];
+        if ( hrec->type!=BCF_HL_FMT ) continue;
+        j = bcf_hrec_find_key(hrec,"ID");
+        if ( j<0 ) continue;
+        char *key = hrec->vals[j];
+        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+        assert( id>=0 );
+
+        // only tags whose length depends on the number of alleles are localized
+        int length = bcf_hdr_id2length(hdr,BCF_HL_FMT,id);
+        if ( length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R ) continue;
+        args->tmps.l = 0;
+
+        uint32_t e = 0, nout = 0;
+        e |= ksprintf(&args->tmps, "##%s=<", hrec->key) < 0;
+        for (j=0; j<hrec->nkeys; j++)
+        {
+            if ( !strcmp("IDX",hrec->keys[j]) ) continue;
+            if ( nout ) e |= kputc(',',&args->tmps) < 0;
+            if ( !strcmp("ID",hrec->keys[j]) )
+                e |= ksprintf(&args->tmps,"%s=L%s", hrec->keys[j], hrec->vals[j]) < 0;
+            else if ( !strcmp("Number",hrec->keys[j]) )
+                e |= ksprintf(&args->tmps,"Number=.") < 0;
+            else if ( !strcmp("Description",hrec->keys[j]) && hrec->vals[j][0]=='"' )
+                e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", hrec->vals[j]+1) < 0;
+            else
+                e |= ksprintf(&args->tmps,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
+            nout++;
+        }
+        e |= ksprintf(&args->tmps,">\n") < 0;
+        if ( e ) error("Failed to format the header line for %s\n", key);
+        nstr++;
+        hts_expand(char*,nstr,mstr,str);
+        str[nstr-1] = strdup(args->tmps.s);
+        // a NULL here would otherwise be handed to bcf_hdr_append() below
+        if ( !str[nstr-1] ) error("Failed to allocate memory for the header line of L%s\n", key);
+    }
+    if ( !nstr ) return;
+    if ( bcf_hdr_append(hdr,"##FORMAT=<ID=LAA,Number=.,Type=Integer,Description=\"Localized alleles: subset of alternate alleles relevant for each sample\">") < 0 )
+        error("Failed to add the FORMAT/LAA header line\n");
+    for (i=0; i<nstr; i++)
+    {
+        // str[i] already ends with a newline
+        if ( bcf_hdr_append(hdr, str[i]) < 0 ) error("Failed to add the header line: %s", str[i]);
+        free(str[i]);
+    }
+    free(str);
+}
void merge_vcf(args_t *args)
{
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
char buf[24]; snprintf(buf,sizeof buf,"%d",i+1);
merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
}
+ if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr);
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
if (bcf_hdr_sync(args->out_hdr) < 0)
error_errno("[%s] Failed to update header", __func__);
fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
+ fprintf(stderr, " -L, --local-alleles <int> EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"local-alleles",required_argument,NULL,'L'},
{"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
{"missing-to-ref",no_argument,NULL,'0'},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"no-index",no_argument,NULL,10},
{"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'L':
+ args->local_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
+ if ( args->local_alleles < 1 )
+ error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg);
+ break;
case 'F':
if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
case 3 : args->force_samples = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->no_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( argc==optind && !args->file_list ) usage();
if ( argc-optind<2 && !args->file_list ) usage();
- args->files->require_index = 1;
+ if ( args->no_index )
+ {
+ if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
+ bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX);
+ }
+ else
+ bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
#include <strings.h>
+#include <assert.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+#define PL2PROB_MAX 1024
+
// For merging INFO Number=A,G,R tags
typedef struct
{
gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line
int nout_smpl;
kstring_t *str;
+ int32_t *laa; // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first
+ int nlaa, laa_dirty; // number of LAA alleles actually used at this site, and was any L* added?
+ int32_t *tmpi, *k2k;
+ double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability
+ int ntmpi, ntmpd, nk2k;
}
maux_t;
maux_t *maux;
regidx_t *regs; // apply regions only after the blocks are expanded
regitr_t *regs_itr;
- int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
faidx_t *gvcf_fai;
info_rule_t *rules;
bcf_hdr_t *out_hdr;
char **argv;
int argc, n_threads, record_cmd_line;
+ int local_alleles; // the value of -L option
}
args_t;
bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
}
else
+ {
+ int isrc, idst = 0;
+ #define BRANCH(type_t,is_missing,is_vector_end) { \
+ type_t *ptr = (type_t*) rule->vals; \
+ for (isrc=0; isrc<rule->nvals; isrc++) \
+ { \
+ if ( is_vector_end ) break; \
+ if ( is_missing ) continue; \
+ if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \
+ idst++; \
+ } \
+ }
+ switch (rule->type) {
+ case BCF_HT_INT: BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, ptr[isrc]==bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break;
+ default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+ }
+ #undef BRANCH
+
+ rule->nvals = idst;
bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+ }
}
static int info_rules_comp_key2(const void *a, const void *b)
if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
- else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
) ? 1 : 0;
- if ( is_join && is_agr )
- error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+ if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR )
+ {
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL);
+ hrec = bcf_hrec_dup(hrec);
+ int i = bcf_hrec_find_key(hrec, "Number");
+ if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag);
+ free(hrec->vals[i]);
+ hrec->vals[i] = strdup(".");
+ bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag);
+ bcf_hdr_add_hrec(args->out_hdr, hrec);
+ }
if ( !is_join && !is_agr )
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) );
if ( args->do_gvcf )
{
- ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
for (i=0; i<ma->n; i++)
ma->gvcf[i].line = bcf_init1();
}
for (i=0; i<ma->n; i++)
ma->buf[i].rid = -1;
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
+ if ( args->local_alleles )
+ {
+ ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles));
+ ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob));
+ for (i=0; i<PL2PROB_MAX; i++)
+ ma->pl2prob[i] = pow(10,-0.1*i);
+ }
return ma;
}
void maux_destroy(maux_t *ma)
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
free(ma->chr);
+ free(ma->laa);
+ free(ma->tmpi);
+ free(ma->k2k);
+ free(ma->tmpd);
+ free(ma->pl2prob);
free(ma);
}
void maux_expand1(buffer_t *buf, int size)
case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
- default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
}
#undef BRANCH
}
case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
- default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+ default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
}
#undef BRANCH
}
return max_ploidy;
}
+// Sets ma->laa to local indexes relevant for each sample or missing/vector_end.
+// The indexes are with respect to the source indexes and must be translated as
+// the very last step (see update_local_alleles).
+//
+//  args      merge state; args->local_alleles is the maximum number of ALT alleles kept per sample
+//  out       output record, only out->n_allele is used (to size the work arrays)
+//  ifmt_PL   row of FORMAT/PL in ma->fmt_map, or -1 if no reader has FORMAT/PL at this site
+//
+// For each reader:
+//  - if the reader's record has at most args->local_alleles ALT alleles, all its alleles are used,
+//    listed in the order of the output alleles;
+//  - if the reader has no record, or has a record but no PL while PL exists elsewhere, its samples
+//    are set to missing;
+//  - otherwise the ALT alleles with the highest summed PL-derived probability are selected per
+//    sample, REF always first, and listed in the order of the output alleles.
+// On success ma->nlaa is set to the number of local ALT alleles used at this site. If subsetting is
+// required but FORMAT/PL is not available, a warning is printed once and ma->nlaa is set to 0.
+void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,l, ismpl = 0, nlaa = 0;
+    static int warned = 0;
+
+    hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd);    // allele probabilities
+    hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi);       // indexes of the sorted probabilities
+
+    // Let map[] be the mapping from src to output idx. Then k2k[] is mapping from src allele idxs to src allele idxs
+    // reordered so that if i<j then map[k2k[i]] < map[k2k[j]]
+    hts_expand(int,out->n_allele,ma->nk2k,ma->k2k);
+
+    // Determine local alleles: either take all that are present in the reader or use PL to determine the best
+    // subset for each sample. The alleles must be listed in the order of the alleles in the output file.
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        // ifmt_PL is -1 when PL is absent; indexing fmt_map with it would read before the array
+        bcf_fmt_t *fmt_ori = ifmt_PL>=0 ? ma->fmt_map[files->nreaders*ifmt_PL+i] : NULL;
+        bcf1_t *line = maux_get_line(args, i);
+        int nsmpl = bcf_hdr_nsamples(hdr);
+        if ( line )
+        {
+            if ( nlaa < line->n_allele - 1 )
+                nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles;
+
+            for (j=0; j<line->n_allele; j++) ma->k2k[j] = j;
+
+            if ( line->n_allele <= args->local_alleles + 1 )
+            {
+                // sort to the output order, insertion sort, ascending
+                int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+                int *k2k = ma->k2k;
+                int tmp;
+                for (k=1; k<line->n_allele; k++)
+                    for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--)
+                        tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp;
+
+                // fewer than the allowed number of alleles, use all alleles from this file
+                for (j=0; j<nsmpl; j++)
+                {
+                    int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                    for (k=0; k<line->n_allele; k++) ptr[k] = k2k[k];
+                    for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                    ismpl++;
+                }
+                continue;
+            }
+        }
+        // A record without PL is filled with missing values only when PL exists at this site at all;
+        // when it does not (ifmt_PL==-1), fall through to the warning below
+        if ( !line || (ifmt_PL>=0 && !fmt_ori) )
+        {
+            // no values, fill in missing values
+            for (j=0; j<nsmpl; j++)
+            {
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                ptr[0] = bcf_int32_missing;
+                for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                ismpl++;
+            }
+            continue;
+        }
+
+        // there are more alternate alleles in the input files than is allowed on output, need to subset
+        if ( ifmt_PL==-1 )
+        {
+            if ( !warned )
+                fprintf(bcftools_stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n");
+            warned = 1;
+            ma->nlaa = 0;
+            return;
+        }
+
+        if ( !IS_VL_G(hdr, fmt_ori->id) ) error("FORMAT/PL must be defined as Number=G\n");
+        if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n");
+
+        int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+        double *allele_prob = ma->tmpd;
+        int *idx = ma->tmpi;
+        #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+            src_type_t *src = (src_type_t*) fmt_ori->p; \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
+                for (k=0; k<line->n_allele; k++) \
+                    for (l=0; l<=k; l++) \
+                    { \
+                        if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
+                        double prob = ma->pl2prob[pl2prob_idx]; \
+                        allele_prob[k] += prob; \
+                        allele_prob[l] += prob; \
+                        src++; \
+                    } \
+                /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
+                allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
+                int si,sj,tmp; \
+                for (si=0; si<line->n_allele-1; si++) idx[si] = si; \
+                for (si=1; si<line->n_allele-1; si++) \
+                    for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \
+                        tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \
+                /*for debugging only: test order*/ \
+                for (si=1; si<line->n_allele-1; si++) \
+                    assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \
+                allele_prob--; idx--; /* this was to keep REF first */ \
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \
+                ptr[0] = 0; \
+                for (k=1; k<=args->local_alleles && k<line->n_allele; k++) ptr[k] = idx[k]+1; \
+                int kmax = k; \
+                for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \
+                /* insertion sort by indexes to the output order, ascending */ \
+                for (k=1; k<kmax; k++) \
+                    for (l=k; l>0 && map[ptr[l]] < map[ptr[l-1]]; l--) \
+                        tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)
+        {
+            // int8 cannot exceed PL2PROB_MAX-1, but a negative value must not be used as an index; clamp it like the wider types do
+            case BCF_BT_INT8:  BRANCH( int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *src>=0 ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            default: error("Unexpected case: %d, PL\n", fmt_ori->type);
+        }
+        #undef BRANCH
+    }
+    ma->nlaa = nlaa;
+}
+
+// Finalizes FORMAT/LAA: translates the per-sample local allele indexes in ma->laa from
+// reader-based indexes to output allele indexes (via the reader's map[]), drops the
+// leading REF entry and compacts the per-sample blocks in place from
+// (1+args->local_alleles) values to ma->nlaa values. Samples with no local ALT allele,
+// or from a reader with irec<0 (presumably no record at this site), get a single
+// missing value. The unused tail of each block is padded with vector_end.
+//
+//  args   merge state; ma->laa and ma->nlaa must have been set by init_local_alleles
+//  out    output record which receives the LAA tag
+void update_local_alleles(args_t *args, bcf1_t *out)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr);
+    for (i=0; i<files->nreaders; i++)
+    {
+        int irec = ma->buf[i].cur;
+        bcf_sr_t *reader = &files->readers[i];
+        int nsmpl = bcf_hdr_nsamples(reader->header);
+        for (k=0; k<nsmpl; k++)
+        {
+            // dst never runs ahead of src because nlaa <= local_alleles, so the in-place
+            // compaction does not overwrite values that are still to be read
+            int32_t *src = ma->laa + ismpl*(1+args->local_alleles);
+            int32_t *dst = ma->laa + ismpl*ma->nlaa;
+            j = 0;
+            if ( irec>=0 )
+            {
+                for (; j<ma->nlaa; j++)
+                {
+                    // src[0] is REF, hence the +1 shift
+                    if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing;
+                    else if ( src[j+1]==bcf_int32_vector_end ) break;
+                    else
+                        dst[j] = ma->buf[i].rec[irec].map[src[j+1]];
+                }
+            }
+            if ( j==0 ) dst[j++] = bcf_int32_missing;
+            // pad the output block; writing to src here would leave stale values in dst
+            for (; j<ma->nlaa; j++) dst[j] = bcf_int32_vector_end;
+            ismpl++;
+        }
+    }
+    bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa);
+}
+
void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
{
bcf_srs_t *files = args->files;
int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
static int warned = 0;
- int nsize = 0, msize = sizeof(int32_t);
+ int nsize = 0;
for (i=0; i<files->nreaders; i++)
{
bcf_fmt_t *fmt = fmt_map[i];
}
if ( nsize==0 ) nsize = 1;
- if ( ma->ntmp_arr < nsamples*nsize*msize )
+ size_t msize = sizeof(int32_t)*nsize*nsamples;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nsize*msize;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+ ma->ntmp_arr = msize;
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew);
if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret);
}
+ if ( nmax < str->l ) nmax = str->l;
src += fmt_ori->size;
}
continue;
"If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key);
}
// update the record
- if ( ma->ntmp_arr < nsamples*nmax )
+ size_t msize = nsamples*nmax;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nmax;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+ ma->ntmp_arr = msize;
}
char *tgt = (char*) ma->tmp_arr;
for (i=0; i<nsamples; i++)
bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax);
}
+/*
+ Localized counterpart of a Number=G FORMAT tag: for each sample only the
+ values of genotypes formed from the alleles listed in ma->laa are kept.
+ The result is stored in `out` under the key 'L'+key, integer types are
+ widened to int32_t.
+ fmt_map .. the tag's bcf_fmt_t for each reader, NULL entries are filled with missing values
+ irdr .. reader whose header supplies the key and type; fmt_map[irdr] is dereferenced unconditionally
+ Note: only diploid Number=G tags are supported for now
+*/
+void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{
+ int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ bcf_fmt_t *fmt = fmt_map[irdr];
+ const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key;
+ size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2; // max number of Number=G localized fields
+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ msize *= nsamples*nsize; // bytes for all samples, using the wider of the two element types
+ if ( msize > 2147483647 ) // too big: warn once (static flag) and leave the tag out
+ {
+ static int warned = 0;
+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize ) // the scratch buffer is only ever grown
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
+ }
+ int ismpl = 0; // index of the current output sample, runs across all readers
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int nsmpl = bcf_hdr_nsamples(hdr);
+
+ if ( !fmt_ori )
+ {
+ // fill missing values
+ #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ tgt_set_missing; /* first value is missing, the rest vector_end */ \
+ for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+ ismpl++; \
+ } \
+ }
+ switch (fmt->type) // fmt_ori is NULL here, the type comes from reader irdr
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ default: error("Unexpected case: %d, %s\n", fmt->type, key);
+ }
+ #undef BRANCH
+ continue;
+ }
+ if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n"); // i.e. fmt_ori->n must equal n*(n+1)/2 for n=line->n_allele
+
+ // localize
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ int *laa = ma->laa + (1+args->local_alleles)*ismpl; /* this sample's local alleles, 1+local_alleles slots per sample */ \
+ int ii,ij,tgt_idx = 0; \
+ for (ii=0; ii<=ma->nlaa; ii++) \
+ { \
+ if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; /* end of this sample's allele list */ \
+ for (ij=0; ij<=ii; ij++) \
+ { \
+ int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); /* position of the genotype laa[ij]/laa[ii] in the source vector */ \
+ if ( src_is_missing ) tgt_set_missing; \
+ else if ( src_is_vector_end ) break; /* NOTE(review): leaves the inner ij loop only, the ii loop goes on */ \
+ else tgt[tgt_idx] = src[src_idx]; \
+ tgt_idx++; \
+ } \
+ } \
+ if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } /* nothing was copied: a single missing value */ \
+ for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; /* pad to the common row width */ \
+ ismpl++; \
+ } \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+ default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+ }
+ #undef BRANCH
+ }
+ args->tmps.l = 0; // build the output key: 'L' followed by the original key
+ kputc('L',&args->tmps);
+ kputs(key,&args->tmps);
+ if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+ bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+ ma->laa_dirty = 1; // a localized tag was written; the flag is tested before update_local_alleles() is called
+}
+void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{ /*
+ Localized counterpart of a per-allele FORMAT tag (the caller passes Number=A
+ or Number=R tags): for each sample only the values belonging to the
+ alleles listed in ma->laa are kept. The result is stored in `out` under
+ the key 'L'+key, integer types are widened to int32_t.
+ fmt_map .. the tag's bcf_fmt_t for each reader, NULL entries are filled with missing values
+ irdr .. reader whose header supplies the key, type and length; fmt_map[irdr] is dereferenced unconditionally
+ */
+ int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+ bcf_srs_t *files = args->files;
+ maux_t *ma = args->maux;
+ bcf_fmt_t *fmt = fmt_map[irdr];
+ const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key;
+ size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa; // values per sample: one more when IS_VL_R holds (presumably Number=R, which includes the reference allele)
+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+ msize *= nsamples*nsize; // bytes for all samples, using the wider of the two element types
+ if ( msize > 2147483647 ) // too big: warn once (static flag) and leave the tag out
+ {
+ static int warned = 0;
+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize ) // the scratch buffer is only ever grown
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
+ }
+ int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 0 : 1;; // ibeg: first laa slot to use; slot 0 is skipped unless IS_VL_R holds
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ bcf_fmt_t *fmt_ori = fmt_map[i];
+ int nsmpl = bcf_hdr_nsamples(hdr);
+
+ if ( !fmt_ori )
+ {
+ // fill missing values
+ #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ tgt_set_missing; /* first value is missing, the rest vector_end */ \
+ for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+ ismpl++; \
+ } \
+ }
+ switch (fmt->type) // fmt_ori is NULL here, the type comes from reader irdr
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+ default: error("Unexpected case: %d, %s\n", fmt->type, key);
+ }
+ #undef BRANCH
+ continue;
+ }
+
+ // localize
+ #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+ for (j=0; j<nsmpl; j++) \
+ { \
+ src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+ tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+ int *laa = ma->laa + (1+args->local_alleles)*ismpl; /* this sample's local alleles, 1+local_alleles slots per sample */ \
+ int ii,tgt_idx = 0; \
+ for (ii=ibeg; ii<=ma->nlaa; ii++) \
+ { \
+ if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; /* end of this sample's allele list */ \
+ int src_idx = laa[ii] - ibeg; /* allele index turned into an index to the source vector */ \
+ if ( src_is_missing ) tgt_set_missing; \
+ else if ( src_is_vector_end ) break; \
+ else tgt[tgt_idx] = src[src_idx]; \
+ tgt_idx++; \
+ } \
+ if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } /* nothing was copied: a single missing value */ \
+ for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; /* pad to the common row width */ \
+ ismpl++; \
+ } \
+ }
+ switch (fmt_ori->type)
+ {
+ case BCF_BT_INT8: BRANCH(int32_t, int8_t, src[src_idx]==bcf_int8_missing, src[src_idx]==bcf_int8_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+ case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+ default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+ }
+ #undef BRANCH
+ }
+ args->tmps.l = 0; // build the output key: 'L' followed by the original key
+ kputc('L',&args->tmps);
+ kputs(key,&args->tmps);
+ if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+ bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+ else
+ bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+ ma->laa_dirty = 1; // a localized tag was written; the flag is tested before update_local_alleles() is called
+}
void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
{
bcf_srs_t *files = args->files;
}
if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
}
+ if ( ma->nlaa && length!=BCF_VL_FIXED )
+ {
+ if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i);
+ else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i);
+ return;
+ }
+
if ( type==BCF_BT_CHAR )
{
merge_format_string(args, key, fmt_map, out, length, nsize);
}
size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
- if ( ma->ntmp_arr < nsamples*nsize*msize )
+ msize *= nsamples*nsize;
+ if ( msize > 2147483647 )
{
- ma->ntmp_arr = nsamples*nsize*msize;
- ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
- if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
- if ( ma->ntmp_arr > 2147483647 )
- {
- if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
- warned = 1;
- return;
- }
+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+ warned = 1;
+ return;
+ }
+ if ( ma->ntmp_arr < msize )
+ {
+ ma->tmp_arr = realloc(ma->tmp_arr, msize);
+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+ ma->ntmp_arr = msize;
}
// Fill the temp array for all samples by collecting values from all files
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+ int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
bcf1_t *line = maux_get_line(args,i);
memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
ma->nfmt_map = max_ifmt+1;
}
+ if ( key[0]=='P' && key[1]=='L' && key[2]==0 ) { has_PL = ifmt; }
}
kitr = kh_put(strdict, tmph, key, &ret);
kh_value(tmph, kitr) = ifmt;
ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
+ if ( args->local_alleles )
+ {
+ ma->laa_dirty = ma->nlaa = 0;
+ if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL);
+ }
+
out->n_sample = bcf_hdr_nsamples(out_hdr);
if ( has_GT )
merge_GT(args, ma->fmt_map, out);
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+
+ if ( ma->laa_dirty )
+ update_local_alleles(args, out);
+
out->d.indiv_dirty = 1;
}
}
}
+/*
+    Decide whether a record looks like a gVCF reference block.
+    Returns 1 if the record spans more than one base (rlen>1), the span is not
+    explained by the length of the REF allele alone, and the record either has
+    no ALT allele or lists one of the symbolic alleles <*>, <NON_REF>, <X>.
+    Returns 0 otherwise.
+*/
+static inline int is_gvcf_block(bcf1_t *line)
+{
+    // single-base records and records whose REF string covers the whole span are ordinary records
+    if ( line->rlen<=1 || strlen(line->d.allele[0])==line->rlen ) return 0;
+
+    // REF-only record with a longer span
+    if ( line->n_allele==1 ) return 1;
+
+    int ialt = 1;
+    while ( ialt < line->n_allele )
+    {
+        const char *alt = line->d.allele[ialt++];
+        if ( strcmp(alt,"<*>")==0 || strcmp(alt,"<NON_REF>")==0 || strcmp(alt,"<X>")==0 ) return 1;
+    }
+    return 0;
+}
+static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2; // masks for maux->var_types: variant types are stored shifted left by 2, the value 2 stands for records with no variant type and the value 1 is taken by VCF_GVCF_REF
+
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
maux->gvcf_min = INT_MAX;
for (i=0; i<files->nreaders; i++)
{
+ if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0;
if ( gaux[i].active )
{
// gvcf block should not overlap with another record
int irec = maux->buf[i].beg;
bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
bcf1_t *line = args->files->readers[i].buffer[irec];
- int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
if ( ret==1 )
{
if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END
fprintf(bcftools_stderr,"\n");
}
-
/*
Determine which line should be merged from which reader: go through all
readers and all buffered lines, expand REF,ALT and try to match lines with
int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
gvcf_aux_t *gaux = maux->gvcf;
char *id = NULL, ref = 'N';
}
maux->var_types = maux->nals = 0;
+ // this is only for the `-m none -g` mode, ensure that <*> lines come last
+ #define VCF_GVCF_REF 1
+
for (i=0; i<files->nreaders; i++)
{
buffer_t *buf = &maux->buf[i];
buf->rec[j].skip = SKIP_DIFF;
ntodo++;
+ bcf1_t *line = buf->lines[j];
if ( args->merge_by_id )
- id = buf->lines[j]->d.id;
+ id = line->d.id;
else
{
- int var_type = bcf_get_variant_types(buf->lines[j]);
- maux->var_types |= var_type ? var_type<<1 : 1;
+ int var_type = bcf_get_variant_types(line);
+ maux->var_types |= var_type ? var_type<<2 : 2;
+
+ // for the `-m none -g` mode
+ if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
+ maux->var_types |= VCF_GVCF_REF;
}
}
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
int line_type = bcf_get_variant_types(line);
- line_type = line_type ? line_type<<1 : 1;
+ line_type = line_type ? line_type<<2 : 2;
// select relevant lines
if ( args->merge_by_id )
}
else
{
+ // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
+ // records come last, otherwise infinite loop is created (#1164)
+ if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
+ {
+ if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
+ }
if ( args->collapse==COLLAPSE_NONE && maux->nals )
{
// All alleles of the tested record must be present in the
*/
void stage_line(args_t *args)
{
- int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
void merge_line(args_t *args)
{
- if ( args->regs )
- {
- if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
- }
-
bcf1_t *out = args->out_line;
merge_chrom2qual(args, out);
+ if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return;
merge_filter(args, out);
merge_info(args, out);
if ( args->do_gvcf )
error_errno("[%s] Failed to add program information to header", __func__);
}
+/*
+    Declare the localized FORMAT tags in the header.
+
+    For every FORMAT definition whose length is Number=G, Number=A or Number=R
+    a copy of the header line is created in which ID is prefixed with 'L',
+    Number is replaced by '.' and a quoted Description is prefixed with
+    "Localized field: ". The IDX key is not copied. If at least one such line
+    was created, the definition of FORMAT/LAA is added as well.
+
+    The new lines are collected first and appended only after the scan of
+    hdr->hrec has finished (presumably because bcf_hdr_append modifies the
+    array being iterated over).
+
+    args .. only args->tmps is used, as a scratch buffer
+    hdr  .. header to scan and to extend
+*/
+void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr)
+{
+    char **str = NULL;
+    int i,j, nstr = 0, mstr = 0;
+    for (i=0; i<hdr->nhrec; i++)
+    {
+        bcf_hrec_t *hrec = hdr->hrec[i];
+        if ( hrec->type!=BCF_HL_FMT ) continue;
+        j = bcf_hrec_find_key(hrec,"ID");
+        if ( j<0 ) continue;
+        char *key = hrec->vals[j];
+        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+        assert( id>=0 );
+
+        // only tags whose length is tied to the number of alleles are localized
+        int vlen = bcf_hdr_id2length(hdr,BCF_HL_FMT,id);
+        if ( vlen!=BCF_VL_G && vlen!=BCF_VL_A && vlen!=BCF_VL_R ) continue;
+
+        // format the modified copy of the header line, e accumulates write failures
+        args->tmps.l = 0;
+        uint32_t e = 0, nout = 0;
+        e |= ksprintf(&args->tmps, "##%s=<", hrec->key) < 0;
+        for (j=0; j<hrec->nkeys; j++)
+        {
+            const char *name = hrec->keys[j];
+            const char *val  = hrec->vals[j];
+            if ( !strcmp("IDX",name) ) continue;
+            if ( nout ) e |= kputc(',',&args->tmps) < 0;
+            if ( !strcmp("ID",name) )
+                e |= ksprintf(&args->tmps,"%s=L%s", name, val) < 0;
+            else if ( !strcmp("Number",name) )
+                e |= ksprintf(&args->tmps,"Number=.") < 0;
+            else if ( !strcmp("Description",name) && val[0]=='"' )
+                e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", val+1) < 0;   // val+1 skips the opening quote
+            else
+                e |= ksprintf(&args->tmps,"%s=%s", name, val) < 0;
+            nout++;
+        }
+        e |= ksprintf(&args->tmps,">\n") < 0;
+        if ( e ) error("Failed to format the header line for %s\n", key);
+
+        // tmps is reused in the next iteration, keep a private copy of the line
+        char *copy = strdup(args->tmps.s);
+        if ( !copy ) error("Failed to allocate memory for the header line for %s\n", key);
+        nstr++;
+        hts_expand(char*,nstr,mstr,str);
+        str[nstr-1] = copy;
+    }
+    if ( !nstr ) return;
+    bcf_hdr_append(hdr,"##FORMAT=<ID=LAA,Number=.,Type=Integer,Description=\"Localized alleles: subset of alternate alleles relevant for each sample\">");
+    for (i=0; i<nstr; i++)
+    {
+        bcf_hdr_append(hdr, str[i]);
+        free(str[i]);
+    }
+    free(str);
+}
void merge_vcf(args_t *args)
{
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
char buf[24]; snprintf(buf,sizeof buf,"%d",i+1);
merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
}
+ if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr);
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
if (bcf_hdr_sync(args->out_hdr) < 0)
error_errno("[%s] Failed to update header", __func__);
fprintf(bcftools_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(bcftools_stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(bcftools_stderr, " -l, --file-list <file> read file names from the file\n");
+ fprintf(bcftools_stderr, " -L, --local-alleles <int> EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
fprintf(bcftools_stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(bcftools_stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfmerge(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"local-alleles",required_argument,NULL,'L'},
{"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
{"missing-to-ref",no_argument,NULL,'0'},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"no-index",no_argument,NULL,10},
{"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
+ char *tmp;
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'L':
+ args->local_alleles = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
+ if ( args->local_alleles < 1 )
+ error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg);
+ break;
case 'F':
if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
case 3 : args->force_samples = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 10 : args->no_index = 1; break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( argc==optind && !args->file_list ) usage();
if ( argc-optind<2 && !args->file_list ) usage();
- args->files->require_index = 1;
+ if ( args->no_index )
+ {
+ if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
+ bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX);
+ }
+ else
+ bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <strings.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "rbuf.h"
+#include "abuf.h"
#define CHECK_REF_EXIT 1
#define CHECK_REF_WARN 2
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, tmp_als_str;
+ kstring_t *tmp_als, tmp_kstr;
int ntmp_als;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
int aln_win; // the realignment window size (maximum repeat size)
bcf_srs_t *files; // using the synced reader only for -r option
- bcf_hdr_t *hdr;
+ bcf_hdr_t *hdr, *out_hdr;
cmpals_t cmpals_in, cmpals_out;
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
- int record_cmd_line, force, force_warned;
+ int record_cmd_line, force, force_warned, keep_sum_ad;
+ abuf_t *abuf;
+ abuf_opt_t atomize;
+ int use_star_allele;
+ char *old_rec_tag;
+ htsFile *out;
}
args_t;
static void fix_ref(args_t *args, bcf1_t *line)
{
int reflen = strlen(line->d.allele[0]);
- int i, maxlen = reflen, len;
+ int i,j, maxlen = reflen, len;
for (i=1; i<line->n_allele; i++)
{
int len = strlen(line->d.allele[i]);
args->nref.tot++;
- // is the REF different?
+ // is the REF different? If not, we are done
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
- // is the REF allele missing or N?
- if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') )
+ // is the REF allele missing?
+ if ( reflen==1 && line->d.allele[0][0]=='.' )
{
line->d.allele[0][0] = ref[0];
args->nref.set++;
free(ref);
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
return;
}
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+ // does REF or ALT contain non-standard bases?
+ int has_non_acgtn = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][0]=='<' ) continue;
+ has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i]));
+ }
+ if ( has_non_acgtn )
{
args->nref.set++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
}
+ // does the REF allele contain N's ?
+ int fix = 0;
+ for (i=0; i<reflen; i++)
+ {
+ if ( line->d.allele[0][i]!='N' ) continue;
+ if ( ref[i]=='N' ) continue;
+ line->d.allele[0][i] = ref[i];
+ fix++;
+ for (j=1; j<line->n_allele; j++)
+ {
+ int len = strlen(line->d.allele[j]);
+ if ( len <= i || line->d.allele[j][i]!='N' ) continue;
+ line->d.allele[j][i] = ref[i];
+ fix++;
+ }
+ }
+ if ( fix )
+ {
+ args->nref.set++;
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ }
+
+
// is it swapped?
for (i=1; i<line->n_allele; i++)
{
}
kstring_t str = {0,0,0};
- if ( i==line->n_allele )
+ if ( i==line->n_allele ) // none of the alternate alleles matches the reference
{
- // none of the alternate alleles matches the reference
- if ( line->n_allele>1 )
- args->nref.set++;
- else
- args->nref.swap++;
-
- kputs(line->d.allele[0],&str);
- kputc(',',&str);
+ args->nref.set++;
+ kputsn(ref,reflen,&str);
for (i=1; i<line->n_allele; i++)
{
- kputs(line->d.allele[i],&str);
kputc(',',&str);
+ kputs(line->d.allele[i],&str);
}
- kputc(ref[0],&str);
- bcf_update_alleles_str(args->hdr,line,str.s);
- str.l = 0;
+ bcf_update_alleles_str(args->out_hdr,line,str.s);
+ free(ref);
+ free(str.s);
+ return;
}
- else
- args->nref.swap++;
- free(ref);
- // swap the alleles
- int j;
+ // one of the alternate alleles matches the reference, assume it's a simple swap
kputs(line->d.allele[i],&str);
- for (j=1; j<i; j++)
- {
- kputc(',',&str);
- kputs(line->d.allele[j],&str);
- }
- kputc(',',&str);
- kputs(line->d.allele[0],&str);
- for (j=i+1; j<line->n_allele; j++)
+ for (j=1; j<line->n_allele; j++)
{
kputc(',',&str);
- kputs(line->d.allele[j],&str);
+ if ( j==i )
+ kputs(line->d.allele[0],&str);
+ else
+ kputs(line->d.allele[j],&str);
}
- bcf_update_alleles_str(args->hdr,line,str.s);
+ bcf_update_alleles_str(args->out_hdr,line,str.s);
+ args->nref.swap++;
+ free(ref);
+ free(str.s);
// swap genotypes
int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
}
- bcf_update_genotypes(args->hdr,line,gts,ngts);
+ bcf_update_genotypes(args->out_hdr,line,gts,ngts);
// update AC
int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
{
int32_t *ac = (int32_t*)args->tmp_arr1;
ac[i-1] = ni;
- bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+ bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
}
-
- free(str.s);
}
static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( !args->tmp_arr1[i] ) continue;
line->d.allele[j++] = line->d.allele[i];
}
- bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+ bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals);
// update genotypes
gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
changed = 1;
}
- if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+ if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
+}
+
+/*
+    Record the original form of a site in the INFO tag named by args->old_rec_tag.
+    The value has the form CHROM|POS|REF|ALT1,ALT2,..[|ialt], with the trailing
+    field added only when ialt>0. Does nothing when no tag name was configured
+    or when `dst` already carries the tag.
+
+    dst  .. record to annotate
+    src  .. record from which the original coordinates and alleles are taken
+    ialt .. appended as the last field when positive
+*/
+static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
+{
+    const char *tag = args->old_rec_tag;
+    if ( !tag ) return;
+
+    // Keep an existing annotation: the record may pass through several normalization steps
+    int tag_id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, tag);
+    bcf_unpack(dst, BCF_UN_INFO);
+    int k;
+    for (k=0; k<dst->n_info; k++)
+        if ( dst->d.info[k].key == tag_id ) return;
+
+    // CHROM|POS|REF|
+    kstring_t *str = &args->tmp_kstr;
+    str->l = 0;
+    ksprintf(str,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+
+    // comma-separated ALT alleles
+    for (k=1; k<src->n_allele; k++)
+    {
+        if ( k>1 ) kputc(',',str);
+        kputs(src->d.allele[k],str);
+    }
+
+    // optional trailing field
+    if ( ialt>0 )
+    {
+        kputc('|',str);
+        kputw(ialt,str);
+    }
+
+    int ret = bcf_update_info_string(args->out_hdr, dst, tag, str->s);
+    if ( ret!=0 )
+        error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
 }
#define ERR_DUP_ALLELE -2
if ( line->rlen > 1 )
{
line->d.allele[0][1] = 0;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
}
return ERR_OK;
}
}
// trim from right
- int ori_pos = line->pos;
+ int new_pos = line->pos;
while (1)
{
// is the rightmost base identical in all alleles?
if ( als[i].l < min_len ) min_len = als[i].l;
}
if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
- if ( min_len<=1 && line->pos==0 ) break;
+ if ( min_len<=1 && new_pos==0 ) break;
int pad_from_left = 0;
for (i=0; i<line->n_allele; i++) // trim all alleles
}
if ( pad_from_left )
{
- int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+ int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
free(ref);
- ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
- if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1);
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
replace_iupac_codes(ref,nref);
for (i=0; i<line->n_allele; i++)
{
memcpy(als[i].s,ref,npad);
als[i].l += npad;
}
- line->pos -= npad;
+ new_pos -= npad;
}
}
free(ref);
memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
als[i].l -= ntrim_left;
}
- line->pos += ntrim_left;
+ new_pos += ntrim_left;
}
// Have the alleles changed?
als[0].s[ als[0].l ] = 0; // in order for strcmp to work
- if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+ if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+ set_old_rec_tag(args, line, line, 0);
// Create new block of alleles and update
- args->tmp_als_str.l = 0;
+ args->tmp_kstr.l = 0;
for (i=0; i<line->n_allele; i++)
{
- if (i>0) kputc(',',&args->tmp_als_str);
- kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+ if (i>0) kputc(',',&args->tmp_kstr);
+ kputsn(als[i].s,als[i].l,&args->tmp_kstr);
}
- args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
- bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+ args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
+ bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
args->nchanged++;
// Update INFO/END if necessary
int new_reflen = strlen(line->d.allele[0]);
- if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
+ if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
{
// bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
+ line->pos = new_pos;
args->int32_arr[0] = line->pos + new_reflen;
- bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1);
+ bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1);
}
+ line->pos = new_pos;
return ERR_OK;
}
static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
} \
- bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
} \
- if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
- bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+ if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
+ { \
+ int j; \
+ for (j=1; j<info->len; j++) \
+ if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \
+ vals[1] = vals[ialt+1]; \
+ } \
+ else \
+ { \
+ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+ } \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
} \
- bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \
} \
else \
- bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break;
}
#undef BRANCH_NUMERIC
}
STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_R )
{
STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_G )
{
STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
 static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
 {
 const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
 int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+ bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); // dst is updated via the output header; ret from the lookup on src is passed as the count (presumably 1 sets and 0 clears the flag -- confirm against htslib)
 }
static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
}
gt += ngts;
}
- bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+ bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
}
static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
assert( nvals>0 ); \
type_t *vals = (type_t *) args->tmp_arr1; \
int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
- int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \
if ( nvals==nsmpl ) /* all values are missing */ \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
return; \
} \
if ( len==BCF_VL_A ) \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
dst_vals += 1; \
src_vals += nvals; \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
} \
else if ( len==BCF_VL_R ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
} \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
- for (i=0; i<nsmpl; i++) \
+ if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \
{ \
- dst_vals[0] = src_vals[0]; \
- dst_vals[1] = src_vals[ialt+1]; \
- dst_vals += 2; \
- src_vals += nvals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ for (j=1; j<nvals; j++) \
+ if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
+ } \
+ else \
+ { \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
dst_vals += all_haploid ? 2 : 3; \
src_vals += nvals; \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
} \
else \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
}
#undef BRANCH_NUMERIC
}
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else if ( len==BCF_VL_R )
{
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else if ( len==BCF_VL_G )
{
}
if ( args->force )
{
- bcf_update_format_char(args->hdr,dst,tag,NULL,0);
+ bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
return;
}
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else
- bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
}
-
static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
{
int i;
// Not quite sure how to handle IDs, they can be assigned to a specific
// ALT. For now we leave the ID unchanged for all.
- bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+ bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : ".");
tmp.l = rlen;
kputs(line->d.allele[i+1],&tmp);
- bcf_update_alleles_str(args->hdr,dst,tmp.s);
+ bcf_update_alleles_str(args->out_hdr,dst,tmp.s);
if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
else split_info_string(args, line, info, i, dst);
}
+ set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
dst->n_sample = line->n_sample;
for (j=0; j<line->n_fmt; j++)
vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else if ( len==BCF_VL_R ) \
{ \
vals[ args->maps[i].map[k] ] = vals2[k]; \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else \
- bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
{
{
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+ bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
str.s[str.l] = 0;
args->tmp_arr1 = (uint8_t*) str.s;
args->ntmp_arr1 = str.m;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_G )
{
str.s[str.l] = 0;
args->tmp_arr1 = (uint8_t*) str.s;
args->ntmp_arr1 = str.m;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else
{
bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+ bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
}
}
static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
gt2 += ngts;
}
}
- bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+ bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
}
static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
{
vals2 += nvals2; \
} \
} \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else if ( len==BCF_VL_R ) \
{ \
vals2 += nvals2; \
} \
} \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else if ( len==BCF_VL_G ) \
{ \
vals2 += nvals;\
}\
}\
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
{
int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+ bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret);
return;
}
for (i=0; i<nlines; i++)
{
int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
- if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
nret /= nsmpl;
for (k=0; k<nsmpl; k++)
{
if ( i ) // we already have a copy
{
nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
- if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
nret /= nsmpl;
}
for (k=0; k<nsmpl; k++)
}
args->ntmp_arr2 = str.m;
args->tmp_arr2 = (uint8_t*)str.s;
- bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
}
char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c
dst->qual = lines[i]->qual;
}
- bcf_update_id(args->hdr, dst, lines[0]->d.id);
+ bcf_update_id(args->out_hdr, dst, lines[0]->d.id);
// Merge and set the alleles, create a mapping from source allele indexes to dst idxs
hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line
}
for (i=1; i<nlines; i++)
{
- if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+ if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id);
args->maps[i].nals = lines[i]->n_allele;
hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1);
}
- bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+ bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals);
for (i=0; i<args->nals; i++)
{
free(args->als[i]);
args->als[i] = NULL;
}
- if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+ if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
for (i=1; i<nlines; i++) {
int j;
for (j=0; j<lines[i]->d.n_flt; j++) {
// otherwise accumulate FILTERs
if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
if (args->strict_filter) {
- bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+ bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
break;
}
else
continue;
}
- bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+ bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]);
}
}
if ( mrows_ready_to_flush(args, args->lines[k]) )
{
while ( (line=mrows_flush(args)) )
- if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
int merge = 1;
if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
prev_type |= line_type;
if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
}
- if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
{
while ( (line=mrows_flush(args)) )
- if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
}
static void init_data(args_t *args)
{
args->hdr = args->files->readers[0].header;
+ if ( args->keep_sum_ad )
+ {
+ args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD");
+ if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n");
+ }
+ else
+ args->keep_sum_ad = -1;
+
+ args->out_hdr = bcf_hdr_dup(args->hdr);
+ if ( args->old_rec_tag )
+ bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag);
+
rbuf_init(&args->rbuf, 100);
args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
if ( args->ref_fname )
args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
}
+ if ( args->atomize==SPLIT )
+ {
+ args->abuf = abuf_init(args->hdr, SPLIT);
+ abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
+ if ( args->old_rec_tag )
+ abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+ abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
+ }
}
static void destroy_data(args_t *args)
for (i=0; i<args->ntmp_als; i++)
free(args->tmp_als[i].s);
free(args->tmp_als);
- free(args->tmp_als_str.s);
+ free(args->tmp_kstr.s);
if ( args->tmp_str )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
free(args->tmp_arr1);
free(args->tmp_arr2);
free(args->diploid);
+ if ( args->abuf ) abuf_destroy(args->abuf);
+ bcf_hdr_destroy(args->out_hdr);
if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
if ( args->fai ) fai_destroy(args->fai);
if ( args->mseq ) free(args->seq);
}
-static void normalize_line(args_t *args, bcf1_t **line_ptr)
+static void normalize_line(args_t *args, bcf1_t *line)
{
- bcf1_t *line = *line_ptr;
if ( args->fai )
{
if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
int i,j;
i = j = rbuf_append(&args->rbuf);
- if ( !args->lines[i] ) args->lines[i] = bcf_init1();
- SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+ if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+ args->lines[i] = bcf_dup(line);
while ( rbuf_prev(&args->rbuf,&i) )
{
if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
}
}
+static bcf1_t *next_atomized_line(args_t *args) // fetch the next record to process, or NULL when none is available
+{
+ bcf1_t *rec = NULL;
+ if ( args->atomize==SPLIT )
+ {
+ rec = abuf_flush(args->abuf, 0); // first hand out any record the atomizer buffer still holds
+ if ( rec ) return rec;
+ }
+
+ if ( !bcf_sr_next_line(args->files) ) return NULL; // the synced reader has no further line
+
+ if ( args->atomize==SPLIT )
+ {
+ abuf_push(args->abuf,bcf_sr_get_line(args->files,0)); // pass the freshly read line of reader 0 to the atomizer
+ return abuf_flush(args->abuf, 0); // and return whatever it yields first
+ }
+ return bcf_sr_get_line(args->files,0); // atomization is off: return the reader's line unchanged
+}
static void normalize_vcf(args_t *args)
{
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
- if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads )
- hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
- if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
- if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
+ if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ bcf1_t *line;
int prev_rid = -1, prev_pos = -1, prev_type = 0;
- while ( bcf_sr_next_line(args->files) )
+ while ( (line = next_atomized_line(args)) )
{
args->ntotal++;
-
- bcf1_t *line = args->files->readers[0].buffer[0];
if ( args->rmdup )
{
int line_type = bcf_get_variant_types(line);
// still on the same chromosome?
int i,j,ilast = rbuf_last(&args->rbuf);
- if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+ if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
int split = 0;
if ( args->mrows_op==MROWS_SPLIT )
args->nsplit++;
split_multiallelic_to_biallelics(args, line);
for (j=0; j<args->ntmp_lines; j++)
- normalize_line(args, &args->tmp_lines[j]);
+ normalize_line(args, args->tmp_lines[j]);
}
else
split = 0;
}
if ( !split )
- normalize_line(args, &args->files->readers[0].buffer[0]);
+ normalize_line(args, line);
// find out how many sites to flush
ilast = rbuf_last(&args->rbuf);
if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
j++;
}
- if ( j>0 ) flush_buffer(args, out, j);
+ if ( j>0 ) flush_buffer(args, args->out, j);
}
- flush_buffer(args, out, args->rbuf.n);
- if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ flush_buffer(args, args->out, args->rbuf.n);
+ if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
if ( args->check_ref & CHECK_REF_FIX )
fprintf(stderr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
- fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
- fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|all|exact\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
- fprintf(stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
- fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(stderr, " -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs)\n");
+ fprintf(stderr, " --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n");
+ fprintf(stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+ fprintf(stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n");
+ fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n");
+ fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n");
+ fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+ fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
+ fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
+ fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # normalize and left-align indels\n");
args->do_indels = 1;
int region_is_file = 0;
int targets_is_file = 0;
+ args->use_star_allele = 1;
static struct option loptions[] =
{
{"help",no_argument,NULL,'h'},
{"force",no_argument,NULL,7},
+ {"atomize",no_argument,NULL,'a'},
+ {"atom-overlaps",required_argument,NULL,11},
+ {"old-rec-tag",required_argument,NULL,12},
+ {"keep-sum",required_argument,NULL,10},
{"fasta-ref",required_argument,NULL,'f'},
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
switch (c) {
+ case 10:
+ // possibly generalize this also to INFO/AD and other tags
+ if ( strcasecmp("ad",optarg) )
+ error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
+ args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data
+ break;
+ case 'a': args->atomize = SPLIT; break;
+ case 11 :
+ if ( optarg[0]=='*' ) args->use_star_allele = 1;
+ else if ( optarg[0]=='.' ) args->use_star_allele = 0;
+ else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
+ break;
+ case 12 : args->old_rec_tag = optarg; break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
}
else fname = argv[optind];
- if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n");
+ if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n");
if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT;
if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n");
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2019 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <strings.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "rbuf.h"
+#include "abuf.h"
#define CHECK_REF_EXIT 1
#define CHECK_REF_WARN 2
int32_t *int32_arr;
int ntmp_arr1, ntmp_arr2, nint32_arr;
kstring_t *tmp_str;
- kstring_t *tmp_als, tmp_als_str;
+ kstring_t *tmp_als, tmp_kstr;
int ntmp_als;
rbuf_t rbuf;
int buf_win; // maximum distance between two records to consider
int aln_win; // the realignment window size (maximum repeat size)
bcf_srs_t *files; // using the synced reader only for -r option
- bcf_hdr_t *hdr;
+ bcf_hdr_t *hdr, *out_hdr;
cmpals_t cmpals_in, cmpals_out;
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
- int record_cmd_line, force, force_warned;
+ int record_cmd_line, force, force_warned, keep_sum_ad;
+ abuf_t *abuf;
+ abuf_opt_t atomize;
+ int use_star_allele;
+ char *old_rec_tag;
+ htsFile *out;
}
args_t;
static void fix_ref(args_t *args, bcf1_t *line)
{
int reflen = strlen(line->d.allele[0]);
- int i, maxlen = reflen, len;
+ int i,j, maxlen = reflen, len;
for (i=1; i<line->n_allele; i++)
{
int len = strlen(line->d.allele[i]);
args->nref.tot++;
- // is the REF different?
+ // is the REF different? If not, we are done
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
- // is the REF allele missing or N?
- if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') )
+ // is the REF allele missing?
+ if ( reflen==1 && line->d.allele[0][0]=='.' )
{
line->d.allele[0][0] = ref[0];
args->nref.set++;
free(ref);
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
return;
}
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+ // does REF or ALT contain non-standard bases?
+ int has_non_acgtn = 0;
+ for (i=0; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][0]=='<' ) continue;
+ has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i]));
+ }
+ if ( has_non_acgtn )
{
args->nref.set++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
}
+ // does the REF allele contain N's ?
+ int fix = 0;
+ for (i=0; i<reflen; i++)
+ {
+ if ( line->d.allele[0][i]!='N' ) continue;
+ if ( ref[i]=='N' ) continue;
+ line->d.allele[0][i] = ref[i];
+ fix++;
+ for (j=1; j<line->n_allele; j++)
+ {
+ int len = strlen(line->d.allele[j]);
+ if ( len <= i || line->d.allele[j][i]!='N' ) continue;
+ line->d.allele[j][i] = ref[i];
+ fix++;
+ }
+ }
+ if ( fix )
+ {
+ args->nref.set++;
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+ }
+
+
// is it swapped?
for (i=1; i<line->n_allele; i++)
{
}
kstring_t str = {0,0,0};
- if ( i==line->n_allele )
+ if ( i==line->n_allele ) // none of the alternate alleles matches the reference
{
- // none of the alternate alleles matches the reference
- if ( line->n_allele>1 )
- args->nref.set++;
- else
- args->nref.swap++;
-
- kputs(line->d.allele[0],&str);
- kputc(',',&str);
+ args->nref.set++;
+ kputsn(ref,reflen,&str);
for (i=1; i<line->n_allele; i++)
{
- kputs(line->d.allele[i],&str);
kputc(',',&str);
+ kputs(line->d.allele[i],&str);
}
- kputc(ref[0],&str);
- bcf_update_alleles_str(args->hdr,line,str.s);
- str.l = 0;
+ bcf_update_alleles_str(args->out_hdr,line,str.s);
+ free(ref);
+ free(str.s);
+ return;
}
- else
- args->nref.swap++;
- free(ref);
- // swap the alleles
- int j;
+ // one of the alternate alleles matches the reference, assume it's a simple swap
kputs(line->d.allele[i],&str);
- for (j=1; j<i; j++)
- {
- kputc(',',&str);
- kputs(line->d.allele[j],&str);
- }
- kputc(',',&str);
- kputs(line->d.allele[0],&str);
- for (j=i+1; j<line->n_allele; j++)
+ for (j=1; j<line->n_allele; j++)
{
kputc(',',&str);
- kputs(line->d.allele[j],&str);
+ if ( j==i )
+ kputs(line->d.allele[0],&str);
+ else
+ kputs(line->d.allele[j],&str);
}
- bcf_update_alleles_str(args->hdr,line,str.s);
+ bcf_update_alleles_str(args->out_hdr,line,str.s);
+ args->nref.swap++;
+ free(ref);
+ free(str.s);
// swap genotypes
int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
}
- bcf_update_genotypes(args->hdr,line,gts,ngts);
+ bcf_update_genotypes(args->out_hdr,line,gts,ngts);
// update AC
int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
{
int32_t *ac = (int32_t*)args->tmp_arr1;
ac[i-1] = ni;
- bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+ bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
}
-
- free(str.s);
}
static void fix_dup_alt(args_t *args, bcf1_t *line)
if ( !args->tmp_arr1[i] ) continue;
line->d.allele[j++] = line->d.allele[i];
}
- bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+ bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals);
// update genotypes
gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
changed = 1;
}
- if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+ if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
+}
+
+// Annotate dst with INFO/<old_rec_tag> describing the record src in the form
+// "CHR|POS|REF|ALT[,ALT...]", with "|ialt" appended when ialt is positive.
+// Does nothing when no tag name was requested or when dst already has the tag.
+static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
+{
+    const char *tag = args->old_rec_tag;
+    if ( !tag ) return;
+
+    // There can be multiple normalization steps; a value set by an earlier one is kept
+    int tag_id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, tag);
+    bcf_unpack(dst, BCF_UN_INFO);
+    int k = 0;
+    while ( k < dst->n_info )
+    {
+        if ( dst->d.info[k].key == tag_id ) return;
+        k++;
+    }
+
+    // CHR|POS|REF| followed by the comma-separated ALT alleles
+    kstring_t *out = &args->tmp_kstr;
+    out->l = 0;
+    ksprintf(out,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+    for (k=1; k<src->n_allele; k++)
+    {
+        if ( k>1 ) kputc(',',out);
+        kputs(src->d.allele[k],out);
+    }
+
+    // index of the ALT allele, appended only when the caller passed a positive one
+    if ( ialt>0 )
+    {
+        kputc('|',out);
+        kputw(ialt,out);
+    }
+
+    int ret = bcf_update_info_string(args->out_hdr, dst, tag, out->s);
+    if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
}
#define ERR_DUP_ALLELE -2
if ( line->rlen > 1 )
{
line->d.allele[0][1] = 0;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
}
return ERR_OK;
}
}
// trim from right
- int ori_pos = line->pos;
+ int new_pos = line->pos;
while (1)
{
// is the rightmost base identical in all alleles?
if ( als[i].l < min_len ) min_len = als[i].l;
}
if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
- if ( min_len<=1 && line->pos==0 ) break;
+ if ( min_len<=1 && new_pos==0 ) break;
int pad_from_left = 0;
for (i=0; i<line->n_allele; i++) // trim all alleles
}
if ( pad_from_left )
{
- int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+ int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
free(ref);
- ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
- if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1);
+ ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
replace_iupac_codes(ref,nref);
for (i=0; i<line->n_allele; i++)
{
memcpy(als[i].s,ref,npad);
als[i].l += npad;
}
- line->pos -= npad;
+ new_pos -= npad;
}
}
free(ref);
memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
als[i].l -= ntrim_left;
}
- line->pos += ntrim_left;
+ new_pos += ntrim_left;
}
// Have the alleles changed?
als[0].s[ als[0].l ] = 0; // in order for strcmp to work
- if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+ if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+ set_old_rec_tag(args, line, line, 0);
// Create new block of alleles and update
- args->tmp_als_str.l = 0;
+ args->tmp_kstr.l = 0;
for (i=0; i<line->n_allele; i++)
{
- if (i>0) kputc(',',&args->tmp_als_str);
- kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+ if (i>0) kputc(',',&args->tmp_kstr);
+ kputsn(als[i].s,als[i].l,&args->tmp_kstr);
}
- args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
- bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+ args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
+ bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
args->nchanged++;
// Update INFO/END if necessary
int new_reflen = strlen(line->d.allele[0]);
- if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
+ if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
{
// bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
+ line->pos = new_pos;
args->int32_arr[0] = line->pos + new_reflen;
- bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1);
+ bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1);
}
+ line->pos = new_pos;
return ERR_OK;
}
static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
} \
- bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
} \
- if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
- bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+ if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
+ { \
+ int j; \
+ for (j=1; j<info->len; j++) \
+ if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \
+ vals[1] = vals[ialt+1]; \
+ } \
+ else \
+ { \
+ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+ } \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
} \
- bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \
} \
else \
- bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break;
}
#undef BRANCH_NUMERIC
}
STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_R )
{
STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_G )
{
STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
if ( len<0 ) return; // wrong number of fields: skip
str.s[len] = 0;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
// Carry an INFO flag over from src to dst. The tag name is looked up from
// info->key; the flag is read from src with the input header (args->hdr) and
// the return value of bcf_get_info_flag() is forwarded as the count argument
// of bcf_update_info_flag() on dst. `ialt` is not used in this function.
static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
{
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+ bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
}
static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
}
gt += ngts;
}
- bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+ bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
}
static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
assert( nvals>0 ); \
type_t *vals = (type_t *) args->tmp_arr1; \
int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
- int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+ int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \
if ( nvals==nsmpl ) /* all values are missing */ \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
return; \
} \
if ( len==BCF_VL_A ) \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
dst_vals += 1; \
src_vals += nvals; \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
} \
else if ( len==BCF_VL_R ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
} \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
- for (i=0; i<nsmpl; i++) \
+ if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \
{ \
- dst_vals[0] = src_vals[0]; \
- dst_vals[1] = src_vals[ialt+1]; \
- dst_vals += 2; \
- src_vals += nvals; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ for (j=1; j<nvals; j++) \
+ if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
+ } \
+ else \
+ { \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ dst_vals[0] = src_vals[0]; \
+ dst_vals[1] = src_vals[ialt+1]; \
+ dst_vals += 2; \
+ src_vals += nvals; \
+ } \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
if ( args->force ) \
{ \
- bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
return; \
} \
error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
dst_vals += all_haploid ? 2 : 3; \
src_vals += nvals; \
} \
- bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
} \
else \
- bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
}
#undef BRANCH_NUMERIC
}
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else if ( len==BCF_VL_R )
{
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else if ( len==BCF_VL_G )
{
}
if ( args->force )
{
- bcf_update_format_char(args->hdr,dst,tag,NULL,0);
+ bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
return;
}
error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
ptr += blen;
}
if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
- bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
}
else
- bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
}
-
static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
{
int i;
// Not quite sure how to handle IDs, they can be assigned to a specific
// ALT. For now we leave the ID unchanged for all.
- bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+ bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : ".");
tmp.l = rlen;
kputs(line->d.allele[i+1],&tmp);
- bcf_update_alleles_str(args->hdr,dst,tmp.s);
+ bcf_update_alleles_str(args->out_hdr,dst,tmp.s);
if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
else split_info_string(args, line, info, i, dst);
}
+ set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
dst->n_sample = line->n_sample;
for (j=0; j<line->n_fmt; j++)
vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else if ( len==BCF_VL_R ) \
{ \
vals[ args->maps[i].map[k] ] = vals2[k]; \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else if ( len==BCF_VL_G ) \
{ \
} \
} \
} \
- bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
} \
else \
- bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+ bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
{
{
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+ bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
str.s[str.l] = 0;
args->tmp_arr1 = (uint8_t*) str.s;
args->ntmp_arr1 = str.m;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else if ( len==BCF_VL_G )
{
str.s[str.l] = 0;
args->tmp_arr1 = (uint8_t*) str.s;
args->ntmp_arr1 = str.m;
- bcf_update_info_string(args->hdr,dst,tag,str.s);
+ bcf_update_info_string(args->out_hdr,dst,tag,str.s);
}
else
{
bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+ bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
}
}
static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
gt2 += ngts;
}
}
- bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+ bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
}
static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
{
vals2 += nvals2; \
} \
} \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else if ( len==BCF_VL_R ) \
{ \
vals2 += nvals2; \
} \
} \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else if ( len==BCF_VL_G ) \
{ \
vals2 += nvals;\
}\
}\
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
} \
else \
- bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+ bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
{
int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
- bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+ bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret);
return;
}
for (i=0; i<nlines; i++)
{
int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
- if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
nret /= nsmpl;
for (k=0; k<nsmpl; k++)
{
if ( i ) // we already have a copy
{
nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
- if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+ if (nret<0) continue; /* format tag does not exist in this record, skip */
nret /= nsmpl;
}
for (k=0; k<nsmpl; k++)
}
args->ntmp_arr2 = str.m;
args->tmp_arr2 = (uint8_t*)str.s;
- bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+ bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
}
char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c
dst->qual = lines[i]->qual;
}
- bcf_update_id(args->hdr, dst, lines[0]->d.id);
+ bcf_update_id(args->out_hdr, dst, lines[0]->d.id);
// Merge and set the alleles, create a mapping from source allele indexes to dst idxs
hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line
}
for (i=1; i<nlines; i++)
{
- if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+ if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id);
args->maps[i].nals = lines[i]->n_allele;
hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1);
}
- bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+ bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals);
for (i=0; i<args->nals; i++)
{
free(args->als[i]);
args->als[i] = NULL;
}
- if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+ if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
for (i=1; i<nlines; i++) {
int j;
for (j=0; j<lines[i]->d.n_flt; j++) {
// otherwise accumulate FILTERs
if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
if (args->strict_filter) {
- bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+ bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
break;
}
else
continue;
}
- bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+ bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]);
}
}
if ( mrows_ready_to_flush(args, args->lines[k]) )
{
while ( (line=mrows_flush(args)) )
- if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
int merge = 1;
if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
prev_type |= line_type;
if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
}
- if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
{
while ( (line=mrows_flush(args)) )
- if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
}
}
static void init_data(args_t *args)
{
args->hdr = args->files->readers[0].header;
+ if ( args->keep_sum_ad )
+ {
+ args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD");
+ if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n");
+ }
+ else
+ args->keep_sum_ad = -1;
+
+ args->out_hdr = bcf_hdr_dup(args->hdr);
+ if ( args->old_rec_tag )
+ bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag);
+
rbuf_init(&args->rbuf, 100);
args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
if ( args->ref_fname )
args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
}
+ if ( args->atomize==SPLIT )
+ {
+ args->abuf = abuf_init(args->hdr, SPLIT);
+ abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
+ if ( args->old_rec_tag )
+ abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+ abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
+ }
}
static void destroy_data(args_t *args)
for (i=0; i<args->ntmp_als; i++)
free(args->tmp_als[i].s);
free(args->tmp_als);
- free(args->tmp_als_str.s);
+ free(args->tmp_kstr.s);
if ( args->tmp_str )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
free(args->tmp_arr1);
free(args->tmp_arr2);
free(args->diploid);
+ if ( args->abuf ) abuf_destroy(args->abuf);
+ bcf_hdr_destroy(args->out_hdr);
if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
if ( args->fai ) fai_destroy(args->fai);
if ( args->mseq ) free(args->seq);
}
-static void normalize_line(args_t *args, bcf1_t **line_ptr)
+static void normalize_line(args_t *args, bcf1_t *line)
{
- bcf1_t *line = *line_ptr;
if ( args->fai )
{
if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
int i,j;
i = j = rbuf_append(&args->rbuf);
- if ( !args->lines[i] ) args->lines[i] = bcf_init1();
- SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+ if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+ args->lines[i] = bcf_dup(line);
while ( rbuf_prev(&args->rbuf,&i) )
{
if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
}
}
+// Return the next record to process, or NULL once the input is exhausted.
+//
+// Without atomization (args->atomize!=SPLIT) this is simply the next line of
+// the synced reader. With atomization every input line is pushed through
+// args->abuf and the records it hands back are returned one per call; records
+// still pending from the previously pushed line are drained before another
+// line is read.
+static bcf1_t *next_atomized_line(args_t *args)
+{
+    if ( args->atomize!=SPLIT )
+        return bcf_sr_next_line(args->files) ? bcf_sr_get_line(args->files,0) : NULL;
+
+    // drain whatever is left over from the previously pushed line
+    bcf1_t *rec = abuf_flush(args->abuf, 0);
+    if ( rec ) return rec;
+
+    // The caller treats NULL as end of input, so keep reading until the buffer
+    // actually yields a record instead of returning NULL for a single pushed
+    // line that produced nothing
+    while ( bcf_sr_next_line(args->files) )
+    {
+        abuf_push(args->abuf,bcf_sr_get_line(args->files,0));
+        rec = abuf_flush(args->abuf, 0);
+        if ( rec ) return rec;
+    }
+    return NULL;
+}
static void normalize_vcf(args_t *args)
{
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
- if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+ args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads )
- hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
- if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
- if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
+ if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
+ if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ bcf1_t *line;
int prev_rid = -1, prev_pos = -1, prev_type = 0;
- while ( bcf_sr_next_line(args->files) )
+ while ( (line = next_atomized_line(args)) )
{
args->ntotal++;
-
- bcf1_t *line = args->files->readers[0].buffer[0];
if ( args->rmdup )
{
int line_type = bcf_get_variant_types(line);
// still on the same chromosome?
int i,j,ilast = rbuf_last(&args->rbuf);
- if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+ if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
int split = 0;
if ( args->mrows_op==MROWS_SPLIT )
args->nsplit++;
split_multiallelic_to_biallelics(args, line);
for (j=0; j<args->ntmp_lines; j++)
- normalize_line(args, &args->tmp_lines[j]);
+ normalize_line(args, args->tmp_lines[j]);
}
else
split = 0;
}
if ( !split )
- normalize_line(args, &args->files->readers[0].buffer[0]);
+ normalize_line(args, line);
// find out how many sites to flush
ilast = rbuf_last(&args->rbuf);
if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
j++;
}
- if ( j>0 ) flush_buffer(args, out, j);
+ if ( j>0 ) flush_buffer(args, args->out, j);
}
- flush_buffer(args, out, args->rbuf.n);
- if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+ flush_buffer(args, args->out, args->rbuf.n);
+ if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
if ( args->check_ref & CHECK_REF_FIX )
fprintf(bcftools_stderr, "Usage: bcftools norm [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
- fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
- fprintf(bcftools_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|all|exact\n");
- fprintf(bcftools_stderr, " -f, --fasta-ref <file> reference sequence\n");
- fprintf(bcftools_stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
- fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(bcftools_stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
+ fprintf(bcftools_stderr, " -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs)\n");
+ fprintf(bcftools_stderr, " --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n");
+ fprintf(bcftools_stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+ fprintf(bcftools_stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n");
+ fprintf(bcftools_stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n");
+ fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence\n");
+ fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+ fprintf(bcftools_stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
+ fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
+ fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # normalize and left-align indels\n");
fprintf(bcftools_stderr, " # split multi-allelic sites\n");
fprintf(bcftools_stderr, " bcftools norm -m- in.vcf\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfnorm(int argc, char *argv[])
args->do_indels = 1;
int region_is_file = 0;
int targets_is_file = 0;
+ args->use_star_allele = 1;
static struct option loptions[] =
{
{"help",no_argument,NULL,'h'},
{"force",no_argument,NULL,7},
+ {"atomize",no_argument,NULL,'a'},
+ {"atom-overlaps",required_argument,NULL,11},
+ {"old-rec-tag",required_argument,NULL,12},
+ {"keep-sum",required_argument,NULL,10},
{"fasta-ref",required_argument,NULL,'f'},
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
{NULL,0,NULL,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
switch (c) {
+ case 10:
+ // possibly generalize this also to INFO/AD and other tags
+ if ( strcasecmp("ad",optarg) )
+ error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
+ args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data
+ break;
+ case 'a': args->atomize = SPLIT; break;
+ case 11 :
+ if ( optarg[0]=='*' ) args->use_star_allele = 1;
+ else if ( optarg[0]=='.' ) args->use_star_allele = 0;
+ else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
+ break;
+ case 12 : args->old_rec_tag = optarg; break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
}
else fname = argv[optind];
- if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n");
+ if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n");
if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT;
if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n");
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **plugin_paths;
char **argv, *output_fname, *regions_list, *targets_list;
- int argc, drop_header, verbose, record_cmd_line;
+ int argc, drop_header, verbose, record_cmd_line, plist_only;
}
args_t;
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
}
else
{
#else
if ( fname[0]=='/' ) is_absolute_path = 1;
#endif
+
+ kstring_t err = {0,0,0};
if ( !is_absolute_path )
{
int i;
#else
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
#endif
- if ( args->verbose > 1 )
- {
- if ( !handle )
+ if ( !handle )
#ifdef _WIN32
- fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError());
+ ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError());
#else
- fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror());
+ ksprintf(&err,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
#endif
- else fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp);
- }
+ else if ( args->verbose > 1 )
+ fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp);
free(tmp);
if ( handle ) return handle;
}
#else
handle = dlopen(fname, RTLD_NOW);
#endif
- if ( args->verbose > 1 )
- {
- if ( !handle )
+ if ( !handle )
#ifdef _WIN32
- fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError());
+ ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError());
#else
- fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror());
+ ksprintf(&err,"%s:\n\tdlopen .. %s\n", fname,dlerror());
#endif
- else fprintf(stderr,"%s:\n\tplugin open .. ok\n", fname);
- }
+ else if ( args->verbose > 1 )
+ fprintf(stderr,"%s:\n\tplugin open .. ok\n", fname);
+
+ if ( !handle && (!args->plist_only || args->verbose>1) )
+ fprintf(stderr,"%s",err.s);
+ free(err.s);
return handle;
}
-static void print_plugin_usage_hint(void)
+static void print_plugin_usage_hint(const char *name)
{
- fprintf(stderr, "\nNo functional bcftools plugins were found");
+ if ( name )
+ fprintf(stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name);
+ else
+ fprintf(stderr, "\nNo functional bcftools plugins were found");
if ( !getenv("BCFTOOLS_PLUGINS") )
- fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+ {
+ fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set");
+#ifdef PLUGINPATH
+ fprintf(stderr,"\nand no usable plugins were found in %s", PLUGINPATH);
+#endif
+ fprintf(stderr,".\n\n");
+ }
else
+ {
fprintf(stderr,
" in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
"- Is the plugin path correct?\n\n"
- "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+ "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n"
"\n",
getenv("BCFTOOLS_PLUGINS")
);
+ }
}
static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
{
if ( exit_on_error )
{
- print_plugin_usage_hint();
+ print_plugin_usage_hint(fname);
error("Could not load \"%s\".\n\n", fname);
}
return -1;
return 0;
}
-static void init_plugin(args_t *args)
+static void check_version(args_t *args)
{
static int warned_bcftools = 0, warned_htslib = 0;
-
- int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
- if ( ret<0 ) error("The plugin exited with an error.\n");
const char *bver, *hver;
args->plugin.version(&bver, &hver);
if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
fprintf(stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
warned_htslib = 1;
}
+}
+
+static void init_plugin(args_t *args)
+{
+ int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+ if ( ret<0 ) error("The plugin exited with an error.\n");
+ check_version(args);
args->drop_header += ret;
}
if ( args->verbose ) printf("\n");
}
else
- print_plugin_usage_hint();
+ print_plugin_usage_hint(NULL);
free(str.s);
return nplugins ? 0 : 1;
}
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
args->n_threads = 0;
args->record_cmd_line = 1;
args->nplugin_paths = -1;
- int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+ int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
if ( argc==1 ) usage(args);
-
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
load_plugin(args, plugin_name, 1, &args->plugin);
if ( args->plugin.run )
{
+ check_version(args);
int ret = args->plugin.run(argc, argv);
destroy_data(args);
free(args);
default: error("The output type \"%s\" not recognised\n", optarg);
};
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
- case 'l': plist_only = 1; break;
+ case 'l': args->plist_only = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?':
default: error("Unknown argument: %s\n", optarg);
}
}
- if ( plist_only ) return list_plugins(args);
- if ( usage_only && ! plugin_name ) usage(args);
+ if ( args->plist_only ) return list_plugins(args);
+ if ( !plugin_name ) usage(args);
if ( version_only )
{
}
char *fname = NULL;
- if ( optind>=argc || argv[optind][0]=='-' )
+ if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) )
{
args->plugin.argc = argc - optind + 1;
args->plugin.argv = argv + optind - 1;
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char **plugin_paths;
char **argv, *output_fname, *regions_list, *targets_list;
- int argc, drop_header, verbose, record_cmd_line;
+ int argc, drop_header, verbose, record_cmd_line, plist_only;
}
args_t;
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose > 1 ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir);
}
else
{
#else
if ( fname[0]=='/' ) is_absolute_path = 1;
#endif
+
+ kstring_t err = {0,0,0};
if ( !is_absolute_path )
{
int i;
#else
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
#endif
- if ( args->verbose > 1 )
- {
- if ( !handle )
+ if ( !handle )
#ifdef _WIN32
- fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError());
+ ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError());
#else
- fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror());
+ ksprintf(&err,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
#endif
- else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp);
- }
+ else if ( args->verbose > 1 )
+ fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp);
free(tmp);
if ( handle ) return handle;
}
#else
handle = dlopen(fname, RTLD_NOW);
#endif
- if ( args->verbose > 1 )
- {
- if ( !handle )
+ if ( !handle )
#ifdef _WIN32
- fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError());
+ ksprintf(&err,"LoadLibraryA .. %lu\n", GetLastError());
#else
- fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror());
+ ksprintf(&err,"%s:\n\tdlopen .. %s\n", fname,dlerror());
#endif
- else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname);
- }
+ else if ( args->verbose > 1 )
+ fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname);
+
+ if ( !handle && (!args->plist_only || args->verbose>1) )
+ fprintf(bcftools_stderr,"%s",err.s);
+ free(err.s);
return handle;
}
-static void print_plugin_usage_hint(void)
+static void print_plugin_usage_hint(const char *name)
{
- fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found");
+ if ( name )
+ fprintf(bcftools_stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name);
+ else
+ fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found");
if ( !getenv("BCFTOOLS_PLUGINS") )
- fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+ {
+ fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set");
+#ifdef PLUGINPATH
+ fprintf(bcftools_stderr,"\nand no usable plugins were found in %s", PLUGINPATH);
+#endif
+ fprintf(bcftools_stderr,".\n\n");
+ }
else
+ {
fprintf(bcftools_stderr,
" in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
"- Is the plugin path correct?\n\n"
- "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+ "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n"
"\n",
getenv("BCFTOOLS_PLUGINS")
);
+ }
}
static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
{
if ( exit_on_error )
{
- print_plugin_usage_hint();
+ print_plugin_usage_hint(fname);
error("Could not load \"%s\".\n\n", fname);
}
return -1;
return 0;
}
-static void init_plugin(args_t *args)
+static void check_version(args_t *args)
{
static int warned_bcftools = 0, warned_htslib = 0;
-
- int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
- if ( ret<0 ) error("The plugin exited with an error.\n");
const char *bver, *hver;
args->plugin.version(&bver, &hver);
if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
fprintf(bcftools_stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
warned_htslib = 1;
}
+}
+
+static void init_plugin(args_t *args)
+{
+ int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+ if ( ret<0 ) error("The plugin exited with an error.\n");
+ check_version(args);
args->drop_header += ret;
}
if ( args->verbose ) fprintf(bcftools_stdout, "\n");
}
else
- print_plugin_usage_hint();
+ print_plugin_usage_hint(NULL);
free(str.s);
return nplugins ? 0 : 1;
}
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
fprintf(bcftools_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
fprintf(bcftools_stderr, " -V, --version print version string and exit\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
static int is_verbose(int argc, char *argv[])
args->n_threads = 0;
args->record_cmd_line = 1;
args->nplugin_paths = -1;
- int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+ int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
if ( argc==1 ) usage(args);
-
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
load_plugin(args, plugin_name, 1, &args->plugin);
if ( args->plugin.run )
{
+ check_version(args);
int ret = args->plugin.run(argc, argv);
destroy_data(args);
free(args);
default: error("The output type \"%s\" not recognised\n", optarg);
};
break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
- case 'l': plist_only = 1; break;
+ case 'l': args->plist_only = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?':
default: error("Unknown argument: %s\n", optarg);
}
}
- if ( plist_only ) return list_plugins(args);
- if ( usage_only && ! plugin_name ) usage(args);
+ if ( args->plist_only ) return list_plugins(args);
+ if ( !plugin_name ) usage(args);
if ( version_only )
{
}
char *fname = NULL;
- if ( optind>=argc || argv[optind][0]=='-' )
+ if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) )
{
args->plugin.argc = argc - optind + 1;
args->plugin.argv = argv + optind - 1;
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
fprintf(stderr, " -H, --print-header print header\n");
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -l, --list-samples print the list of samples and exit\n");
- fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -s, --samples <list> list of samples to include\n");
{"exclude",1,0,'e'},
{"format",1,0,'f'},
{"output-file",1,0,'o'},
+ {"output",1,0,'o'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"targets",1,0,'t'},
args->format_str = str.s;
break;
}
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
fprintf(bcftools_stderr, " -H, --print-header print header\n");
fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(bcftools_stderr, " -l, --list-samples print the list of samples and exit\n");
- fprintf(bcftools_stderr, " -o, --output-file <file> output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -o, --output <file> output file name [bcftools_stdout]\n");
fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " -s, --samples <list> list of samples to include\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfquery(int argc, char *argv[])
{"exclude",1,0,'e'},
{"format",1,0,'f'},
{"output-file",1,0,'o'},
+ {"output",1,0,'o'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"targets",1,0,'t'},
args->format_str = str.s;
break;
}
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 't': args->targets_list = optarg; break;
/* vcfroh.c -- HMM model for detecting runs of autozygosity.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int ntot; // some stats to detect if things didn't go wrong
int nno_af; // number of sites rejected because AF could not be determined
int nfiltered; // .. because of filters
- int nnot_biallelic, ndup;
+ int nno_alt, nmultiallelic, ndup;
smpl_t *smpl; // HMM data for each sample
smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ int include_noalt_sites;
BGZF *out;
kstring_t str;
{
smpl->rg.state = 1;
smpl->rg.beg = smpl->sites[i];
+ smpl->rg.end = smpl->sites[i];
smpl->rg.rid = args->prev_rid;
smpl->rg.qual = qual;
smpl->rg.nqual = 1;
}
}
-int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
+ if ( tgt->nals < 2 )
+ error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s);
if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match
int i;
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, &alt_freq);
+ ret = read_AF(args, args->files->targets, line, &alt_freq);
}
else if ( args->dflt_AF > 0 )
{
// Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
- if ( line->n_allele==1 ) { args->nnot_biallelic++; return; } // no ALT allele
- if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; } // cannot be bi-allelic, even with <*>
// This can be raw callable VCF with the symbolic unseen allele <*>
- int ial = 0;
+ int ial = 0, nalt = line->n_allele - 1;
for (i=1; i<line->n_allele; i++)
- if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
- if ( ial==0 ) // normal VCF, the symbolic allele is not present
{
- if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; } // not biallelic
- ial = 1;
+ if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("<NON_REF>",line->d.allele[i]) ) nalt--;
+ else if ( !ial ) ial = i;
}
- else
+
+ if ( !nalt ) // no ALT allele
{
- if ( line->n_allele!=3 ) return; // not biallelic
- ial = ial==1 ? 2 : 1; // <*> can come in any order
+ args->nno_alt++;
+ if ( !args->include_noalt_sites ) return;
+ }
+ else if ( nalt>1 )
+ {
+ args->nmultiallelic++;
+ return;
}
+
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
int skip_rid = 0;
if ( args->prev_rid<0 )
- {
- args->prev_rid = line->rid;
- args->prev_pos = line->pos;
skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
- }
// New chromosome?
if ( args->prev_rid!=line->rid )
fprintf(stderr, "General Options:\n");
fprintf(stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
- fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
+ fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
fprintf(stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n");
fprintf(stderr, " --include <expr> select sites for which the expression is true\n");
fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
+ fprintf(stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n");
fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
fprintf(stderr, " is replaced with chromosome name\n");
{"AF-dflt",1,0,2},
{"include",1,0,3},
{"exclude",1,0,4},
+ {"include-noalt",0,0,5},
{"buffer-size",1,0,'b'},
{"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
- case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
- case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+ case 3 :
+ if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 4 :
+ if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 5: args->include_noalt_sites = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
fprintf(stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
else
fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
- fprintf(stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
+ fprintf(stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup);
if ( nmin==0 )
{
fprintf(stderr,"No usable sites were found.\n");
/* vcfroh.c -- HMM model for detecting runs of autozygosity.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int ntot; // some stats to detect if things didn't go wrong
int nno_af; // number of sites rejected because AF could not be determined
int nfiltered; // .. because of filters
- int nnot_biallelic, ndup;
+ int nno_alt, nmultiallelic, ndup;
smpl_t *smpl; // HMM data for each sample
smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ int include_noalt_sites;
BGZF *out;
kstring_t str;
{
smpl->rg.state = 1;
smpl->rg.beg = smpl->sites[i];
+ smpl->rg.end = smpl->sites[i];
smpl->rg.rid = args->prev_rid;
smpl->rg.qual = qual;
smpl->rg.nqual = 1;
}
}
-int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
+ if ( tgt->nals < 2 )
+ error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s);
if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match
int i;
case BCF_BT_INT8: BRANCH(int8_t); break;
case BCF_BT_INT16: BRANCH(int16_t); break;
case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
}
#undef BRANCH
}
case BCF_BT_INT8: BRANCH(int8_t); break;
case BCF_BT_INT16: BRANCH(int16_t); break;
case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
}
#undef BRANCH
}
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, &alt_freq);
+ ret = read_AF(args, args->files->targets, line, &alt_freq);
}
else if ( args->dflt_AF > 0 )
{
case BCF_BT_INT8: BRANCH(int8_t); break;
case BCF_BT_INT16: BRANCH(int16_t); break;
case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
}
#undef BRANCH
}
// Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
- if ( line->n_allele==1 ) { args->nnot_biallelic++; return; } // no ALT allele
- if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; } // cannot be bi-allelic, even with <*>
// This can be raw callable VCF with the symbolic unseen allele <*>
- int ial = 0;
+ int ial = 0, nalt = line->n_allele - 1;
for (i=1; i<line->n_allele; i++)
- if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
- if ( ial==0 ) // normal VCF, the symbolic allele is not present
{
- if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; } // not biallelic
- ial = 1;
+ if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("<NON_REF>",line->d.allele[i]) ) nalt--;
+ else if ( !ial ) ial = i;
}
- else
+
+ if ( !nalt ) // no ALT allele
{
- if ( line->n_allele!=3 ) return; // not biallelic
- ial = ial==1 ? 2 : 1; // <*> can come in any order
+ args->nno_alt++;
+ if ( !args->include_noalt_sites ) return;
+ }
+ else if ( nalt>1 )
+ {
+ args->nmultiallelic++;
+ return;
}
+
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
int skip_rid = 0;
if ( args->prev_rid<0 )
- {
- args->prev_rid = line->rid;
- args->prev_pos = line->pos;
skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
- }
// New chromosome?
if ( args->prev_rid!=line->rid )
fprintf(bcftools_stderr, "General Options:\n");
fprintf(bcftools_stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(bcftools_stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
- fprintf(bcftools_stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
+ fprintf(bcftools_stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
fprintf(bcftools_stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
fprintf(bcftools_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
fprintf(bcftools_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
fprintf(bcftools_stderr, " Safe value to use is 30 to account for GT errors.\n");
fprintf(bcftools_stderr, " --include <expr> select sites for which the expression is true\n");
fprintf(bcftools_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
+ fprintf(bcftools_stderr, " --include-noalt include sites with no ALT allele (ignored by default)\n");
fprintf(bcftools_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
fprintf(bcftools_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
fprintf(bcftools_stderr, " is replaced with chromosome name\n");
fprintf(bcftools_stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
fprintf(bcftools_stderr, " -V, --viterbi-training <float> estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfroh(int argc, char *argv[])
{"AF-dflt",1,0,2},
{"include",1,0,3},
{"exclude",1,0,4},
+ {"include-noalt",0,0,5},
{"buffer-size",1,0,'b'},
{"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
- case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
- case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+ case 3 :
+ if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 4 :
+ if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 5: args->include_noalt_sites = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
fprintf(bcftools_stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
else
fprintf(bcftools_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
- fprintf(bcftools_stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
+ fprintf(bcftools_stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup);
if ( nmin==0 )
{
fprintf(bcftools_stderr,"No usable sites were found.\n");
/* vcfsom.c -- SOM (Self-Organizing Map) filtering.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2020 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
static void som_write_map(char *prefix, som_t **som, int nsom)
{
FILE *fp = open_file(NULL,"w","%s.som",prefix);
- fwrite("SOMv1",5,1,fp);
- fwrite(&nsom,sizeof(int),1,fp);
+ // Write the maps to "<prefix>.som": the 5-byte "SOMv1" tag, nsom, and then for each map
+ // its size, kdim, the w array (size*kdim doubles) and the c array (size doubles).
+ // fwrite() returns the number of items written, not the number of bytes. Each call
+ // therefore writes byte-sized items so the return value can be compared to a byte count.
+ size_t nbytes;
+ if ( fwrite("SOMv1",1,5,fp)!=5 ) error("Failed to write 5 bytes\n");
+ if ( fwrite(&nsom,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
int i;
for (i=0; i<nsom; i++)
{
- fwrite(&som[i]->size,sizeof(int),1,fp);
- fwrite(&som[i]->kdim,sizeof(int),1,fp);
- fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
- fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+ if ( fwrite(&som[i]->size,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+ if ( fwrite(&som[i]->kdim,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+ nbytes = sizeof(double)*som[i]->size*som[i]->kdim;
+ if ( fwrite(som[i]->w,1,nbytes,fp)!=nbytes ) error("Failed to write %zu bytes\n",nbytes);
+ nbytes = sizeof(double)*som[i]->size;
+ if ( fwrite(som[i]->c,1,nbytes,fp)!=nbytes ) error("Failed to write %zu bytes\n",nbytes);
}
if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
}
/* vcfsom.c -- SOM (Self-Organizing Map) filtering.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2020 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
static void som_write_map(char *prefix, som_t **som, int nsom)
{
FILE *fp = open_file(NULL,"w","%s.som",prefix);
- fwrite("SOMv1",5,1,fp);
- fwrite(&nsom,sizeof(int),1,fp);
+ // Write the maps to "<prefix>.som": the 5-byte "SOMv1" tag, nsom, and then for each map
+ // its size, kdim, the w array (size*kdim doubles) and the c array (size doubles).
+ // fwrite() returns the number of items written, not the number of bytes. Each call
+ // therefore writes byte-sized items so the return value can be compared to a byte count.
+ size_t nbytes;
+ if ( fwrite("SOMv1",1,5,fp)!=5 ) error("Failed to write 5 bytes\n");
+ if ( fwrite(&nsom,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
int i;
for (i=0; i<nsom; i++)
{
- fwrite(&som[i]->size,sizeof(int),1,fp);
- fwrite(&som[i]->kdim,sizeof(int),1,fp);
- fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
- fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+ if ( fwrite(&som[i]->size,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+ if ( fwrite(&som[i]->kdim,1,sizeof(int),fp)!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+ nbytes = sizeof(double)*som[i]->size*som[i]->kdim;
+ if ( fwrite(som[i]->w,1,nbytes,fp)!=nbytes ) error("Failed to write %zu bytes\n",nbytes);
+ nbytes = sizeof(double)*som[i]->size;
+ if ( fwrite(som[i]->c,1,nbytes,fp)!=nbytes ) error("Failed to write %zu bytes\n",nbytes);
}
if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
}
fprintf(bcftools_stderr, " -n, --ntrain-sites <int> effective number of training sites [number of good sites]\n");
fprintf(bcftools_stderr, " -r, --random-seed <int> random seed, 0 for time() [1]\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfsom(int argc, char *argv[])
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017 Genome Research Ltd.
+ Copyright (C) 2017-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
blk_read(args, bhp, args->hdr, blk);
}
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
while ( bhp->ndat )
{
fprintf(stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -m, --max-mem <float>[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
- fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -T, --temp-dir <dir> temporary files [/tmp/bcftools-sort.XXXXXX]\n");
+ fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(stderr, " -o, --output FILE output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+#ifdef _WIN32
+ fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
+#else
+ fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
+#endif
fprintf(stderr, "\n");
exit(1);
}
-size_t parse_mem_string(char *str)
+size_t parse_mem_string(const char *str)
{
char *tmp;
double mem = strtod(str, &tmp);
- if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+ if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str);
if ( !strcasecmp("k",tmp) ) mem *= 1000;
else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
void mkdir_p(const char *fmt, ...);
static void init(args_t *args)
{
-#ifdef _WIN32
- char tmp_path[MAX_PATH];
- int ret = GetTempPath(MAX_PATH, tmp_path);
- if (!ret || ret > MAX_PATH)
- error("Could not get the path to the temporary folder\n");
- if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH)
- error("Full path to the temporary folder is too long\n");
- strcat(tmp_path, "/bcftools-sort.XXXXXX");
- args->tmp_dir = strdup(tmp_path);
-#else
- args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX");
-#endif
- size_t len = strlen(args->tmp_dir);
- if ( !strcmp("XXXXXX",args->tmp_dir+len-6) )
- {
+ args->tmp_dir = init_tmp_prefix(args->tmp_dir);
+
#ifdef _WIN32
int ret = mkdir(mktemp(args->tmp_dir), 0700);
if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno));
int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR);
if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno));
#endif
- }
- else {
- mkdir_p("%s/",args->tmp_dir);
- }
fprintf(stderr,"Writing to %s\n", args->tmp_dir);
}
{"temp-dir",required_argument,NULL,'T'},
{"output-type",required_argument,NULL,'O'},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{"help",no_argument,NULL,'h'},
{0,0,0,0}
};
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017 Genome Research Ltd.
+ Copyright (C) 2017-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
vfprintf(bcftools_stderr, format, ap);
va_end(ap);
clean_files(args);
- exit(-1);
+ bcftools_exit(-1);
}
int cmp_bcf_pos(const void *aptr, const void *bptr)
blk_read(args, bhp, args->hdr, blk);
}
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+ htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
while ( bhp->ndat )
{
fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -m, --max-mem <float>[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
- fprintf(bcftools_stderr, " -o, --output-file <file> output file name [bcftools_stdout]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -T, --temp-dir <dir> temporary files [/tmp/bcftools-sort.XXXXXX]\n");
+ fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+#ifdef _WIN32
+ fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
+#else
+ fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
+#endif
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
-size_t parse_mem_string(char *str)
+size_t parse_mem_string(const char *str)
{
char *tmp;
double mem = strtod(str, &tmp);
- if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+ if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str);
if ( !strcasecmp("k",tmp) ) mem *= 1000;
else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
void mkdir_p(const char *fmt, ...);
static void init(args_t *args)
{
-#ifdef _WIN32
- char tmp_path[MAX_PATH];
- int ret = GetTempPath(MAX_PATH, tmp_path);
- if (!ret || ret > MAX_PATH)
- error("Could not get the path to the temporary folder\n");
- if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH)
- error("Full path to the temporary folder is too long\n");
- strcat(tmp_path, "/bcftools-sort.XXXXXX");
- args->tmp_dir = strdup(tmp_path);
-#else
- args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX");
-#endif
- size_t len = strlen(args->tmp_dir);
- if ( !strcmp("XXXXXX",args->tmp_dir+len-6) )
- {
+ args->tmp_dir = init_tmp_prefix(args->tmp_dir);
+
#ifdef _WIN32
int ret = mkdir(mktemp(args->tmp_dir), 0700);
if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno));
int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR);
if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno));
#endif
- }
- else {
- mkdir_p("%s/",args->tmp_dir);
- }
fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir);
}
{"temp-dir",required_argument,NULL,'T'},
{"output-type",required_argument,NULL,'O'},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{"help",no_argument,NULL,'h'},
{0,0,0,0}
};
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2017 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include "bcftools.h"
#include "filter.h"
#include "bin.h"
+#include "dist.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
float min, max;
uint64_t *vals_ts, *vals_tv;
void *val;
- int nbins, type, m_val;
+ int nbins, type, m_val, idx;
}
user_stats_t;
#endif
int ts_alt1, tv_alt1;
#if QUAL_STATS
- int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+ // Values are truncated to one decimal place and 1 is added (Q*10+1); missing and negative values go in the first bin
+ // Only SNPs that are the 1st alternate allele are counted
+ dist_t *qual_ts, *qual_tv, *qual_indels;
#endif
int *insertions, *deletions, m_indel; // maximum indel length
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
return i-1+d->min;
}
-static inline int clip_nonnegative(float x, int limit)
-{
- if (x >= limit || isnan(x)) return limit - 1;
- else if (x <= 0.0) return 0;
- else return (int) x;
-}
-
#define IC_DBG 0
#if IC_DBG
static void _indel_ctx_print1(_idc1_t *idc)
args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
user_stats_t *usr = &args->usr[args->nusr-1];
memset(usr,0,sizeof(*usr));
- usr->min = 0;
- usr->max = 1;
+ usr->min = 0;
+ usr->max = 1;
usr->nbins = 100;
+ usr->idx = 0;
char *tmp = str;
while ( *tmp && *tmp!=':' ) tmp++;
+
+ // Tag with an index or just tag? (e.g. PV4[1] vs DP)
+ if ( tmp > str && tmp[-1]==']' )
+ {
+ char *ptr = tmp;
+ while ( ptr>str && *ptr!='[' ) ptr--;
+ if ( *ptr=='[' )
+ {
+ char *ptr2;
+ usr->idx = strtol(ptr+1, &ptr2, 10);
+ if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx);
+ if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str);
+ *ptr = 0;
+ }
+ }
+
usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
memcpy(usr->tag,str,tmp-str);
int j;
for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
#if QUAL_STATS
- stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_ts = dist_init(5);
+ stats->qual_tv = dist_init(5);
+ stats->qual_indels = dist_init(5);
#endif
if ( args->files->n_smpl )
{
for (j=0; j<3; j++)
if (stats->af_repeats[j]) free(stats->af_repeats[j]);
#if QUAL_STATS
- if (stats->qual_ts) free(stats->qual_ts);
- if (stats->qual_tv) free(stats->qual_tv);
- if (stats->qual_snps) free(stats->qual_snps);
- if (stats->qual_indels) free(stats->qual_indels);
+ if (stats->qual_ts) dist_destroy(stats->qual_ts);
+ if (stats->qual_tv) dist_destroy(stats->qual_tv);
+ if (stats->qual_indels) dist_destroy(stats->qual_indels);
#endif
#if HWE_STATS
free(stats->af_hwe);
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = clip_nonnegative(line->qual, args->m_qual);
- stats->qual_indels[iqual]++;
+ int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
+ dist_insert(stats->qual_indels, iqual);
#endif
// Check if the indel is near an exon for the frameshift statistics
static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
{
- int i;
+ int i, nval;
for (i=0; i<stats->nusr; i++)
{
user_stats_t *usr = &stats->usr[i];
float val;
if ( usr->type==BCF_HT_REAL )
{
- if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
- val = ((float*)usr->val)[0];
+ if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+ if ( usr->idx >= nval ) continue;
+ val = ((float*)usr->val)[usr->idx];
}
else
{
- if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
- val = ((int32_t*)usr->val)[0];
+ if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+ if ( usr->idx >= nval ) continue;
+ val = ((int32_t*)usr->val)[usr->idx];
}
int idx;
if ( val<=usr->min ) idx = 0;
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = clip_nonnegative(line->qual, args->m_qual);
- stats->qual_snps[iqual]++;
+ int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
#endif
int i;
{
stats->ts_alt1++;
#if QUAL_STATS
- stats->qual_ts[iqual]++;
+ dist_insert(stats->qual_ts,iqual);
#endif
do_user_stats(stats, reader, 1);
}
{
stats->tv_alt1++;
#if QUAL_STATS
- stats->qual_tv[iqual]++;
+ dist_insert(stats->qual_tv,iqual);
#endif
do_user_stats(stats, reader, 0);
}
}
}
#if QUAL_STATS
- printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+ printf("# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
- for (i=0; i<args->m_qual; i++)
+ int ndist_ts = dist_nbins(stats->qual_ts);
+ int ndist_tv = dist_nbins(stats->qual_tv);
+ int ndist_in = dist_nbins(stats->qual_indels);
+ int ndist_max = ndist_ts;
+ if ( ndist_max < ndist_tv ) ndist_max = ndist_tv;
+ if ( ndist_max < ndist_in ) ndist_max = ndist_in;
+ uint32_t beg, end;
+ uint32_t nts, ntv, nin;
+ for (i=0; i<ndist_max; i++)
{
- if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue;
- printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+ nts = ntv = nin = 0;
+ float qval = -1;
+ if ( i < ndist_ts )
+ {
+ nts = dist_get(stats->qual_ts, i, &beg, &end);
+ qval = beg>0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( i < ndist_tv )
+ {
+ ntv = dist_get(stats->qual_tv, i, &beg, &end);
+ if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( i < ndist_in )
+ {
+ nin = dist_get(stats->qual_indels, i, &beg, &end);
+ if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( nts+ntv+nin==0 ) continue;
+
+ printf("QUAL\t%d\t",id);
+ if ( qval==-1 ) printf(".");
+ else printf("%.1f",qval);
+ printf("\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin);
}
}
#endif
for (i=0; i<args->nusr; i++)
{
- printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
- args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+ printf("# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+ args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx);
for (id=0; id<args->nstats; id++)
{
user_stats_t *usr = &args->stats[id].usr[i];
{
if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins
float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
- const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
- printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+ const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n";
+ printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
}
}
}
printf("# NRD and discordance is calculated as follows:\n");
printf("# m .. number of matches\n");
printf("# x .. number of mismatches\n");
- printf("# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
- printf("# RR discordance = xRR / (xRR + mRR)\n");
- printf("# RA discordance = xRA / (xRA + mRA)\n");
- printf("# AA discordance = xAA / (xAA + mAA)\n");
+ printf("# NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+ printf("# RR discordance = 100 * xRR / (xRR + mRR)\n");
+ printf("# RA discordance = 100 * xRA / (xRA + mRA)\n");
+ printf("# AA discordance = 100 * xAA / (xAA + mAA)\n");
printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
}
else
fprintf(stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
- fprintf(stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
- fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
- fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
- fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
- fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
+ fprintf(stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n");
+ fprintf(stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n");
+ fprintf(stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n");
+ fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n");
+ fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -v, --verbose Produce verbose per-site and per-sample output\n");
fprintf(stderr, "\n");
exit(1);
}
case 's': args->samples_list = optarg; break;
case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
case 'I': args->split_by_id = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage(); break;
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2017 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <unistd.h>
#include <getopt.h>
+#include <assert.h>
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include "bcftools.h"
#include "filter.h"
#include "bin.h"
+#include "dist.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
float min, max;
uint64_t *vals_ts, *vals_tv;
void *val;
- int nbins, type, m_val;
+ int nbins, type, m_val, idx;
}
user_stats_t;
#endif
int ts_alt1, tv_alt1;
#if QUAL_STATS
- int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+ // Values are truncated to one decimal place and 1 is added (Q*10+1); missing and negative values go in the first bin
+ // Only SNPs that are the 1st alternate allele are counted
+ dist_t *qual_ts, *qual_tv, *qual_indels;
#endif
int *insertions, *deletions, m_indel; // maximum indel length
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
return i-1+d->min;
}
-static inline int clip_nonnegative(float x, int limit)
-{
- if (x >= limit || isnan(x)) return limit - 1;
- else if (x <= 0.0) return 0;
- else return (int) x;
-}
-
#define IC_DBG 0
#if IC_DBG
static void _indel_ctx_print1(_idc1_t *idc)
args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
user_stats_t *usr = &args->usr[args->nusr-1];
memset(usr,0,sizeof(*usr));
- usr->min = 0;
- usr->max = 1;
+ usr->min = 0;
+ usr->max = 1;
usr->nbins = 100;
+ usr->idx = 0;
char *tmp = str;
while ( *tmp && *tmp!=':' ) tmp++;
+
+ // Tag with an index or just tag? (e.g. PV4[1] vs DP)
+ if ( tmp > str && tmp[-1]==']' )
+ {
+ char *ptr = tmp;
+ while ( ptr>str && *ptr!='[' ) ptr--;
+ if ( *ptr=='[' )
+ {
+ char *ptr2;
+ usr->idx = strtol(ptr+1, &ptr2, 10);
+ if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx);
+ if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str);
+ *ptr = 0;
+ }
+ }
+
usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
memcpy(usr->tag,str,tmp-str);
int j;
for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
#if QUAL_STATS
- stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int));
- stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+ stats->qual_ts = dist_init(5);
+ stats->qual_tv = dist_init(5);
+ stats->qual_indels = dist_init(5);
#endif
if ( args->files->n_smpl )
{
for (j=0; j<3; j++)
if (stats->af_repeats[j]) free(stats->af_repeats[j]);
#if QUAL_STATS
- if (stats->qual_ts) free(stats->qual_ts);
- if (stats->qual_tv) free(stats->qual_tv);
- if (stats->qual_snps) free(stats->qual_snps);
- if (stats->qual_indels) free(stats->qual_indels);
+ if (stats->qual_ts) dist_destroy(stats->qual_ts);
+ if (stats->qual_tv) dist_destroy(stats->qual_tv);
+ if (stats->qual_indels) dist_destroy(stats->qual_indels);
#endif
#if HWE_STATS
free(stats->af_hwe);
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = clip_nonnegative(line->qual, args->m_qual);
- stats->qual_indels[iqual]++;
+ int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
+ dist_insert(stats->qual_indels, iqual);
#endif
// Check if the indel is near an exon for the frameshift statistics
static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
{
- int i;
+ int i, nval;
for (i=0; i<stats->nusr; i++)
{
user_stats_t *usr = &stats->usr[i];
float val;
if ( usr->type==BCF_HT_REAL )
{
- if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
- val = ((float*)usr->val)[0];
+ if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+ if ( usr->idx >= nval ) continue;
+ val = ((float*)usr->val)[usr->idx];
}
else
{
- if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
- val = ((int32_t*)usr->val)[0];
+ if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+ if ( usr->idx >= nval ) continue;
+ val = ((int32_t*)usr->val)[usr->idx];
}
int idx;
if ( val<=usr->min ) idx = 0;
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = clip_nonnegative(line->qual, args->m_qual);
- stats->qual_snps[iqual]++;
+ int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
#endif
int i;
{
stats->ts_alt1++;
#if QUAL_STATS
- stats->qual_ts[iqual]++;
+ dist_insert(stats->qual_ts,iqual);
#endif
do_user_stats(stats, reader, 1);
}
{
stats->tv_alt1++;
#if QUAL_STATS
- stats->qual_tv[iqual]++;
+ dist_insert(stats->qual_tv,iqual);
#endif
do_user_stats(stats, reader, 0);
}
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break;
}
#undef BRANCH_INT
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
}
#undef BRANCH_INT
}
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
}
#undef BRANCH_INT
}
}
}
#if QUAL_STATS
- fprintf(bcftools_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+ fprintf(bcftools_stdout, "# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
- for (i=0; i<args->m_qual; i++)
+ int ndist_ts = dist_nbins(stats->qual_ts);
+ int ndist_tv = dist_nbins(stats->qual_tv);
+ int ndist_in = dist_nbins(stats->qual_indels);
+ int ndist_max = ndist_ts;
+ if ( ndist_max < ndist_tv ) ndist_max = ndist_tv;
+ if ( ndist_max < ndist_in ) ndist_max = ndist_in;
+ uint32_t beg, end;
+ uint32_t nts, ntv, nin;
+ for (i=0; i<ndist_max; i++)
{
- if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue;
- fprintf(bcftools_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+ nts = ntv = nin = 0;
+ float qval = -1;
+ if ( i < ndist_ts )
+ {
+ nts = dist_get(stats->qual_ts, i, &beg, &end);
+ qval = beg>0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( i < ndist_tv )
+ {
+ ntv = dist_get(stats->qual_tv, i, &beg, &end);
+ if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( i < ndist_in )
+ {
+ nin = dist_get(stats->qual_indels, i, &beg, &end);
+ if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+ }
+ if ( nts+ntv+nin==0 ) continue;
+
+ fprintf(bcftools_stdout, "QUAL\t%d\t",id);
+ if ( qval==-1 ) fprintf(bcftools_stdout, ".");
+ else fprintf(bcftools_stdout, "%.1f",qval);
+ fprintf(bcftools_stdout, "\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin);
}
}
#endif
for (i=0; i<args->nusr; i++)
{
- fprintf(bcftools_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
- args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+ fprintf(bcftools_stdout, "# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+ args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx);
for (id=0; id<args->nstats; id++)
{
user_stats_t *usr = &args->stats[id].usr[i];
{
if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins
float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
- const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
- fprintf(bcftools_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+ const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n";
+ fprintf(bcftools_stdout, fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
}
}
}
fprintf(bcftools_stdout, "# NRD and discordance is calculated as follows:\n");
fprintf(bcftools_stdout, "# m .. number of matches\n");
fprintf(bcftools_stdout, "# x .. number of mismatches\n");
- fprintf(bcftools_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
- fprintf(bcftools_stdout, "# RR discordance = xRR / (xRR + mRR)\n");
- fprintf(bcftools_stdout, "# RA discordance = xRA / (xRA + mRA)\n");
- fprintf(bcftools_stdout, "# AA discordance = xAA / (xAA + mAA)\n");
+ fprintf(bcftools_stdout, "# NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+ fprintf(bcftools_stdout, "# RR discordance = 100 * xRR / (xRR + mRR)\n");
+ fprintf(bcftools_stdout, "# RA discordance = 100 * xRA / (xRA + mRA)\n");
+ fprintf(bcftools_stdout, "# AA discordance = 100 * xAA / (xAA + mAA)\n");
fprintf(bcftools_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
}
else
fprintf(bcftools_stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
- fprintf(bcftools_stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
- fprintf(bcftools_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
- fprintf(bcftools_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(bcftools_stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
- fprintf(bcftools_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(bcftools_stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
- fprintf(bcftools_stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(bcftools_stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
+ fprintf(bcftools_stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(bcftools_stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n");
+ fprintf(bcftools_stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n");
+ fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(bcftools_stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n");
+ fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(bcftools_stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(bcftools_stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbose Produce verbose per-site and per-sample output\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfstats(int argc, char *argv[])
case 's': args->samples_list = optarg; break;
case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
case 'I': args->split_by_id = 1; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage(); break;
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
}
// setup output
+ const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);
char modew[8];
- strcpy(modew, "w");
+ strcpy(modew,tmp);
if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
- if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF
- else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF
- else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
if ( args->n_threads > 0)
fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output-file <file> output file name [stdout]\n");
+ fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
{"force-samples",no_argument,NULL,1},
{"output-type",required_argument,NULL,'O'},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{"types",required_argument,NULL,'v'},
{"exclude-types",required_argument,NULL,'V'},
{"targets",required_argument,NULL,'t'},
break;
case 'v': args->include_types = optarg; break;
case 'V': args->exclude_types = optarg; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
-
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'c':
{
args->min_ac_type = ALLELE_NONREF;
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2018 Genome Research Ltd.
+ Copyright (C) 2013-2021 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
if (args->include_types || args->exclude_types) {
if (args->include_types && args->exclude_types) {
fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n");
- exit(1);
+ bcftools_exit(1);
}
char **type_list = 0;
int m = 0, n = 0;
else {
fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
- exit(1);
+ bcftools_exit(1);
}
}
}
else {
fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
- exit(1);
+ bcftools_exit(1);
}
}
}
}
// setup output
+ const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);
char modew[8];
- strcpy(modew, "w");
+ strcpy(modew,tmp);
if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
- if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF
- else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF
- else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
if ( args->n_threads > 0)
case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
- default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+ default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
}
#undef BRANCH_INT
if (!sample_phased) {
fprintf(bcftools_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
fprintf(bcftools_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output-file <file> output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -o, --output <file> output file name [bcftools_stdout]\n");
fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(bcftools_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(bcftools_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(bcftools_stderr, "\n");
- exit(1);
+ bcftools_exit(1);
}
int main_vcfview(int argc, char *argv[])
{"force-samples",no_argument,NULL,1},
{"output-type",required_argument,NULL,'O'},
{"output-file",required_argument,NULL,'o'},
+ {"output",required_argument,NULL,'o'},
{"types",required_argument,NULL,'v'},
{"exclude-types",required_argument,NULL,'V'},
{"targets",required_argument,NULL,'t'},
break;
case 'v': args->include_types = optarg; break;
case 'V': args->exclude_types = optarg; break;
- case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
-
+ case 'e':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i':
+ if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+ args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'c':
{
args->min_ac_type = ALLELE_NONREF;
/* vcmp.c -- reference allele utility functions.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013-2015, 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* vcmp.c -- reference allele utility functions.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013-2015, 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* vcmp.h -- reference allele utility functions.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2015 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* version.c -- report version numbers for plugins.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
+#include <strings.h>
#include <errno.h>
#include <htslib/hts.h>
#include "bcftools.h"
exit(-1);
}
-
const char *hts_bcf_wmode(int file_type)
{
if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF
return "w"; // uncompressed VCF
}
+const char *hts_bcf_wmode2(int file_type, char *fname)  /* Pick the write-mode string for an output file: a recognised file name suffix takes precedence over file_type. */
+{
+ if ( !fname ) return hts_bcf_wmode(file_type);  // no file name given: the mode depends on file_type alone
+ int len = strlen(fname);
+ if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);  // suffixes are compared case-insensitively; the len check keeps fname+len-4 inside the string
+ if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
+ if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);  // .vcf.bgz is handled the same way as .vcf.gz
+ return hts_bcf_wmode(file_type);  // unrecognised suffix: fall back to the caller's file_type
+}
/* version.c -- report version numbers for plugins.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
+#include <strings.h>
#include <errno.h>
#include <htslib/hts.h>
#include "bcftools.h"
va_start(ap, format);
vfprintf(bcftools_stderr, format, ap);
va_end(ap);
- exit(-1);
+ bcftools_exit(-1);
}
void error_errno(const char *format, ...)
} else {
fprintf(bcftools_stderr, "\n");
}
- exit(-1);
+ bcftools_exit(-1);
}
-
const char *hts_bcf_wmode(int file_type)
{
if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF
return "w"; // uncompressed VCF
}
+const char *hts_bcf_wmode2(int file_type, char *fname)  /* Pick the write-mode string for an output file: a recognised file name suffix takes precedence over file_type. */
+{
+ if ( !fname ) return hts_bcf_wmode(file_type);  // no file name given: the mode depends on file_type alone
+ int len = strlen(fname);
+ if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);  // suffixes are compared case-insensitively; the len check keeps fname+len-4 inside the string
+ if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
+ if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+ if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);  // .vcf.bgz is handled the same way as .vcf.gz
+ return hts_bcf_wmode(file_type);  // unrecognised suffix: fall back to the caller's file_type
+}
#!/bin/sh
+# version.sh
+#
+# Author : Petr Danecek <pd3@sanger.ac.uk>
+#
+# Copyright (C) 2018-2021 Genome Research Ltd.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.10.2
+VERSION=1.13
# If we have a git clone, then check against the current tag
if [ -e .git ]
class CyExtension(Extension):
def __init__(self, *args, **kwargs):
self._init_func = kwargs.pop("init_func", None)
+ self._prebuild_func = kwargs.pop("prebuild_func", None)
Extension.__init__(self, *args, **kwargs)
def extend_includes(self, includes):
ext.extra_link_args = []
ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
-
+
+ if isinstance(ext, CyExtension) and ext._prebuild_func:
+ ext._prebuild_func(ext, self.force)
+
build_ext.build_extension(self, ext)
"htslib": (
'htslib/tabix.c', 'htslib/bgzip.c',
'htslib/htsfile.c',
- "test"),
+ "test", "tests"),
}
else:
lines = re.sub(r"int main\(", "int {}_{}_main(".format(
basename, subname), lines)
+ if basename == "samtools":
+ lines = re.sub(r"main_(reheader)\(",
+ r"samtools_main_\1(", lines)
+ lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines)
lines = re.sub("stderr", "{}_stderr".format(basename), lines)
lines = re.sub("stdout", "{}_stdout".format(basename), lines)
lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
fn = os.path.basename(filename)
# some specific fixes:
SPECIFIC_SUBSTITUTIONS = {
- "bamtk.c": (
- 'else if (strcmp(argv[1], "tview") == 0)',
- '//else if (strcmp(argv[1], "tview") == 0)'),
"bam_md.c": (
'sam_open_format("-", mode_w',
'sam_open_format({}_stdout_fn, mode_w'.format(basename)),
lines = lines.replace(
SPECIFIC_SUBSTITUTIONS[fn][0],
SPECIFIC_SUBSTITUTIONS[fn][1])
+ if fn == "bamtk.c":
+ lines = re.sub(r'(#include "version.h")', r'\1\n#include "samtools_config_vars.h"', lines)
+ lines = re.sub(r'(else if.*"tview")', r'//\1', lines)
+
outfile.write(lines)
with open(os.path.join("import", "pysam.h")) as inf, \
outf.write(line)
os.rename(tmpfilename, filename)
+ def _update_version_doc_file(dest, value, filename):  # rewrite "<dest>-<version>" mentions in the text file `filename` so that they read "<dest>-<value>"
+ tmpfilename = filename + ".tmp"  # the edited copy is written here first
+ with open(filename, encoding="utf-8") as inf:
+ with open(tmpfilename, "w", encoding="utf-8") as outf:
+ for line in inf:
+ if " wraps " in line:  # only lines containing " wraps " are edited; every other line is copied through unchanged
+ # hide the sentence's full stop from the main regexp: the version match below stops only at '*' or ',', so a trailing '.' would otherwise be consumed
+ line = re.sub(r'\.$', ',DOT', line)
+ line = re.sub(r'{}-[^*,]*'.format(dest),  # matches "<dest>-" plus everything up to the next '*' or ','
+ '{}-{}'.format(dest, value), line)
+ line = re.sub(',DOT', '.', line)  # restore the full stop hidden above
+ outf.write(line)
+ os.rename(tmpfilename, filename)  # replace the original file with the edited copy
+
version = _getVersion(srcdir)
_update_version_file("__{}_version__".format(dest), version, "pysam/version.py")
_update_version_file(C_VERSION[dest], version + " (pysam)", "pysam/version.h")
+ _update_version_doc_file(dest, version, "README.rst")
+ _update_version_doc_file(dest, version, "doc/index.rst")
sys.exit(0)
fi # if-OS
} # install_os_packages
-# funcion to install Python dependencies
+# function to install Python dependencies
install_python_deps() {
if [ "$OS" == "ubuntu" -o "$OS" == "sl" ] ; then
# problems in the compilation test.
cd tests
-# create auxilliary data
+# create auxiliary data
echo
echo 'building test data'
echo
make -C pysam_data all
make -C cbcf_data all
+make -C tabix_data all
# run nosetests
# -s: do not capture stdout, conflicts with pysam.dispatch
# pin versions, so that tests do not fail when pysam/htslib out of step
# add htslib dependencies
-# NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488
-conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses
+# NB: force conda-forge::blas (openblas build) due to conda/conda#7548
+conda install -y "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" xz curl bzip2 "conda-forge::blas=*=openblas"
# As HTSLIB_MODE is (defaulted to) 'shared', ensure we don't pick up
# the external headers from the Conda-installed htslib package.
echo
python setup.py install || exit
-# create auxilliary data
+# create auxiliary data
echo
echo 'building test data'
echo
make -C tests/pysam_data
make -C tests/cbcf_data
+make -C tests/tabix_data
# echo any limits that are in place
ulimit -a
-======================================================
-pysam - An interface for reading and writing SAM files
-======================================================
-
+============
Introduction
============
represents a single read along with its fields and optional tags::
for read in samfile.fetch('chr1', 100, 120):
- print read
+ print read
samfile.close()
samfile = pysam.AlignmentFile("ex1.bam", "rb")
pairedreads = pysam.AlignmentFile("allpaired.bam", "wb", template=samfile)
for read in samfile.fetch():
- if read.is_paired:
- pairedreads.write(read)
+ if read.is_paired:
+ pairedreads.write(read)
pairedreads.close()
samfile.close()
The pysam website containing documentation
+===
API
===
SAM/BAM/CRAM files
--------------------
+==================
Objects of type :class:`~pysam.AlignmentFile` allow working with
BAM/SAM formatted files.
Tabix files
------------
+===========
:class:`~pysam.TabixFile` opens tabular files that have been
indexed with tabix_.
:members:
-Fasta files
------------
+FASTA files
+===========
.. autoclass:: pysam.FastaFile
:members:
-Fastq files
------------
+FASTQ files
+===========
.. autoclass:: pysam.FastxFile
:members:
:members:
-VCF files
----------
+VCF/BCF files
+=============
.. autoclass:: pysam.VariantFile
:members:
:members:
HTSFile
--------
+=======
HTSFile is the base class for :class:`pysam.AlignmentFile` and
:class:`pysam.VariantFile`.
+.. _Benchmarking:
+
============
Benchmarking
============
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys, os, glob
+import sys, os, sysconfig
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-_libdir = "../build/lib.%s-%s-%s.%s" % (os.uname()[0].lower(), os.uname()[4],
- sys.version_info[0], sys.version_info[1])
+_pyversion = sysconfig.get_python_version()
+_libdir = "../build/lib.%s-%s" % (sysconfig.get_platform(), _pyversion)
if os.path.exists(_libdir):
sys.path.insert(0, os.path.abspath(_libdir))
'sphinx.ext.intersphinx',
'sphinx.ext.napoleon']
-intersphinx_mapping = {'python': ('http://docs.python.org/3.5', None)}
+intersphinx_mapping = {'python': ('https://docs.python.org/%s' % _pyversion, None)}
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# General information about the project.
project = u'pysam'
-copyright = u'2009, Andreas Heger, Kevin Jacobs et al.'
+copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al'
# Included at the end of each rst file
rst_epilog = '''
.. _Galaxy: https://main.g2.bx.psu.edu/
.. _cython: http://cython.org/
.. _python: http://python.org/
+.. _pypi: https://pypi.org/
+.. _pip: https://pip.pypa.io/
.. _pyximport: http://www.prescod.net/pyximport/
.. _conda: https://conda.io/docs/
.. _bioconda: https://bioconda.github.io/
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'pysam.tex', ur'pysam documentation',
- ur'Andreas Heger, Kevin Jacobs et al.', 'manual'),
+ ('index', 'pysam.tex', u'pysam documentation',
+ u'Andreas Heger, Kevin Jacobs, et al.', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
Code specific to pysam
:file:`doc`
- The documentation. To build the latest documention type::
+ The documentation. To build the latest documentation type::
make -C doc html
pytest tests
+Most tests use test data from the :file:`tests/*_data` directories.
+Some of these test data files are generated from other files in these
+directories, which is done by running ``make`` in each directory::
+
+ make -C tests/pysam_data
+ # etc
+
+Alternatively if any :file:`tests/*_data/all.stamp` file is not already
+present, running the unit tests should generate that directory's data
+files automatically.
+
Benchmarking
============
How should I cite pysam
=======================
-Pysam has not been published in print. When refering pysam, please
+Pysam has not been published in print. When referring to pysam, please
use the github URL: https://github.com/pysam-developers/pysam.
As pysam is a wrapper around htslib and the samtools package, I
-suggest cite `Li et al (2009) <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`.
+suggest citing [Li.2009]_, [Bonfield.2021]_, and/or [Danecek.2021]_,
+as appropriate.
Is pysam thread-safe?
=====================
Pysam is a mix of python and C code. Instructions within python are
generally made thread-safe through python's `global interpreter lock`_
-(GIL_). This ensures that python data structures will always be in a
+(:dfn:`GIL`). This ensures that python data structures will always be in a
consistent state.
If an external function outside python is called, the programmer has a
called. This will allow other threads to run concurrently. This can be
beneficial if the external function is expected to halt, for example
when waiting for data to read or write. However, to achieve
-thread-safety, the external function needs to implememented with
+thread-safety, the external function needs to be implemented with
thread-safety in mind. This means that there can be no shared state
between threads, or if there is shared, it needs to be controlled to
prevent any access conflicts.
parts have not been fully tested.
A related issue is when different threads read from the same file
-objec - or the same thread uses two iterators over a file. There is
+object - or the same thread uses two iterators over a file. There is
only a single file-position for each opened file. To prevent this from
hapeding, use the option ``multiple_iterator=True`` when calling
a fetch() method. This will return an iterator on a newly opened
:meth:`~pysam.AlignmentFile.fetch` requires an index when
iterating over a SAM/BAM file. To iterate over a file without
-index, use the ``until_eof=True`::
+index, use the ``until_eof=True``::
bf = pysam.AlignmentFile(fname, "rb")
for r in bf.fetch(until_eof=True):
Binary SAM format. BAM files are binary formatted, indexed and
allow random access.
+ CRAM
+ CRAM is a binary format representing the same sequence alignment
+ information as SAM and BAM, but offering significantly better
+ lossless compression than BAM.
+
TAM
Text SAM file. TAM files are human readable files of
tab-separated fields. TAM files do not allow random access.
BCF
Binary :term:`VCF`
+ FASTA
+ Simple text format containing sequence data, with only the bare
+ minimum of metadata. Typically used for reference sequence data.
+
+ FASTQ
+ Simple text format containing sequence data and associated base
+ qualities.
+
tabix
Utility in the htslib package to index :term:`bgzip` compressed
files.
using cython and a high-level, pythonic API for convenient access to
the data within genomic file formats.
-The current version wraps *htslib-1.10.2*, *samtools-1.10* and
-*bcftools-1.10.2*.
+The current version wraps *htslib-1.13*, *samtools-1.13*, and *bcftools-1.13*.
To install the latest release, type::
References
----------
-.. [Li2009] The Sequence Alignment/Map format and SAMtools. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup.
- Bioinformatics. 2009 Aug 15;25(16):2078-9. Epub 2009 Jun 8.
- `PMID: 19505943 <http://www.ncbi.nlm.nih.gov/pubmed/19505943?dopt=Abstract>`_
+.. [Li.2009] *The Sequence Alignment/Map format and SAMtools.*
+ Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup.
+ Bioinformatics. 2009 Aug 15;25(16):2078-9.
+ Epub 2009 Jun 8 `btp352 <https://doi.org/10.1093/bioinformatics/btp352>`_.
+ PMID: `19505943 <https://pubmed.ncbi.nlm.nih.gov/19505943>`_.
+
+.. [Bonfield.2021] *HTSlib: C library for reading/writing high-throughput sequencing data.*
+ Bonfield JK, Marshall J, Danecek P, Li H, Ohan V, Whitwham A, Keane T, Davies RM.
+ GigaScience (2021) 10(2) `giab007 <https://doi.org/10.1093/gigascience/giab007>`_.
+ PMID: `33594436 <https://pubmed.ncbi.nlm.nih.gov/33594436>`_.
+
+.. [Danecek.2021] *Twelve years of SAMtools and BCFtools.*
+ Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H.
+ GigaScience (2021) 10(2) `giab008 <https://doi.org/10.1093/gigascience/giab008>`_.
+ PMID: `33590861 <https://pubmed.ncbi.nlm.nih.gov/33590861>`_.
.. seealso::
dependencies (`libcurl`, `libcrypto`), it will fall back to
conservative defaults.
-Options can be passed to the configure script explicitely by
+Options can be passed to the configure script explicitly by
setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
For example::
Release notes
=============
+Release 0.17.0
+==============
+
+This release wraps htslib/samtools/bcftools version 1.13. Corresponding
+to new samtools commands, `pysam.samtools` now has additional functions
+`ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
+
+Bugs fixed:
+
+* [#447] The maximum QNAME length is fully restored to 254
+* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors
+* [#603] count_coverage: ignore reads that have no SEQ field
+* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault
+* [#983] Add win32/\*.[ch] to MANIFEST.in
+* [#994] Raise exception in ``get_tid()`` if header could not be parsed
+* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi
+* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases
+* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax
+* [#1035] Improved handling of file iteration errors
+* [#1038] ``tabix_index()`` no longer leaks file descriptors
+* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value
+ (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based)
+* *setup.py* no longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0
+
+New facilities:
+
+* [PR #963] Additional VCF classes are exposed to pysam programmers
+* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion
+* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize
+* Running tests with ``pytest`` now automatically runs ``make`` to generate test data
+
+Documentation improvements:
+
+* [#726] Clarify get_forward_sequence/get_forward_qualities documentation
+* [#865] Improved example
+* [#968] ``get_index_statistics`` parameters
+* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open.
+* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation
+* [#999] Fix documentation for ``AlignmentFile.get_reference_length()``
+* [#1002] Document the default min_base_quality for ``pileup()``
+
+
Release 0.16.0
==============
The rationale for this change is to have consistency between
AlignmentFile and VariantFile.
-
+
* AlignmentFile and FastaFile now raise IOError instead of OSError
Medium term we plan to have a 1.0 release. The pysam
* [#473] A new FastxRecord class that can be instantiated from class and
modified in-place. Replaces PersistentFastqProxy.
* [#521] In AligmentFile, Simplify file detection logic and allow remote index files
+
* Removed attempts to guess data and index file names; this is magic left
to htslib.
* Removed file existence check prior to opening files with htslib
* Allow remote indices (tested using S3 signed URLs).
* Document filepath_index and make it an alias for index_filename.
* Added a require_index parameter to AlignmentFile
+
* [#526] handle unset ref when creating new records
* [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
segfaults
header = pysam.AlignmentHeader(
reference_names=["chr1", "chr2"],
reference_lengths=[1000, 1000])
-
+
read = pysam.AlignedSegment(header)
This will affect all code that instantiates AlignedSegment objects
==============
This release wraps htslib/samtools/bcfools versions 1.4.1 in response
-to a security fix in these libraries. Additionaly the following
+to a security fix in these libraries. Additionally the following
issues have been fixed:
* [#452] add GFF3 support for tabix parsers
--------
The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enchancements and bugfixes. See below for a detailed list.
+enhancements and bugfixes. See below for a detailed list.
`Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
comes with additional capabilities for remote file access which depend
and code bloat.
* run configure for the builtin htslib library in order to detect
optional libraries such as libcurl. Configure behaviour can be
- controlled by setting the environmet variable
+ controlled by setting the environment variable
HTSLIB_CONFIGURE_OPTIONS.
* get_reference_sequence() now returns the reference sequence and not
something looking like it. This bug had effects on
* renamed several methods for pep8 compatibility, old names still retained for
backwards compatibility, but should be considered deprecated.
+
* gettid() is now get_tid()
* getrname() is now get_reference_name()
* parseRegion() is now parse_region()
* some methods have changed for pep8 compatibility without the old
names being present:
+
* fromQualityString() is now qualitystring_to_array()
* toQualityString() is now qualities_to_qualitystring()
-
+
* faidx now returns strings and not binary strings in py3.
* The cython components have been broken up into smaller files with
with reading and writing capability. However, the interface is still
incomplete and preliminary and lacks capability to mutate the
resulting data.
-
+
Release 0.8.1
=============
* issue #19: multiple iterators can now be made to work on the same tabix file
* issue #24: All strings returned from/passed to the pysam API are now unicode in python 3
* issue #5: type guessing for lists of integers fixed
-
+
* API changes for consistency. The old API is still present,
but deprecated.
In particular:
Backwards incompatible changes
-* Empty cigarstring now returns None (intstead of '')
+* Empty cigarstring now returns None (instead of '')
* Empty cigar now returns None (instead of [])
* When using the extension classes in cython modules, AlignedRead
needs to be substituted with AlignedSegment.
Release 0.7.4
=============
-
+
* further bugfixes to setup.py and package layout
Release 0.7.3
=============
-
+
* further bugfixes to setup.py
* upgraded distribute_setup.py to 0.6.34
Release 0.7.2
=============
-
+
* bugfix in installer - failed when cython not present
* changed installation locations of shared libraries
print (rec.pos)
but also to complex attributes such as the contents to the
-:term:`info`, :term:`format` and :term:`genotype` columns. These
+:class:`~pysam.VariantRecord.info`, :class:`~pysam.VariantRecord.format`
+and :term:`genotype` columns. These
complex attributes are views on the underlying htslib data structures
and provide dictionary-like access to the data::
#include <ctype.h>
#include <assert.h>
#include <unistd.h>
+#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
return putc('\n', @pysam@_stdout);
}
+
+static jmp_buf @pysam@_jmpbuf;
+static int @pysam@_status = 0;
+
+int @pysam@_dispatch(int argc, char *argv[])
+{
+ if (setjmp(@pysam@_jmpbuf) == 0)
+ return @pysam@_main(argc, argv);
+ else
+ return @pysam@_status;
+}
+
+void @pysam@_exit(int status)
+{
+ @pysam@_status = status;
+ longjmp(@pysam@_jmpbuf, 1);
+}
+
+
void @pysam@_set_optind(int val)
{
// setting this in cython via
#include <stdio.h>
+#ifndef __has_attribute
+#define __has_attribute(attribute) 0
+#endif
+#ifndef PYSAM_NORETURN
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN
+#endif
+#endif
+
extern FILE * @pysam@_stderr;
extern FILE * @pysam@_stdout;
int @pysam@_dispatch(int argc, char *argv[]);
+void PYSAM_NORETURN @pysam@_exit(int status);
+
void @pysam@_set_optind(int);
extern int @pysam@_main(int argc, char *argv[]);
+++ /dev/null
-raise ImportError('''calling "import pysam" from the source directory is not supported - please import pysam from somewhere else.''')
from pysam.libcfaidx import *
import pysam.libctabix as libctabix
from pysam.libctabix import *
-# import pysam.libctabixproxies as libctabixproxies
-# from pysam.libctabixproxies import *
+import pysam.libctabixproxies as libctabixproxies
+from pysam.libctabixproxies import *
import pysam.libcsamfile as libcsamfile
from pysam.libcsamfile import *
import pysam.libcalignmentfile as libcalignmentfile
cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
+ cdef const bam_pileup1_t ** plp
cdef int tid
cdef int pos
cdef int n_pu
AlignmentHeader header)
cdef PileupColumn makePileupColumn(
- bam_pileup1_t ** plp,
+ const bam_pileup1_t ** plp,
int tid,
int pos,
int n_pu,
char * reference_sequence,
AlignmentHeader header)
-cdef PileupRead makePileupRead(bam_pileup1_t * src,
+cdef PileupRead makePileupRead(const bam_pileup1_t * src,
AlignmentHeader header)
cdef uint32_t get_alignment_length(bam1_t * src)
return toupper(ch)
-cdef inline bint pileup_base_qual_skip(bam_pileup1_t * p, uint32_t threshold):
+cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
cdef uint32_t c
if p.qpos < p.b.core.l_qseq:
c = bam_get_qual(p.b)[p.qpos]
cdef class PileupColumn
-cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp,
+cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
int tid,
int pos,
int n_pu,
cdef class PileupRead
-cdef PileupRead makePileupRead(bam_pileup1_t *src,
+cdef PileupRead makePileupRead(const bam_pileup1_t *src,
AlignmentHeader header):
'''return a PileupRead object construted from a bam_pileup1_t * object.'''
# note that the following does not call __init__
# Check if MD tag is valid by matching CIGAR length to MD tag defined length
# Insertions would be in addition to what is described by MD, so we calculate
- # the number of insertions seperately.
+ # the number of insertions separately.
cdef int insertions = 0
while s[s_idx] != 0:
# requires a valid header.
return "\t".join(map(str, (self.query_name,
self.flag,
- self.reference_id,
- self.reference_start,
+ "#%d" % self.reference_id if self.reference_id >= 0 else "*",
+ self.reference_start + 1,
self.mapping_quality,
self.cigarstring,
- self.next_reference_id,
- self.next_reference_start,
- self.query_alignment_length,
+ "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*",
+ self.next_reference_start + 1,
+ self.template_length,
self.query_sequence,
self.query_qualities,
self.tags)))
if qname is None or len(qname) == 0:
return
- # See issue #447
- # (The threshold is 252 chars, but this includes a \0 byte.
- if len(qname) > 251:
- raise ValueError("query length out of range {} > 251".format(
+ if len(qname) > 254:
+ raise ValueError("query length out of range {} > 254".format(
len(qname)))
qname = force_bytes(qname)
read.query_squence = read.query_sequence[5:10]
read.query_qualities = q[5:10]
- The sequence is returned as it is stored in the BAM file. Some mappers
- might have stored a reverse complement of the original read
- sequence.
+ The sequence is returned as it is stored in the BAM file. (This will
+ be the reverse complement of the original read sequence if the mapper
+ has aligned the read to the reverse strand.)
"""
def __get__(self):
if self.cache_query_sequence:
def __set__(self, val):
pysam_update_flag(self._delegate, val, BAM_FUNMAP)
# setting the unmapped flag requires recalculation of
- # bin as alignment length is now implicitely 1
+ # bin as alignment length is now implicitly 1
update_bin(self._delegate)
property mate_is_unmapped:
def get_forward_sequence(self):
"""return the original read sequence.
- Reads mapping to the reverse strand will be reverse
- complemented.
+ Reads mapped to the reverse strand are stored reverse complemented in
+ the BAM file. This method returns such reads reverse complemented back
+ to their original orientation.
Returns None if the record has no query sequence.
"""
return s
def get_forward_qualities(self):
- """return base qualities of the read sequence.
+ """return the original base qualities of the read sequence,
+ in the same format as the :attr:`query_qualities` property.
- Reads mapping to the reverse strand will be reversed.
+ Reads mapped to the reverse strand have their base qualities stored
+ reversed in the BAM file. This method returns such reads' base qualities
+ reversed back to their original orientation.
"""
if self.is_reverse:
return self.query_qualities[::-1]
*value*.
An existing value of the same *tag* will be overwritten unless
- *replace* is set to False. This is usually not recommened as a
+ *replace* is set to False. This is usually not recommended as a
tag may only appear once in the optional alignment section.
If *value* is None, the tag will be deleted.
return value
def get_tags(self, with_value_type=False):
- """the fields in the optional aligment section.
+ """the fields in the optional alignment section.
Returns a list of all fields in the optional
alignment section. Values are converted to appropriate python
raise ValueError("PileupColumn accessed after iterator finished")
cdef int x
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
pileups = []
# warning: there could be problems if self.n and self.buf are
cdef uint32_t x = 0
cdef uint32_t c = 0
cdef uint32_t cnt = 0
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
if self.plp == NULL or self.plp[0] == NULL:
raise ValueError("PileupColumn accessed after iterator finished")
mark_matches: bool
- If True, output bases matching the reference as "," or "."
+ If True, output bases matching the reference as "." or ","
for forward and reverse strand, respectively. This mark
requires the reference sequence. If no reference is
present, this option is ignored.
cdef uint8_t cc = 0
cdef uint8_t rb = 0
cdef kstring_t * buf = &self.buf
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
if self.plp == NULL or self.plp[0] == NULL:
raise ValueError("PileupColumn accessed after iterator finished")
list: a list of quality scores
"""
cdef uint32_t x = 0
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
cdef uint32_t c = 0
result = []
for x from 0 <= x < self.n_pu:
raise ValueError("PileupColumn accessed after iterator finished")
cdef uint32_t x = 0
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
raise ValueError("PileupColumn accessed after iterator finished")
cdef uint32_t x = 0
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
raise ValueError("PileupColumn accessed after iterator finished")
cdef uint32_t x = 0
- cdef bam_pileup1_t * p = NULL
+ cdef const bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
cpdef int write(self, AlignedSegment read) except -1
-cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
- cdef int tid
- cdef int pos
- cdef int n_pu
-
-
-cdef class PileupRead:
- cdef AlignedSegment _alignment
- cdef int32_t _qpos
- cdef int _indel
- cdef int _level
- cdef uint32_t _is_del
- cdef uint32_t _is_head
- cdef uint32_t _is_tail
- cdef uint32_t _is_refskip
-
-
cdef class IteratorRow:
cdef int retval
cdef bam1_t * b
cdef int pos
cdef int n_plp
cdef uint32_t min_base_quality
- cdef bam_pileup1_t * plp
+ cdef const bam_pileup1_t * plp
cdef bam_mplp_t pileup_iter
cdef __iterdata iterdata
cdef AlignmentFile samfile
########################################################
## global variables
# maximum genomic coordinace
-# for some reason, using 'int' causes overlflow
+# for some reason, using 'int' causes overflow
cdef int MAX_POS = (1 << 31) - 1
# valid types for SAM headers
return header
+def read_failure_reason(code):
+ if code == -2:
+ return 'truncated file'
+ else:
+ return "error {} while reading file".format(code)
+
# the following should be class-method for VariantHeader, but cdef @classmethods
# are not implemented in cython.
returns -1 if reference is not known.
"""
reference = force_bytes(reference)
- return bam_name2id(self.ptr, reference)
+ tid = bam_name2id(self.ptr, reference)
+ if tid < -1:
+ raise ValueError('could not parse header')
+ return tid
def __str__(self):
'''string with the full contents of the :term:`sam file` header as a
See :meth:`~pysam.HTSFile.parse_region` for more information
on how genomic regions can be specified. :term:`reference` and
- `end` are also accepted for backward compatiblity as synonyms
+ `end` are also accepted for backward compatibility as synonyms
for :term:`contig` and `stop`, respectively.
Without a `contig` or `region` all mapped reads in the file
"""perform a :term:`pileup` within a :term:`region`. The region is
specified by :term:`contig`, `start` and `stop` (using
0-based indexing). :term:`reference` and `end` are also accepted for
- backward compatiblity as synonyms for :term:`contig` and `stop`,
+ backward compatibility as synonyms for :term:`contig` and `stop`,
respectively. Alternatively, a samtools 'region' string
can be supplied.
By default, the samtools pileup engine outputs all reads
overlapping a region. If truncate is True and a region is
- given, only columns in the exact region specificied are
+ given, only columns in the exact region specified are
returned.
max_depth : int
min_base_quality: int
Minimum base quality. Bases below the minimum quality will
- not be output.
+ not be output. The default is 13.
adjust_capq_threshold: int
The region is specified by :term:`contig`, `start` and `stop`.
:term:`reference` and `end` are also accepted for backward
- compatiblity as synonyms for :term:`contig` and `stop`,
+ compatibility as synonyms for :term:`contig` and `stop`,
respectively. Alternatively, a :term:`samtools` :term:`region`
string can be supplied.
The region is specified by :term:`contig`, `start` and `stop`.
:term:`reference` and `end` are also accepted for backward
- compatiblity as synonyms for :term:`contig` and `stop`,
+ compatibility as synonyms for :term:`contig` and `stop`,
respectively. Alternatively, a :term:`samtools` :term:`region`
string can be supplied. The coverage is computed per-base [ACGT].
# count
seq = read.seq
+ if seq is None:
+ continue
quality = read.query_qualities
for qpos, refpos in read.get_aligned_pairs(True):
property nocoordinate:
"""int with total number of reads without coordinates according to the
- statistics recorded in the index. This is a read-only attribute.
+ statistics recorded in the index, i.e., the statistic printed for "*"
+ by the ``samtools idxstats`` command. This is a read-only attribute.
"""
def __get__(self):
self.check_index()
def get_index_statistics(self):
"""return statistics about mapped/unmapped reads per chromosome as
- they are stored in the index.
+ they are stored in the index, similarly to the statistics printed
+ by the ``samtools idxstats`` command.
Returns:
list :
def __next__(self):
cdef int ret = self.cnext()
- if (ret >= 0):
+ if ret >= 0:
return makeAlignedSegment(self.b, self.header)
- elif ret == -2:
- raise IOError('truncated file')
- else:
+ elif ret == -1:
raise StopIteration
+ else:
+ raise IOError(read_failure_reason(ret))
###########################################
# methods/properties referencing the header
def get_reference_length(self, reference):
"""
- return :term:`reference` name corresponding to numerical :term:`tid`
+ return :term:`reference` length corresponding to numerical :term:`tid`
"""
if self.header is None:
raise ValueError("header not available in closed files")
if ret >= 0:
self.current_row += 1
return makeAlignedSegment(self.b, self.header)
- elif ret == -2:
- raise IOError('truncated file')
- else:
+ elif ret == -1:
raise StopIteration
+ else:
+ raise IOError(read_failure_reason(ret))
cdef class IteratorRowAll(IteratorRow):
cdef int ret = self.cnext()
if ret >= 0:
return makeAlignedSegment(self.b, self.header)
- elif ret == -2:
- raise IOError('truncated file')
- else:
+ elif ret == -1:
raise StopIteration
+ else:
+ raise IOError(read_failure_reason(ret))
cdef class IteratorRowAllRefs(IteratorRow):
self.rowiter = IteratorRowRegion(self.samfile,
self.tid,
0,
- 1<<29)
+ MAX_POS)
# set htsfile and header of the rowiter
# to the values in this iterator to reflect multiple_iterators
self.rowiter.htsfile = self.htsfile
cdef int ret = self.cnext()
if ret >= 0:
return makeAlignedSegment(self.b, self.header)
- elif ret == -2:
- raise IOError('truncated file')
- else:
+ elif ret == -1:
raise StopIteration
+ else:
+ raise IOError(read_failure_reason(ret))
cdef int __advance_nofilter(void *data, bam1_t *b):
For reasons of efficiency, the iterator points to the current
pileup buffer. The pileup buffer is updated at every iteration.
- This might cause some unexpected behavious. For example,
+ This might cause some unexpected behaviour. For example,
consider the conversion to a list::
f = AlignmentFile("file.bam", "rb")
# reset in order to avoid memory leak messages for iterators
# that have not been fully consumed
self._free_pileup_iter()
- self.plp = <bam_pileup1_t*>NULL
+ self.plp = <const bam_pileup1_t*>NULL
if self.iterdata.seq != NULL:
free(self.iterdata.seq)
cdef class IndexedReads:
- """*(AlignmentFile samfile, multiple_iterators=True)
-
- Index a Sam/BAM-file by query name while keeping the
+ """Index a Sam/BAM-file by query name while keeping the
original sort order intact.
The index is kept in memory and can be substantial.
__all__ = ['VariantFile',
'VariantHeader',
'VariantHeaderRecord',
+ 'VariantHeaderRecords',
+ 'VariantMetadata',
+ 'VariantHeaderMetadata',
+ 'VariantContig',
+ 'VariantHeaderContigs',
+ 'VariantHeaderSamples',
+ 'VariantRecordFilter',
+ 'VariantRecordFormat',
+ 'VariantRecordInfo',
+ 'VariantRecordSamples',
+ 'VariantRecord',
+ 'VariantRecordSample',
+ 'BaseIndex',
+ 'BCFIndex',
+ 'TabixIndex',
+ 'BaseIterator',
+ 'BCFIterator',
+ 'TabixIterator',
'VariantRecord']
########################################################################
########################################################################
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
########################################################################
########################################################################
cdef int comb(int n, int k) except -1:
- """Return binomial coeffient: n choose k
+ """Return binomial coefficient: n choose k
>>> comb(5, 1)
5
else:
# Otherwise, copy the entire block
b = datac[:n]
- value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
+ value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
# causes a memory leak https://github.com/pysam-developers/pysam/issues/773
# return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
if (r.d.m_id == 0):
- raise ValueError('Error extracing ID')
+ raise ValueError('Error extracting ID')
return charptr_to_str(r.d.id) if r.d.id != b'.' else None
@id.setter
pass
-# Interal function to clean up after iteration stop or failure.
+# Internal function to clean up after iteration stop or failure.
# This would be a nested function if it weren't a cdef function.
cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
bcf_destroy1(record)
try:
rid = index.refmap[contig]
except KeyError:
- # A query for a non-existant contig yields an empty iterator, does not raise an error
+ # A query for a non-existent contig yields an empty iterator, does not raise an error
self.iter = NULL
return
try:
rid = index.refmap[contig]
except KeyError:
- # A query for a non-existant contig yields an empty iterator, does not raise an error
+ # A query for a non-existent contig yields an empty iterator, does not raise an error
self.iter = NULL
return
return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
- """fetch records in a :term:`region` using 0-based indexing. The
- region is specified by :term:`contig`, *start* and *end*.
- Alternatively, a samtools :term:`region` string can be supplied.
+ """fetch records in a :term:`region`, specified either by
+ :term:`contig`, *start*, and *end* (which are 0-based, half-open);
+ or alternatively by a samtools :term:`region` string (which is
+ 1-based inclusive).
Without *contig* or *region* all mapped records will be fetched. The
records will be returned ordered by contig, which will not necessarily
cdef extern from "bcftools.pysam.h":
- int bcftools_main(int argc, char *argv[])
+ int bcftools_dispatch(int argc, char *argv[])
void bcftools_set_stderr(int fd)
void bcftools_close_stderr()
void bcftools_set_stdout(int fd)
cdef class FastxFile:
- """Stream access to :term:`fasta` or :term:`fastq` formatted files.
+ r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.
The file is automatically opened.
... print(entry.quality)
>>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
... for entry in fin:
- ... fout.write(str(entry))
+ ... fout.write(str(entry) + '\n')
"""
def __cinit__(self, *args, **kwargs):
int SEEK_SET
# Return a virtual file pointer to the current location in the file.
- # No interpetation of the value should be made, other than a subsequent
+ # No interpretation of the value should be made, other than a subsequent
# call to bgzf_seek can be used to position the file at the same point.
# Return value is non-negative on success.
int64_t bgzf_tell(BGZF *fp)
# Read one line from a BGZF file. It is faster than bgzf_getc()
#
# @param fp BGZF file handler
- # @param delim delimitor
+ # @param delim delimiter
# @param str string to write to; must be initialized
# @return length of the string; 0 on end-of-file; negative on error
int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
ctypedef struct hts_md5_context
- # /*! @abstract Intialises an MD5 context.
+ # /*! @abstract Initialises an MD5 context.
# * @discussion
# * The expected use is to allocate an hts_md5_context using
# * hts_md5_init(). This pointer is then passed into one or more calls
# tbx.h definitions
int8_t TBX_MAX_SHIFT
- int8_t TBX_GENERIC
- int8_t TBX_SAM
- int8_t TBX_VCF
- int8_t TBX_UCSC
+ int32_t TBX_GENERIC
+ int32_t TBX_SAM
+ int32_t TBX_VCF
+ int32_t TBX_UCSC
ctypedef struct tbx_conf_t:
int32_t preset
# === Dictionary ===
#
- # The header keeps three dictonaries. The first keeps IDs in the
+ # The header keeps three dictionaries. The first keeps IDs in the
# "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
# in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
# is the actual hash table, which is opaque to the end users. In the hash
# be determined.
#
# The value of @which determines if existing INFO/AC,AN can be
- # used (BCF_UN_INFO) and and if indv fields can be splitted
- # (BCF_UN_FMT).
+ # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT).
int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
# bcf_gt_type() - determines type of the genotype
# the container, meaning multiple compression headers to manipulate.
# Changing RG may change the size of the compression header and
# therefore the length field in the container. Hence we rewrite all
- # blocks just incase and also emit the adjusted container.
+ # blocks just in case and also emit the adjusted container.
#
# The current implementation can only cope with renumbering a single
# RG (and only then if it is using HUFFMAN or BETA codecs). In
# 2 if the file is a stream and thus unseekable
# 1 if the file contains an EOF block
# 0 if the file does not contain an EOF block
- # -1 if an error occured whilst reading the file or we could not seek back to where we were
+ # -1 if an error occurred whilst reading the file or we could not seek back to where we were
#
#
int cram_check_EOF(cram_fd *fd)
cdef hFILE *fp
cdef readonly object name, mode
-    def __init__(self, name, mode='r', closedf=True):
-        self._open(name, mode, closefd=True)
+    def __init__(self, name, mode='r', closefd=True):
+        # Forward the caller's closefd choice; it used to be accepted but
+        # ignored, because True was always passed on to _open().
+        self._open(name, mode, closefd=closefd)
def __dealloc__(self):
rval = hts_opt_apply(self.htsfile, opts)
if rval != 0:
hts_opt_free(opts)
- raise RuntimeError('An error occured while applying the requested format options')
+ raise RuntimeError('An error occurred while applying the requested format options')
hts_opt_free(opts)
def parse_region(self, contig=None, start=None, stop=None,
either be specified by :term:`contig`, `start` and
`stop`. `start` and `stop` denote 0-based, half-open
intervals. :term:`reference` and `end` are also accepted for
- backward compatiblity as synonyms for :term:`contig` and
+ backward compatibility as synonyms for :term:`contig` and
`stop`, respectively.
Alternatively, a samtools :term:`region` string can be
cdef extern from "samtools.pysam.h":
- int samtools_main(int argc, char *argv[])
+ int samtools_dispatch(int argc, char *argv[])
void samtools_set_stderr(int fd)
void samtools_close_stderr()
void samtools_set_stdout(int fd)
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
-import binascii
import os
import sys
tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
tbx_destroy, hisremote, region_list, hts_getline, \
- TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, htsExactFormat, bcf, \
- bcf_index_build2
+ TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
+ no_compression, bcf, bcf_index_build2
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
index : string
The filename of the index. If not set, the default is to
- assume that the index is called ``filename.tbi`
+ assume that the index is called ``filename.tbi``
mode : char
The file opening mode. Currently, only ``r`` is permitted.
property contigs:
'''list of chromosome names'''
def __get__(self):
- cdef char ** sequences
+ cdef const char ** sequences
cdef int nsequences
with nogil:
raise IOError("error %i when closing file %s" % (r, filename_in))
-def is_gzip_file(filename):
- gzip_magic_hex = b'1f8b'
- fd = os.open(filename, os.O_RDONLY)
- header = os.read(fd, 2)
- return header == binascii.a2b_hex(gzip_magic_hex)
-
-
def tabix_index(filename,
force=False,
seq_col=None,
compressed. The original file will be removed and only the compressed
file will be retained.
- *min-shift* sets the minimal interval size to 1<<INT; 0 for the
- old tabix index. The default of -1 is changed inside htslib to
- the old tabix default of 0.
+ By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
+ is greater than zero and/or *csi* is True, creates a CSI index with a
+ minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
*index* controls the filename which should be used for creating the index.
If not set, the default is to append ``.tbi`` to *filename*.
- If *csi* is set, create a CSI index, the default is to create a
- TBI index.
-
When automatically compressing files, if *keep_original* is set the
uncompressed file will not be deleted.
'''
- if not os.path.exists(filename):
- raise IOError("No such file '%s'" % filename)
-
if preset is None and \
(seq_col is None or start_col is None or end_col is None):
raise ValueError(
"neither preset nor seq_col,start_col and end_col given")
- if not is_gzip_file(filename):
- tabix_compress(filename, filename + ".gz", force=force)
- if not keep_original:
- os.unlink(filename)
- filename += ".gz"
-
fn = encode_filename(filename)
cdef char *cfn = fn
cdef htsFile *fp = hts_open(cfn, "r")
- cdef htsExactFormat fmt = fp.format.format
+ if fp == NULL:
+ raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
+
+ cdef htsFormat fmt = hts_get_format(fp)[0]
hts_close(fp)
-
+
+ if fmt.compression == no_compression:
+ tabix_compress(filename, filename + ".gz", force=force)
+ if not keep_original:
+ os.unlink(filename)
+ filename += ".gz"
+ fn = encode_filename(filename)
+ cfn = fn
+
# columns (1-based):
# preset-code, contig, start, end, metachar for
# comments, lines to ignore at beginning
}
conf_data = None
- if preset == "bcf" or fmt == bcf:
+ if preset == "bcf" or fmt.format == bcf:
csi = True
- if min_shift == -1:
- min_shift = 14
elif preset:
try:
conf_data = preset2conf[preset]
if conf_data:
conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
- if csi:
+ if csi or min_shift > 0:
suffix = ".csi"
+ if min_shift <= 0: min_shift = 14
else:
suffix = ".tbi"
+ min_shift = 0
+
index = index or filename + suffix
fn_index = encode_filename(index)
cdef char *fnidx = fn_index
cdef int retval = 0
- if csi and fmt == bcf:
+ if csi and fmt.format == bcf:
with nogil:
retval = bcf_index_build2(cfn, fnidx, min_shift)
else:
cpdef qualities_to_qualitystring(qualities, int offset=*)
########################################################################
+## String encoding configuration facilities
########################################################################
+
+cpdef get_encoding_error_handler()
+cpdef set_encoding_error_handler(name)
+
########################################################################
## Python 3 compatibility functions
########################################################################
-cdef charptr_to_str(const char *s, encoding=*)
-cdef bytes charptr_to_bytes(const char *s, encoding=*)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
-cdef force_str(object s, encoding=*)
-cdef bytes force_bytes(object s, encoding=*)
+cdef charptr_to_str(const char *s, encoding=*, errors=*)
+cdef bytes charptr_to_bytes(const char *s, encoding=*, errors=*)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*, errors=*)
+cdef force_str(object s, encoding=*, errors=*)
+cdef bytes force_bytes(object s, encoding=*, errors=*)
+cdef decode_bytes(bytes s, encoding=*, errors=*)
cdef bytes encode_filename(object filename)
cdef from_string_and_size(const char *s, size_t length)
import os
import io
from contextlib import contextmanager
+from codecs import register_error
from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
from cpython cimport PyBytes_Check, PyUnicode_Check
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY
-from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
+from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind
-from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
+from libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \
bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind
#####################################################################
########################################################################
+## String encoding configuration facilities
########################################################################
+
+# Codec error handler that just interprets each bad byte as ISO-8859-1.
+def latin1_replace(exception):
+    """Codec error handler: decode the offending bytes as ISO-8859-1.
+
+    The whole range exception.start:exception.end is converted, because
+    decoding resumes at exception.end and the range reported by a decoder
+    may span more than one byte (e.g. a truncated multi-byte sequence).
+    Errors other than decoding errors are re-raised unchanged, as they
+    carry no bytes that could be reinterpreted.
+    """
+    if not isinstance(exception, UnicodeDecodeError):
+        raise exception
+    bad_bytes = exception.object[exception.start:exception.end]
+    return (bad_bytes.decode('latin-1'), exception.end)
+
+register_error('pysam.latin1replace', latin1_replace)
+
+
+cdef str ERROR_HANDLER = 'strict'
+
+cpdef get_encoding_error_handler():
+    """Return the name of the codec error handler that the string
+    conversion helpers in this module fall back to when no explicit
+    *errors* argument is given."""
+    return ERROR_HANDLER
+
+cpdef set_encoding_error_handler(name):
+    """Make *name* the default codec error handler and return the name of
+    the handler that was in effect before the call."""
+    global ERROR_HANDLER
+    replaced, ERROR_HANDLER = ERROR_HANDLER, name
+    return replaced
+
########################################################################
## Python 3 compatibility functions
########################################################################
cdef from_string_and_size(const char* s, size_t length):
if IS_PYTHON3:
- return s[:length].decode("utf8")
+ return s[:length].decode('utf-8', ERROR_HANDLER)
else:
return s[:length]
raise TypeError("Argument must be string or unicode.")
-cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
+cdef bytes force_bytes(object s, encoding=None, errors=None):
"""convert string or unicode object to bytes, assuming
utf8 encoding.
"""
elif PyBytes_Check(s):
return s
elif PyUnicode_Check(s):
- return s.encode(encoding)
+ return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
else:
raise TypeError("Argument must be string, bytes or unicode.")
-cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING):
+cdef charptr_to_str(const char* s, encoding=None, errors=None):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s
else:
- return s.decode(encoding)
+ return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING):
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s[:n]
else:
- return s[:n].decode(encoding)
+ return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
-cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING):
+cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None):
if s == NULL:
return None
else:
return s
-cdef force_str(object s, encoding=TEXT_ENCODING):
+cdef force_str(object s, encoding=None, errors=None):
"""Return s converted to str type of current Python
(bytes in Py2, unicode in Py3)"""
if s is None:
if PY_MAJOR_VERSION < 3:
return s
elif PyBytes_Check(s):
- return s.decode(encoding)
+ return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
else:
# assume unicode
return s
+cdef decode_bytes(bytes s, encoding=None, errors=None):
+    """Decode *s* to text on every Python version (Python 2 included).
+
+    None is passed through untouched.  A missing *encoding* or *errors*
+    falls back to TEXT_ENCODING and the module's current error handler.
+    """
+    if s is None:
+        return None
+    codec = encoding or TEXT_ENCODING
+    on_error = errors or ERROR_HANDLER
+    return s.decode(codec, on_error)
+
+
cpdef parse_region(contig=None,
start=None,
stop=None,
`end`. `start` and `end` denote 0-based, half-open intervals.
:term:`reference` and `end` are also accepted for backward
- compatiblity as synonyms for :term:`contig` and `stop`,
+ compatibility as synonyms for :term:`contig` and `stop`,
respectively.
Alternatively, a samtools :term:`region` string can be supplied.
if collection == b"samtools":
samtools_set_stdout(stdout_h)
samtools_set_stderr(stderr_h)
- retval = samtools_main(n + 2, cargs)
+ retval = samtools_dispatch(n + 2, cargs)
samtools_close_stdout()
samtools_close_stderr()
elif collection == b"bcftools":
bcftools_set_stdout(stdout_h)
bcftools_set_stderr(stderr_h)
- retval = bcftools_main(n + 2, cargs)
+ retval = bcftools_dispatch(n + 2, cargs)
bcftools_close_stdout()
bcftools_close_stderr()
return retval, out_stderr, out_stdout
-__all__ = ["qualitystring_to_array",
- "array_to_qualitystring",
- "qualities_to_qualitystring"]
+__all__ = [
+ "qualitystring_to_array",
+ "array_to_qualitystring",
+ "qualities_to_qualitystring",
+ "get_encoding_error_handler",
+ "set_encoding_error_handler",
+]
"quickcheck": ("quickcheck", None),
"split": ("split", None),
"flags": ("flags", None),
+ "ampliconclip": ("ampliconclip", None),
+ "ampliconstats": ("ampliconstats", None),
+ "version": ("version", None),
+ "fqimport": ("import", None),
}
# instantiate samtools commands as python functions
// Version information used while compiling samtools, bcftools, and htslib
-#define SAMTOOLS_VERSION "1.10 (pysam)"
-#define BCFTOOLS_VERSION "1.10.2 (pysam)"
-#define HTS_VERSION_TEXT "1.10.2 (pysam)"
+#define SAMTOOLS_VERSION "1.13 (pysam)"
+#define BCFTOOLS_VERSION "1.13 (pysam)"
+#define HTS_VERSION_TEXT "1.13 (pysam)"
# pysam versioning information
-__version__ = "0.16.0.1"
+__version__ = "0.17.0"
-__samtools_version__ = "1.10"
-__bcftools_version__ = "1.10.2"
-__htslib_version__ = "1.10.2"
+__samtools_version__ = "1.13"
+__bcftools_version__ = "1.13"
+__htslib_version__ = "1.13"
The MIT/Expat License
-Copyright (C) 2008-2019 Genome Research Ltd.
+Copyright (C) 2008-2021 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
- cd .../samtools-1.10 # Within the unpacked release directory
+ cd .../samtools-1.13 # Within the unpacked release directory
./configure
make
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
- cd .../samtools-1.10 # Within the unpacked release directory
+ cd .../samtools-1.13 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
- cd .../samtools-1.10 # Within the unpacked release directory
+ cd .../samtools-1.13 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
the source distribution instead of installing the package. In that case
you can use:
- cd .../samtools-1.10 # Within the unpacked release directory
- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.10
+ cd .../samtools-1.13 # Within the unpacked release directory
+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
It is recommended that you perform your own rigorous tests for an entire
pipeline if you wish to switch to one of the optimised zlib implementations.
+
+Citing
+======
+
+Please cite this paper when using SAMtools for your publications:
+
+Twelve years of SAMtools and BCFtools
+Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li
+GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008
+
+@article{10.1093/gigascience/giab008,
+ author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
+ title = "{Twelve years of SAMtools and BCFtools}",
+ journal = {GigaScience},
+ volume = {10},
+ number = {2},
+ year = {2021},
+ month = {02},
+ abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods. The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines. Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\>1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
+ issn = {2047-217X},
+ doi = {10.1093/gigascience/giab008},
+ url = {https://doi.org/10.1093/gigascience/giab008},
+ note = {giab008},
+ eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
+}
--- /dev/null
+/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+/*
+ * This tool is designed to give "samtools stats" style output, but dedicated
+ * to small amplicon sequencing projects. It gathers stats on the
+ * distribution of reads across amplicons.
+ */
+
+/*
+ * TODO:
+ * - Cope with multiple references. What do we do here? Just request one?
+ * - Permit regions rather than consuming whole file (maybe solves above).
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <math.h>
+
+#include <htslib/sam.h>
+#include <htslib/khash.h>
+
+#include "samtools.h"
+#include "sam_opts.h"
+#include "bam_ampliconclip.h"
+
+KHASH_MAP_INIT_INT64(tcoord, int64_t)
+KHASH_MAP_INIT_STR(qname, int64_t)
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>=0?(a):-(a))
+#endif
+
+#define TCOORD_MIN_COUNT 10
+#define MAX_AMP 1000 // Default maximum number of amplicons
+#define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon
+#define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT
+#define MAX_DEPTH 5 // Number of different depths permitted
+
+// Run-time settings shared by the ampliconstats helpers in this file.
+typedef struct {
+ sam_global_args ga;
+ uint32_t flag_require; // Reads missing any of these flag bits count as filtered
+ uint32_t flag_filter; // Reads with any of these flag bits set count as filtered
+ int max_delta; // Used for matching read to amplicon primer loc
+ int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
+ int use_sample_name;
+ int max_amp; // Total number of amplicons
+ int max_amp_len; // Maximum length of an individual amplicon
+ double depth_bin;// aggregate depth within this fraction
+ int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate
+ FILE *out_fp; // Stream the report (e.g. AMPLICON lines) is written to
+ char *argv;
+ int tcoord_min_count;
+ int tcoord_bin;
+ int multi_ref; // Non-zero adds a REF column to the AMPLICON lines
+} astats_args_t;
+
+// Accumulated statistics; allocated by stats_alloc(), zeroed for reuse by
+// stats_reset() and released by stats_free().
+typedef struct {
+ int nseq; // total sequence count
+ int nfiltered; // sequence filtered
+ int nfailprimer;// count of sequences not matching the primer locations
+
+ // Sizes of memory allocated below, to permit reset
+ int max_amp, max_amp_len, max_len;
+
+ // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
+ int64_t *nreads, *nreads2; // [max_amp]
+ double *nfull_reads; // [max_amp]; 0.5/read if paired.
+ double *nrperc, *nrperc2; // [max_amp]
+ int64_t *nbases, *nbases2; // [max_amp]
+ int64_t *coverage; // [max_amp][max_amp_len]
+ double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
+ double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
+ khash_t(tcoord) **tcoord; // [max_amp+1]
+
+ // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
+ int (*amp_dist)[3]; // [max_amp][3];
+
+ int *depth_valid; // [max_len]
+ int *depth_all; // [max_len]
+ // Keys are freed one by one in stats_free()/stats_reset(); each value
+ // packs a start in its low 32 bits and an end in its high 32 bits.
+ khash_t(qname) *qend; // queryname end, for overlap removal
+} astats_t;
+
+// We can have multiple primers for LEFT / RIGHT, so this
+// permits detection by any compatible combination.
+// One reference:
+typedef struct {
+ // Filled by bed2amplicon() with the BED "right" value of each forward
+ // (rev == 0) primer; at most MAX_PRIMER_PER_AMPLICON entries.
+ int64_t left[MAX_PRIMER_PER_AMPLICON];
+ int nleft;
+ // Filled by bed2amplicon() with the BED "left" value of each reverse
+ // primer; at most MAX_PRIMER_PER_AMPLICON entries.
+ int64_t right[MAX_PRIMER_PER_AMPLICON];
+ int nright;
+ // max_left/min_left are the largest/smallest forward-primer "right"+1;
+ // min_right/max_right the smallest/largest reverse-primer "left"-1.
+ int64_t max_left, min_right; // inner dimensions
+ int64_t min_left, max_right; // outer dimensions
+} amplicon_t;
+
+// Multiple references, we have an array of amplicons_t - one per used ref.
+// We have per reference local and global stats here, as some of the stats
+// are coordinate based. However we report them combined together as a single
+// list across all references.
+// "namp" is the number of amplicons in this reference, but they're
+// numbered first_amp to first_amp+namp-1 inclusively.
+typedef struct {
+ int tid, namp;
+ int64_t len; // Sizes the position lookup tables built for this reference
+ bed_entry_list_t *sites; // NULL entries are skipped by amp_stats_reset()
+ amplicon_t *amp;
+ // accumulate_stats() ignores reads on a reference whose lstats is NULL.
+ astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
+ const char *ref; // ref name (pointer to the bed hash table key)
+ int first_amp; // first amplicon number for this ref
+} amplicons_t;
+
+// Reinitialised for each new reference/chromosome.
+// Entries hold a 0-based amplicon index (0 to namp-1), or -1 for no match.
+static int *pos2start = NULL;
+static int *pos2end = NULL;
+static int pos2size = 0; // allocated size of pos2start/end
+
+// Lookup table to go from position to amplicon based on
+// read start / end.
+//
+// (Re)builds the file-scope pos2start / pos2end tables for reference
+// number "ref": every position within args->max_delta of a left primer
+// coordinate maps to that amplicon's index in pos2start, likewise for
+// right primer coordinates in pos2end.  All other positions hold -1.
+// Positions are 1-based here and stored at index p-1.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure (existing tables stay valid).
+static int initialise_amp_pos_lookup(astats_args_t *args,
+                                     amplicons_t *amps,
+                                     int ref) {
+    int64_t i, j;
+    amplicon_t *amp = amps[ref].amp;
+    int64_t max_len = amps[ref].len;
+    int namp = amps[ref].namp;
+
+    if (max_len+1 > pos2size) {
+        // Grow through temporaries: assigning realloc's result straight
+        // back would lose (and leak) the live table on failure and leave
+        // a NULL pointer behind a non-zero pos2size.
+        int *new_start = realloc(pos2start, (max_len+1)*sizeof(*pos2start));
+        if (!new_start)
+            return -1;
+        pos2start = new_start;
+
+        int *new_end = realloc(pos2end, (max_len+1)*sizeof(*pos2end));
+        if (!new_end)
+            return -1;
+        pos2end = new_end;
+
+        pos2size = max_len;
+    }
+    for (i = 0; i < max_len; i++)
+        pos2start[i] = pos2end[i] = -1;
+
+    for (i = 0; i < namp; i++) {
+        for (j = 0; j < amp[i].nleft; j++) {
+            int64_t p;
+            for (p = amp[i].left[j] - args->max_delta;
+                 p <= amp[i].left[j] + args->max_delta; p++) {
+                // Ignore the part of the window that falls off the reference
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2start[p-1] = i;
+            }
+        }
+        for (j = 0; j < amp[i].nright; j++) {
+            int64_t p;
+            for (p = amp[i].right[j] - args->max_delta;
+                 p <= amp[i].right[j] + args->max_delta; p++) {
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2end[p-1] = i;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Returns the number of amplicons described by a primer list.
+// Relies on the BED entries of each amplicon being ordered as forward
+// (rev == 0) primers followed by reverse primers: every switch from
+// reverse back to forward starts a new amplicon.  The first amplicon is
+// counted unconditionally, so the result is always at least 1.
+static int count_amplicon(bed_entry_list_t *sites) {
+    int idx = 0, switches = 0, prev_rev = 0;
+
+    while (idx < sites->length) {
+        if (prev_rev && sites->bp[idx].rev == 0)
+            switches++;
+        prev_rev = sites->bp[idx].rev;
+        idx++;
+    }
+
+    return switches + 1;
+}
+
+// We're only interested in the internal part of the amplicon.
+// Our bed file has LEFT start/end followed by RIGHT start/end,
+// so collapse these to LEFT end / RIGHT start.
+//
+// Fills amp[0..*namp-1] from the primer list, and writes one "AMPLICON"
+// line per amplicon to args->out_fp (preceded by a header if do_title is
+// non-zero; with the reference name and first_amp offset if
+// args->multi_ref is set).
+// NOTE(review): nleft/nright are only ever incremented here, so amp[] is
+// presumably handed in zero-initialised - confirm against the caller.
+//
+// Returns right most amplicon position on success,
+//         < 0 on error
+static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
+                            amplicon_t *amp, int *namp, int do_title,
+                            const char *ref, int first_amp) {
+    int i, j;
+    int64_t max_right = 0;
+    FILE *ofp = args->out_fp;
+
+    *namp = 0;
+
+    // Assume all primers for the same amplicon are adjacent in BED
+    // with all + followed by all -.  Thus - to + signifies next primer set.
+    int last_rev = 0;
+    amp[0].max_left = 0;
+    amp[0].min_right = INT64_MAX;
+    amp[0].min_left = INT64_MAX;
+    amp[0].max_right = 0;
+    if (do_title) {
+        fprintf(ofp, "# Amplicon locations from BED file.\n");
+        fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
+                "comma-separated for alt-primers.\n");
+        if (args->multi_ref)
+            fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
+        else
+            fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
+    }
+    for (i = j = 0; i < sites->length; i++) {
+        if (i == 0 && sites->bp[i].rev != 0) {
+            fprintf(stderr, "[ampliconstats] error: BED file should start"
+                    " with the + strand primer\n");
+            return -1;
+        }
+        if (sites->bp[i].rev == 0 && last_rev) {
+            // Reverse -> forward transition: start the next amplicon
+            j++;
+            if (j >= args->max_amp) {
+                fprintf(stderr, "[ampliconstats] error: too many amplicons"
+                        " (%d). Use -a option to raise this.\n", j);
+                return -1;
+            }
+            amp[j].max_left = 0;
+            amp[j].min_right = INT64_MAX;
+            amp[j].min_left = INT64_MAX;
+            amp[j].max_right = 0;
+        }
+        if (sites->bp[i].rev == 0) {
+            if (i == 0 || last_rev) {
+                if (j>0) fprintf(ofp, "\n");
+                if (args->multi_ref)
+                    fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
+                else
+                    fprintf(ofp, "AMPLICON\t%d", j+1);
+            }
+            if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
+                // Not an errno-related failure, so report it in the same
+                // way as the other diagnostics in this function.
+                fprintf(stderr, "[ampliconstats] error: too many primers"
+                        " per amplicon (%d)\n", MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].left[amp[j].nleft++] = sites->bp[i].right;
+            if (amp[j].max_left < sites->bp[i].right+1)
+                amp[j].max_left = sites->bp[i].right+1;
+            if (amp[j].min_left > sites->bp[i].right+1)
+                amp[j].min_left = sites->bp[i].right+1;
+            // BED file, so left+1 as zero based.  right(+1-1) as
+            // BED goes one beyond end (and we want inclusive range).
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        } else {
+            if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
+                fprintf(stderr, "[ampliconstats] error: too many primers"
+                        " per amplicon (%d)\n", MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].right[amp[j].nright++] = sites->bp[i].left;
+            if (amp[j].min_right > sites->bp[i].left-1)
+                amp[j].min_right = sites->bp[i].left-1;
+            if (amp[j].max_right < sites->bp[i].left-1) {
+                amp[j].max_right = sites->bp[i].left-1;
+                if (amp[j].max_right - amp[j].min_left + 1 >=
+                    args->max_amp_len) {
+                    fprintf(stderr, "[ampliconstats] error: amplicon "
+                            "longer (%d) than max_amp_len option (%d)\n",
+                            (int)(amp[j].max_right - amp[j].min_left + 2),
+                            args->max_amp_len);
+                    return -1;
+                }
+                if (max_right < amp[j].max_right)
+                    max_right = amp[j].max_right;
+            }
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        }
+        last_rev = sites->bp[i].rev;
+    }
+    if (last_rev != 1) {
+        fprintf(ofp, "\n"); // useful if going to stdout
+        fprintf(stderr, "[ampliconstats] error: bed file does not end on"
+                " a reverse strand primer.\n");
+        return -1;
+    }
+    *namp = ++j;    // j changes from last index to amplicon count
+    if (j) fprintf(ofp, "\n");
+
+    if (j >= args->max_amp) {
+        fprintf(stderr, "[ampliconstats] error: "
+                "too many amplicons (%d). Use -a option to raise this.\n", j);
+        return -1;
+    }
+
+//    for (i = 0; i < *namp; i++) {
+//      printf("%d\t%ld", i, amp[i].length);
+//      for (j = 0; j < amp[i].nleft; j++)
+//          printf("%c%ld", "\t,"[j>0], amp[i].left[j]);
+//      for (j = 0; j < amp[i].nright; j++)
+//          printf("%c%ld", "\t,"[j>0], amp[i].right[j]);
+//      printf("\n");
+//    }
+
+    return max_right;
+}
+
+// Releases an astats_t and everything it owns.  Accepts NULL, and also a
+// partially constructed object such as the one stats_alloc() passes in
+// when one of its allocations fails.
+void stats_free(astats_t *st) {
+    if (!st)
+        return;
+
+    free(st->nreads);
+    free(st->nreads2);
+    free(st->nfull_reads);
+    free(st->nrperc);
+    free(st->nrperc2);
+    free(st->nbases);
+    free(st->nbases2);
+    free(st->coverage);
+    free(st->covered_perc);
+    free(st->covered_perc2);
+    free(st->amp_dist);
+
+    free(st->depth_valid);
+    free(st->depth_all);
+
+    if (st->tcoord) {
+        int i;
+        for (i = 0; i <= st->max_amp; i++) {
+            if (st->tcoord[i])
+                kh_destroy(tcoord, st->tcoord[i]);
+        }
+        free(st->tcoord);
+    }
+
+    // qend is still NULL if stats_alloc() failed before creating it.
+    // kh_end() reads a field of the table, so only walk it when present.
+    if (st->qend) {
+        khiter_t k;
+        // The keys are freed individually before the table itself.
+        for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+            if (kh_exist(st->qend, k))
+                free((void *)kh_key(st->qend, k));
+        kh_destroy(qname, st->qend);
+    }
+
+    free(st);
+}
+
+// Allocates a zero-initialised astats_t sized for max_amp amplicons of up
+// to max_amp_len bases each, on a reference of max_len bases.
+//
+// Returns the new object on success (release it with stats_free()),
+//         NULL on memory allocation failure.
+astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
+    int i;
+    astats_t *st = calloc(1, sizeof(*st));
+    if (!st)
+        return NULL;
+
+    // Remembered so that stats_reset() knows how much to clear
+    st->max_amp = max_amp;
+    st->max_amp_len = max_amp_len;
+    st->max_len = max_len;
+
+    if (!(st->nreads  = calloc(max_amp, sizeof(*st->nreads))))  goto err;
+    if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
+    if (!(st->nrperc  = calloc(max_amp, sizeof(*st->nrperc))))  goto err;
+    if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
+    if (!(st->nbases  = calloc(max_amp, sizeof(*st->nbases))))  goto err;
+    if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
+
+    if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
+        goto err;
+
+    // Multiply in size_t: as plain ints the product can overflow once the
+    // limits are raised far enough, giving an undersized buffer.
+    if (!(st->coverage = calloc((size_t)max_amp * (size_t)max_amp_len,
+                                sizeof(*st->coverage))))
+        goto err;
+
+    if (!(st->covered_perc  = calloc(max_amp, sizeof(*st->covered_perc))))
+        goto err;
+    if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
+        goto err;
+
+    // One more hash than there are amplicons
+    if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
+    for (i = 0; i <= st->max_amp; i++)
+        if (!(st->tcoord[i] = kh_init(tcoord)))
+            goto err;
+
+    if (!(st->qend = kh_init(qname)))
+        goto err;
+
+    if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
+        goto err;
+    if (!(st->depth_all   = calloc(max_len, sizeof(*st->depth_all))))
+        goto err;
+
+    if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist))))  goto err;
+
+    return st;
+
+ err:
+    stats_free(st);
+    return NULL;
+}
+
+// Returns an astats_t to its freshly allocated state so that it can be
+// reused, keeping all memory (and the frequently seen tcoord keys).
+static void stats_reset(astats_t *st) {
+    st->nseq = 0;
+    st->nfiltered = 0;
+    st->nfailprimer = 0;
+
+    memset(st->nreads,  0, st->max_amp * sizeof(*st->nreads));
+    memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
+    memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
+
+    memset(st->nrperc,  0, st->max_amp * sizeof(*st->nrperc));
+    memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
+
+    memset(st->nbases,  0, st->max_amp * sizeof(*st->nbases));
+    memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
+
+    // Widen before multiplying: max_amp * max_amp_len evaluated as int
+    // can overflow when both limits are large.
+    memset(st->coverage, 0, (size_t)st->max_amp * (size_t)st->max_amp_len
+           * sizeof(*st->coverage));
+    memset(st->covered_perc,  0, st->max_amp * sizeof(*st->covered_perc));
+    memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
+
+    // Keep the allocated entries as it's likely all files will share
+    // the same keys.  Instead we reset counters to zero for common ones
+    // and delete rare ones.
+    int i;
+    for (i = 0; i <= st->max_amp; i++) {
+        khiter_t k;
+        for (k = kh_begin(st->tcoord[i]);
+             k != kh_end(st->tcoord[i]); k++)
+            if (kh_exist(st->tcoord[i], k)) {
+                if (kh_value(st->tcoord[i], k) < 5)
+                    kh_del(tcoord, st->tcoord[i], k);
+                else
+                    kh_value(st->tcoord[i], k) = 0;
+            }
+    }
+
+    // Free every key before emptying the table
+    khiter_t k;
+    for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+        if (kh_exist(st->qend, k))
+            free((void *)kh_key(st->qend, k));
+    kh_clear(qname, st->qend);
+
+    memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
+    memset(st->depth_all,   0, st->max_len * sizeof(*st->depth_all));
+    memset(st->amp_dist,  0, st->max_amp * sizeof(*st->amp_dist));
+}
+
+/*
+ * Resets the per-file statistics of every reference that has amplicon
+ * sites attached; references without sites are left untouched.
+ */
+static void amp_stats_reset(amplicons_t *amps, int nref) {
+    int r = 0;
+
+    while (r < nref) {
+        if (amps[r].sites)
+            stats_reset(amps[r].lstats);
+        r++;
+    }
+}
+
+/*
+ * Folds one alignment record into the per-file statistics of the
+ * reference it is aligned to.
+ *
+ * args  Filtering flags and the TLEN adjustment.
+ * amps  Per-reference amplicon data, indexed by the record's tid.
+ * b     The alignment record; the caller must ensure b->core.tid >= 0.
+ *
+ * Updates the sequence counters, the per-base depth arrays, the
+ * per-amplicon read/base/coverage counts, the amplicon status
+ * distribution and the template coordinate hash.
+ *
+ * Returns 0 on success (including records that are filtered out or whose
+ * reference has no statistics attached) and -1 if a hash insertion or a
+ * read-name copy fails.
+ */
+static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
+                            bam1_t *b) {
+    int ref = b->core.tid;
+    amplicon_t *amp = amps[ref].amp;
+    astats_t *stats = amps[ref].lstats;
+    int len = amps[ref].len;
+
+    if (!stats)
+        return 0;
+
+    stats->nseq++;
+    if ((b->core.flag & args->flag_require) != args->flag_require ||
+        (b->core.flag & args->flag_filter) != 0) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    int64_t start = b->core.pos, mstart = start; // modified start
+    int64_t end = bam_endpos(b), i;
+
+    // Compute all-template-depth and valid-template-depth.
+    // We track current end location per read name so we can remove overlaps.
+    // Potentially we could use this data for a better amplicon-depth
+    // count too, but for now it's purely for the per-base plots.
+    int ret;
+    khiter_t k;
+    int prev_start = 0, prev_end = 0;
+    if ((b->core.flag & BAM_FPAIRED)
+        && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
+        k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
+        if (ret < 0)
+            return -1; // insertion failed; k is not a usable bucket
+        if (ret == 0) {
+            // Second read of the pair: recover the first read's extent
+            // (start in the low 32 bits, end in the high 32 bits).
+            prev_start = kh_value(stats->qend, k) & 0xffffffff;
+            prev_end = kh_value(stats->qend, k)>>32;
+            mstart = MAX(mstart, prev_end);
+            // Ideally we'd reuse strings so we don't thrash free/malloc.
+            // However let's see if the official way of doing that (malloc
+            // itself) is fast enough first.
+            free((void *)kh_key(stats->qend, k));
+            kh_del(qname, stats->qend, k);
+            //fprintf(stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
+        } else {
+            // First read of the pair: the hash must own its own copy of
+            // the name as the record buffer is reused by the caller.
+            if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
+                return -1;
+
+            kh_value(stats->qend, k) = start | (end << 32);
+        }
+    }
+    // Never index before the start of the depth array.
+    for (i = MAX(mstart, 0); i < end && i < len; i++)
+        stats->depth_all[i]++;
+    if (i < end) {
+        print_error("ampliconstats", "record %s overhangs end of reference",
+                    bam_get_qname(b));
+        // But keep going, as it's harmless.
+    }
+
+    // On single ended runs, eg ONT or PacBio, we just use the start/end
+    // of the template to assign.
+    int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
+        ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
+        : (start >= 0 && start < len ? pos2start[start] : -1);
+
+    // ivar sometimes soft-clips 100% of the bases.
+    // This is essentially unmapped
+    if (end == start && (args->flag_filter & BAM_FUNMAP)) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    if (anum == -1)
+        stats->nfailprimer++;
+
+    if (anum >= 0) {
+        int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
+        if (c > 0) {
+            stats->nreads[anum]++;
+            // NB: ref bases rather than read bases
+            stats->nbases[anum] += c;
+
+            if (start < 0) start = 0;
+            if (end > len) end = len;
+
+            int64_t ostart = MAX(start, amp[anum].min_left-1);
+            int64_t oend = MIN(end, amp[anum].max_right);
+            int64_t offset = amp[anum].min_left-1;
+            // Each amplicon owns max_amp_len cells of the coverage
+            // matrix; stop at the end of this row so an over-long
+            // amplicon cannot spill into the next row or past the array.
+            for (i = ostart; i < oend && i-offset < stats->max_amp_len; i++)
+                stats->coverage[anum*stats->max_amp_len + i-offset]++;
+        } else {
+            stats->nfailprimer++;
+        }
+    }
+
+    // Template length in terms of amplicon number to amplicon number.
+    // We expect left to right of same amplicon (len 0), but it may go
+    // to next amplicon (len 1) or prev (len -1), etc.
+    int64_t t_end;
+    int oth_anum = -1;
+
+    if (b->core.flag & BAM_FPAIRED) {
+        t_end = (b->core.flag & BAM_FREVERSE ? end : start)
+            + b->core.isize;
+
+        // If we've clipped the primers but not followed up with a fixmates
+        // then our start+TLEN will take us to a location which is
+        // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
+        //
+        // The correct solution is to run samtools fixmate so TLEN is correct.
+        // The hacky solution is to fudge the expected tlen by double the
+        // average primer length (e.g. 50).
+        t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
+
+        if (t_end > 0 && t_end < len && b->core.isize != 0)
+            oth_anum = (b->core.flag & BAM_FREVERSE)
+                ? pos2start[t_end]
+                : pos2end[t_end];
+    } else {
+        // Not paired (see int anum = (REV || !PAIR) ?en :st expr above).
+        // Guard the lookup the same way the anum expression does.
+        oth_anum = (start >= 0 && start < len) ? pos2start[start] : -1;
+        t_end = end;
+    }
+
+    // We don't want to count our pairs twice.
+    // If both left/right are known, count it on left only.
+    // If only one is known, we'll only get to this code once
+    // so we can also count it.
+    int astatus = 2;
+    if (anum != -1 && oth_anum != -1) {
+        astatus = oth_anum == anum ? 0 : 1;
+        if (start <= t_end)
+            stats->amp_dist[anum][astatus]++;
+    } else if (anum >= 0) {
+        stats->amp_dist[anum][astatus = 2]++;
+    }
+
+    if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+        // depth_valid is indexed by reference position like depth_all, so
+        // both loops below are clamped to [0, len) in the same way.
+        if (prev_end && mstart > prev_end) {
+            // 2nd read with gap to 1st; undo previous increment.
+            for (i = MAX(prev_start, 0); i < prev_end && i < len; i++)
+                stats->depth_valid[i]--;
+            stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        } else {
+            // 1st read, or 2nd read that overlaps 1st
+            for (i = MAX(mstart, 0); i < end && i < len; i++)
+                stats->depth_valid[i]++;
+            stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        }
+    }
+
+    // Track template start,end frequencies, so we can give stats on
+    // amplicon primer usage.
+    if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
+        // left to right only, so we don't double count template positions.
+        return 0;
+
+    start = b->core.pos;
+    t_end = b->core.flag & BAM_FPAIRED
+        ? start + b->core.isize-1
+        : end;
+    // Key: 1-based start in the low 32 bits, 1-based end in the high 32.
+    uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
+    // Slot 0 collects templates with no amplicon (anum == -1).
+    k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
+    if (ret < 0)
+        return -1;
+    if (ret == 0)
+        kh_value(stats->tcoord[anum+1], k)++;
+    else
+        kh_value(stats->tcoord[anum+1], k)=1;
+    // Value: frequency in the low 32 bits, status above it.
+    kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
+
+    return 0;
+}
+
+// Append file local stats to global stats
+/*
+ * Adds the statistics gathered for one file (lstats) to the running
+ * totals kept across all files (gstats).
+ *
+ * namp      Number of amplicons to merge.
+ * all_nseq  Divisor used for the per-amplicon read percentage; zero
+ *           yields a percentage of zero.
+ *
+ * Returns 0 on success, -1 if a hash insertion fails.
+ */
+int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
+    int slot, pos;
+
+    gstats->nseq += lstats->nseq;
+    gstats->nfiltered += lstats->nfiltered;
+    gstats->nfailprimer += lstats->nfailprimer;
+
+    // tcoord[0] holds templates without an amplicon; tcoord[N+1] belongs
+    // to amplicon N.
+    for (slot = 0; slot <= namp; slot++) {
+        khash_t(tcoord) *src = lstats->tcoord[slot];
+        khash_t(tcoord) *dst = gstats->tcoord[slot];
+        khiter_t from;
+
+        // Merge every non-zero local entry into the global hash.
+        for (from = kh_begin(src); from != kh_end(src); from++) {
+            if (kh_exist(src, from) && kh_value(src, from) != 0) {
+                int absent;
+                khiter_t to = kh_put(tcoord, dst, kh_key(src, from), &absent);
+                if (absent < 0)
+                    return -1;
+
+                if (absent == 0)
+                    // Existing key: keep only its frequency bits and add
+                    // the local value on top.
+                    kh_value(dst, to) = (kh_value(dst, to) & 0xFFFFFFFF)
+                        + kh_value(src, from);
+                else
+                    kh_value(dst, to) = kh_value(src, from);
+            }
+        }
+
+        int a = slot - 1;
+        if (a < 0)
+            continue;
+
+        gstats->nreads[a] += lstats->nreads[a];
+        gstats->nreads2[a] += lstats->nreads[a] * lstats->nreads[a];
+        gstats->nfull_reads[a] += lstats->nfull_reads[a];
+
+        // To get mean & sd for amplicon read percentage, we need
+        // to do the divisions here as nseq differs for each sample.
+        double perc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
+        gstats->nrperc[a] += perc;
+        gstats->nrperc2[a] += perc*perc;
+
+        gstats->nbases[a] += lstats->nbases[a];
+        gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
+
+        int d = 0;
+        while (d < MAX_DEPTH) {
+            gstats->covered_perc[a][d] += lstats->covered_perc[a][d];
+            gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
+                * lstats->covered_perc[a][d];
+            d++;
+        }
+
+        for (d = 0; d < 3; d++)
+            gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
+    }
+
+    // Per-base depths.
+    for (pos = 0; pos < lstats->max_len; pos++) {
+        gstats->depth_valid[pos] += lstats->depth_valid[pos];
+        gstats->depth_all[pos] += lstats->depth_all[pos];
+    }
+
+    return 0;
+}
+
+/*
+ * Merges the per-file statistics of every reference that has amplicon
+ * sites into that reference's global statistics.
+ *
+ * The count of matching sequences (total minus filtered minus failed
+ * primer) is summed over all such references first, and that sum is
+ * handed to append_lstats() for each of them.
+ *
+ * Returns 0 on success, -1 if any merge fails.
+ */
+int append_stats(amplicons_t *amps, int nref) {
+    int r, matched = 0;
+
+    for (r = 0; r < nref; r++) {
+        if (amps[r].sites) {
+            astats_t *st = amps[r].lstats;
+            matched += st->nseq - st->nfiltered - st->nfailprimer;
+        }
+    }
+
+    for (r = 0; r < nref; r++) {
+        if (amps[r].sites &&
+            append_lstats(amps[r].lstats, amps[r].gstats, amps[r].namp,
+                          matched) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * One template coordinate record as unpacked from the tcoord hash.
+ */
+typedef struct {
+    int32_t start, end;  // template start and end coordinates
+    uint32_t freq;       // number of times this start,end pair was seen
+    uint32_t status;     // amplicon status code stored alongside the count
+} tcoord_t;
+
+// Sort tcoord by descending frequency and then ascending start and end.
+//
+// This is a qsort() comparator.  The fields are compared explicitly rather
+// than subtracted: the difference of two uint32_t frequencies (or of two
+// int32_t coordinates of opposite sign) does not fit in an int and would
+// give the wrong sign for large gaps.
+static int tcoord_freq_sort(const void *vp1, const void *vp2) {
+    const tcoord_t *t1 = (const tcoord_t *)vp1;
+    const tcoord_t *t2 = (const tcoord_t *)vp2;
+
+    if (t1->freq != t2->freq)
+        return t1->freq < t2->freq ? 1 : -1;
+
+    if (t1->start != t2->start)
+        return t1->start < t2->start ? -1 : 1;
+
+    if (t1->end != t2->end)
+        return t1->end < t2->end ? -1 : 1;
+
+    return 0;
+}
+
+
+/*
+ * Merges tcoord start,end,freq,status tuples if their coordinates are
+ * close together. We aim to keep the start,end for the most frequent
+ * value and assume that is the correct coordinate and all others are
+ * minor fluctuations due to errors or variants.
+ *
+ * We sort by frequency first and then merge later items in the list into
+ * the earlier more frequent ones. It's O(N^2), but sufficient for now
+ * given current scale of projects.
+ *
+ * If we ever need to resolve that then consider sorting by start
+ * coordinate and scanning the list to find all items within X, find
+ * the most frequent of those, and then cluster that way. (I'd have
+ * done that had I thought of it at the time!)
+ */
+// args  Supplies tcoord_bin, the clustering distance.
+// tpos  Array of records; sorted and compacted in place.
+// np    In: number of records in tpos.  Out: number remaining after
+//       merging (always <= the input count).
+static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
+    size_t n = *np, j, j2, j3, k;
+
+    // Sort by frequency and cluster infrequent coords into frequent
+    // ones provided they're close by.
+    // This is O(N^2), but we've already binned by tcoord_bin/2 so
+    // the list isn't intended to be vast at this point.
+    qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
+
+    // For frequency ties, find mid start coord, and then find mid end
+    // coord of those matching start.
+    // We make that the first item so we merge into that mid point.
+    for (j = 0; j < n; j++) {
+        for (j2 = j+1; j2 < n; j2++) {
+            if (tpos[j].freq != tpos[j2].freq)
+                break;
+            if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
+                break;
+        }
+
+        // j to j2 all within bin of a common start,
+        // m is the mid start.
+        if (j2-1 > j) {
+            size_t m = (j2-1 + j)/2;
+
+            // Find mid end for this same start.
+            // Stop at j: items before j belong to an earlier group, and
+            // stepping into them would swap a different-frequency record
+            // into this group and break the frequency ordering.
+            while (m > j && tpos[m].start == tpos[m-1].start)
+                m--;
+            for (j3 = m+1; j3 < j2; j3++) {
+                if (tpos[m].start != tpos[j3].start)
+                    break;
+                // NOTE(review): ends ascend after the sort, so this
+                // difference looks like it is never positive; the operands
+                // may be the wrong way round - confirm intent.
+                if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
+                    break;
+            }
+            if (j3-1 > m)
+                m = (j3-1 + m)/2;
+
+            // Swap with first item.
+            tcoord_t tmp = tpos[j];
+            tpos[j] = tpos[m];
+            tpos[m] = tmp;
+            j = j2-1;
+        }
+    }
+
+    // Now merge in coordinates.
+    // This bit is O(N^2), so consider binning first to reduce the
+    // size of the list if we have excessive positional variation.
+    for (k = j = 0; j < n; j++) {
+        // Already absorbed into an earlier record.
+        if (!tpos[j].freq)
+            continue;
+
+        // Compact surviving records towards the front of the array.
+        if (k < j)
+            tpos[k] = tpos[j];
+
+        for (j2 = j+1; j2 < n; j2++) {
+            if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
+                ABS(tpos[j].end  -tpos[j2].end)   < args->tcoord_bin/2 &&
+                tpos[j].status == tpos[j2].status) {
+                tpos[k].freq += tpos[j2].freq;
+                tpos[j2].freq = 0;
+            }
+        }
+        k++;
+    }
+
+    *np = k;
+}
+
+/*
+ * Writes one complete statistics report to args->out_fp.
+ *
+ * args   Output stream, depth thresholds, binning and multi_ref settings.
+ * type   Character prefixed to every record name.  'F' additionally emits
+ *        the per-amplicon percentage coverage (and stores it in the stats
+ *        structure); 'C' additionally emits MEAN / STDDEV lines.
+ * name   Label written in the second column of most records.
+ * nfile  Number of input files; the divisor for the 'C' means.
+ * amps   Per-reference amplicon data, nref entries.  References without
+ *        sites are skipped throughout.
+ * local  Non-zero to report each reference's lstats, zero for gstats.
+ *
+ * Returns 0 on success, -1 on memory allocation failure or when an
+ * amplicon is longer than the coverage matrix row (max_amp_len).
+ */
+int dump_stats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref, int local) {
+    int i, r;
+    FILE *ofp = args->out_fp;
+    tcoord_t *tpos = NULL;
+    size_t ntcoord = 0;
+
+    // summary stats for this sample (or for all samples)
+    fprintf(ofp, "# Summary stats.\n");
+    fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
+        // Room for "name<TAB>ref" plus the terminating NUL.
+        char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
+        if (!name_ref)
+            return -1;
+        if (args->multi_ref)
+            sprintf(name_ref, "%s\t%s", name, amps[r].ref);
+        else
+            sprintf(name_ref, "%s", name);
+        fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
+                type, name_ref, stats->nseq);
+        fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
+                type, name_ref, stats->nfiltered);
+        fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
+                type, name_ref, stats->nfailprimer);
+        fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
+                type, name_ref, nmatch);
+
+        int d = 0;
+        do {
+            // From first to last amplicon only, so not entire consensus.
+            // If contig length is known, maybe we want to add the missing
+            // count to < DEPTH figures?
+            int64_t start = 0, covered = 0, total = 0;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t j, offset = amp[i].min_left-1;
+                if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                    fprintf(stderr, "[ampliconstats] error: "
+                            "Maximum amplicon length (%d) exceeded for '%s'\n",
+                            (int)stats->max_amp_len, name);
+                    free(name_ref);
+                    return -1;
+                }
+                // "start" tracks the end of the previous amplicon so bases
+                // shared by overlapping amplicons are counted once.
+                for (j = MAX(start, amp[i].max_left-1);
+                     j < MAX(start, amp[i].min_right); j++) {
+                    if (stats->coverage[i*stats->max_amp_len + j-offset]
+                        >= args->min_depth[d])
+                        covered++;
+                    total++;
+                }
+                start = MAX(start, amp[i].min_right);
+            }
+            fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
+                    PRId64"\t%"PRId64"\n", type, name_ref,
+                    args->min_depth[d], args->min_depth[d],
+                    total-covered, covered);
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+        free(name_ref);
+    }
+
+    // Read count
+    fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cREADS\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    // Valid depth is the number of full length reads (already divided
+    // by the number we expect to cover), so +0.5 per read in pair.
+    // A.k.a "usable depth" in the plots.
+    fprintf(ofp, "%cVDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++)
+            fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CREADS\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CREADS\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                // sqrt(mean of squares - square of mean)
+                double n1 = stats->nreads[i];
+                fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
+                        ? sqrt(stats->nreads2[i]/(double)nfile
+                               - (n1/nfile)*(n1/nfile))
+                        : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
+    fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cRPERC\t%s", type, name);
+    // Matching sequences summed over every reference with amplicons.
+    int all_nseq = 0;
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
+    }
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            if (type == 'C') {
+                fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
+            } else {
+                fprintf(ofp, "\t%.3f",
+                        all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
+            }
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we compute mean and standard deviation too
+        fprintf(ofp, "CRPERC\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CRPERC\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                // variance = SUM(X^2)/N - (SUM(X)/N)^2
+                double n1 = stats->nrperc[i];
+                double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Base depth
+    fprintf(ofp, "# Read depth per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        amplicon_t *amp = amps[r].amp;
+        for (i = 0; i < amps[r].namp; i++) {
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            int64_t alen = amp[i].min_right - amp[i].max_left+1;
+            fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CDEPTH\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CDEPTH\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                double alen = amp[i].min_right - amp[i].max_left+1;
+                double n1 = stats->nbases[i] / alen;
+                double v = stats->nbases2[i] / (alen*alen) /nfile
+                    - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Percent Coverage
+    if (type == 'F') {
+        fprintf(ofp, "# Percentage coverage per amplicon\n");
+        fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
+        int d = 0;
+        do {
+            fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
+
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                amplicon_t *amp = amps[r].amp;
+                for (i = 0; i < amps[r].namp; i++) {
+                    int covered = 0;
+                    if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                        fprintf(stderr, "[ampliconstats] error: "
+                                "Maximum amplicon length (%d) exceeded for '%s'\n",
+                                (int)stats->max_amp_len, name);
+                        return -1;
+                    }
+                    int64_t j, offset = amp[i].min_left-1;
+                    for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
+                        int apos = i*stats->max_amp_len + j-offset;
+                        if (stats->coverage[apos] >= args->min_depth[d])
+                            covered++;
+                    }
+                    int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                    // Remembered so the combined report can later derive
+                    // a mean and standard deviation from it.
+                    stats->covered_perc[i][d] = 100.0 * covered / alen;
+                    fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+    } else if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        int d = 0;
+        do {
+            fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
+                }
+            }
+            fprintf(ofp, "\n");
+
+            fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    double n1 = stats->covered_perc[i][d] / nfile;
+                    double v = stats->covered_perc2[i][d] / nfile - n1*n1;
+                    fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+    }
+
+    // Plus base depth for all reads, irrespective of amplicon.
+    // This is post overlap removal, if reads in the read-pair overlap.
+    fprintf(ofp, "# Depth per reference base for ALL data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
+            type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_ALL\t%s", type, name);
+
+        for (i = 0; i < amps[r].len; i++) {
+            // Basic run-length encoding provided all values are within
+            // +- depth_bin fraction of the mid-point.
+            int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_all[j];
+                if (d < low || d > high)
+                    break;
+                // Widen the run's range and recentre the tolerance band.
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            // Emit "mid-point depth,run length" and skip past the run.
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // And depth for only reads matching to a single amplicon for full
+    // length.  This is post read overlap removal.
+    fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
+            "part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_VALID\t%s", type, name);
+
+        // Same run-length encoding as used for DP_ALL.
+        for (i = 0; i < amps[r].len; i++) {
+            int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_valid[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // TCOORD (start to end) distribution
+    fprintf(ofp, "# Distribution of aligned template coordinates.\n");
+    fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        // With a single reference the loop starts at -1 so that hash slot
+        // 0 (templates with no amplicon) is reported as well.
+        for (i = 0 - (nref==1); i < amps[r].namp; i++) {
+            // Grow the scratch array to fit the largest hash seen so far.
+            if (ntcoord < kh_size(stats->tcoord[i+1])) {
+                ntcoord = kh_size(stats->tcoord[i+1]);
+                tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
+                if (!tmp) {
+                    free(tpos);
+                    return -1;
+                }
+                tpos = tmp;
+            }
+
+            khiter_t k;
+            size_t n = 0, j;
+            for (k = kh_begin(stats->tcoord[i+1]);
+                 k != kh_end(stats->tcoord[i+1]); k++) {
+                if (!kh_exist(stats->tcoord[i+1], k) ||
+                    (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
+                    continue;
+                // Key is start,end in 32-bit quantities.
+                // Yes this limits us to 4Gb references, but just how
+                // many primers are we planning on making?  Not that many
+                // I hope.
+                tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
+
+                // Value is frequency (bottom 32-bits) and status (top 32).
+                tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
+                n++;
+            }
+
+            if (args->tcoord_bin > 1)
+                aggregate_tcoord(args, tpos, &n);
+
+            fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
+                    i+1+amps[r].first_amp); // per amplicon
+            for (j = 0; j < n; j++) {
+                if (tpos[j].freq < args->tcoord_min_count)
+                    continue;
+                fprintf(ofp, "\t%d,%d,%u,%u",
+                        tpos[j].start,
+                        tpos[j].end,
+                        tpos[j].freq,
+                        tpos[j].status);
+            }
+            fprintf(ofp, "\n");
+        }
+    }
+
+
+    // AMP length distribution.
+    // 0 = both ends in this amplicon
+    // 1 = ends in different amplicons
+    // 2 = other end matching an unknown amplicon site
+    //     (see tcoord for further analysis of where)
+    fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
+    fprintf(ofp, "# number with both primers from this amplicon, number with\n");
+    fprintf(ofp, "# primers from different amplicon, and number with a position\n");
+    fprintf(ofp, "# not matching any valid amplicon primer site\n");
+    fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
+
+    fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
+    int amp_dist[3] = {0};
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
+            amp_dist[0] += stats->amp_dist[i][0];
+            amp_dist[1] += stats->amp_dist[i][1];
+            amp_dist[2] += stats->amp_dist[i][2];
+        }
+    }
+    fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            // per amplicon
+            fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
+            fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
+                    stats->amp_dist[i][1], stats->amp_dist[i][2]);
+        }
+    }
+
+    free(tpos);
+    return 0;
+}
+
+/*
+ * Writes the report for the per-file (local) statistics.
+ * Arguments and return value are those of dump_stats().
+ */
+int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int use_local = 1;
+
+    return dump_stats(args, type, name, nfile, amps, nref, use_local);
+}
+
+/*
+ * Writes the report for the global (all files combined) statistics.
+ * Arguments and return value are those of dump_stats().
+ */
+int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int use_local = 0;
+
+    return dump_stats(args, type, name, nfile, amps, nref, use_local);
+}
+
+/*
+ * Looks up the SM (sample) tag of a read-group header line.
+ *
+ * header  Parsed SAM header to search.
+ * RG      Read-group ID to match, or NULL to pass no ID key to the lookup.
+ *
+ * Returns a heap-allocated string, or NULL if the lookup fails.  The
+ * caller in this file releases a non-NULL result with free().
+ */
+char const *get_sample_name(sam_hdr_t *header, char *RG) {
+    kstring_t ks = {0};
+
+    // Do not hand back a buffer from a failed lookup.
+    if (sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks) < 0) {
+        free(ks.s);
+        return NULL;
+    }
+    return ks.s;
+}
+
+// Reference length lookup.
+//
+// With SQ == NULL this returns the length of the longest reference in the
+// header (0 when there are none).  Otherwise it returns the length of the
+// reference named SQ, or -1 if the header has no such reference.
+int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
+    if (!SQ) {
+        int64_t longest = 0;
+        int t, total = sam_hdr_nref(header);
+
+        for (t = 0; t < total; t++) {
+            int64_t cur = sam_hdr_tid2len(header, t);
+            if (cur > longest)
+                longest = cur;
+        }
+        return longest;
+    }
+
+    int tid = sam_hdr_name2tid(header, SQ);
+    if (tid < 0)
+        return -1;
+    return sam_hdr_tid2len(header, tid);
+}
+
+/*
+ * Top-level driver: gathers amplicon statistics for every input file and
+ * writes the per-file and combined reports to args->out_fp.
+ *
+ * args      Parsed command-line options and the output stream.
+ * bed_hash  Amplicon sites keyed by reference name.
+ * filev     Input file names.
+ * filec     Number of entries in filev.
+ *
+ * The first file's header defines the set of references; each file's
+ * header is then checked against it.  Every file is reported with type
+ * 'F' and merged into the global totals, which are reported last with
+ * type 'C' under the name "COMBINED".
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int amplicon_stats(astats_args_t *args,
+                          khash_t(bed_list_hash) *bed_hash,
+                          char **filev, int filec) {
+    int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
+    samFile *fp = NULL;
+    sam_hdr_t *header = NULL;
+    bam1_t *b = bam_init1();
+    FILE *ofp = args->out_fp;
+    char sname_[8192], *sname = NULL;
+    amplicons_t *amps = NULL;
+
+    if (!b)
+        goto err;
+
+    // Report initial SS header.  We gather data from the bed_hash entries
+    // as well as from the first SAM header (with the requirement that all
+    // headers should be compatible).
+    if (filec) {
+        if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[0]);
+            goto err;
+        }
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (!amps) {
+            amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
+            if (!amps)
+                goto err;
+            fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
+            fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
+            fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
+            fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
+
+            // Note: order of hash entries will be different to order of
+            // BED file which may also differ to order of SQ headers.
+            // SQ header is canonical ordering (pos sorted file).
+            khiter_t k;
+            int bam_nref = sam_hdr_nref(header);
+            for (i = 0; i < bam_nref; i++) {
+                k = kh_get(bed_list_hash, bed_hash,
+                           sam_hdr_tid2name(header, i));
+                // kh_get() yields kh_end() for a missing key; that is not
+                // a bucket and must not be passed to kh_exist().
+                if (k == kh_end(bed_hash))
+                    continue;
+
+                bed_entry_list_t *sites = &kh_value(bed_hash, k);
+
+                ref = i;
+                amps[ref].ref = kh_key(bed_hash, k);
+                amps[ref].sites = sites;
+                amps[ref].namp = count_amplicon(sites);
+                amps[ref].amp  = calloc(sites->length,
+                                        sizeof(*amps[ref].amp));
+                if (!amps[ref].amp)
+                    goto err;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
+                            kh_key(bed_hash, k), amps[ref].namp);
+                else
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
+                            amps[ref].namp);
+
+                amps[ref].tid = ref;
+                if (ref_tid == -1)
+                    ref_tid = ref;
+
+                int64_t len = get_ref_len(header, kh_key(bed_hash, k));
+                amps[ref].len = len;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
+                            kh_key(bed_hash, k), len);
+                else
+                    fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
+                            len);
+
+                amps[ref].lstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                amps[ref].gstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                if (!amps[ref].lstats || !amps[ref].gstats)
+                    goto err;
+            }
+        }
+
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+    }
+    fprintf(ofp, "SS\tEnd of summary\n");
+
+    // Extract the bits of amplicon data we need from bed hash and turn
+    // it into a position-to-amplicon lookup table.
+    int offset = 0;
+    for (i = 0; i < nref; i++) {
+        if (!amps[i].sites)
+            continue;
+
+        amps[i].first_amp = offset;
+        if (bed2amplicon(args, amps[i].sites, amps[i].amp,
+                         &(amps[i].namp), i==0, amps[i].ref, offset) < 0)
+            goto err;
+
+        offset += amps[i].namp; // cumulative amplicon number across refs
+    }
+
+    // Now iterate over file contents, one at a time.
+    for (i = 0; i < filec; i++) {
+        char *nstart = filev[i];
+
+        fp = sam_open_format(filev[i], "r", &args->ga.in);
+        if (!fp) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[i]);
+            goto err;
+        }
+
+        if (args->ga.nthreads > 0)
+            hts_set_threads(fp, args->ga.nthreads);
+
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (nref != sam_hdr_nref(header)) {
+            print_error_errno("ampliconstats",
+                              "SAM headers are not consistent across input files");
+            goto err;
+        }
+        // NOTE(review): amps[r].ref is only assigned above for references
+        // found in the BED hash, so this also rejects headers containing
+        // references without amplicons - confirm that is intended.
+        int r;
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].ref ||
+                strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
+                amps[r].len != sam_hdr_tid2len(header, r)) {
+                print_error_errno("ampliconstats",
+                                  "SAM headers are not consistent across "
+                                  "input files");
+                goto err;
+            }
+        }
+
+        if (args->use_sample_name)
+            sname = (char *)get_sample_name(header, NULL);
+
+        if (!sname) {
+            // Fall back to the file name: drop any directory part and a
+            // trailing .bam / .sam / .cram, truncating to fit sname_.
+            sname = sname_;
+            char *nend = filev[i] + strlen(filev[i]), *cp;
+            if ((cp = strrchr(filev[i], '/')))
+                nstart = cp+1;
+            if ((cp = strrchr(nstart, '.')) &&
+                (strcmp(cp, ".bam") == 0 ||
+                 strcmp(cp, ".sam") == 0 ||
+                 strcmp(cp, ".cram") == 0))
+                nend = cp;
+            if (nend - nstart >= 8192) nend = nstart+8191;
+            memcpy(sname, nstart, nend-nstart);
+            sname[nend-nstart] = 0;
+        }
+
+        // Stats local to this sample only
+        amp_stats_reset(amps, nref);
+
+        int last_ref = -9;
+        while ((r = sam_read1(fp, header, b)) >= 0) {
+            // Other filter options useful here?
+            if (b->core.tid < 0)
+                continue;
+
+            // Rebuild the position lookup whenever the reference changes.
+            if (last_ref != b->core.tid) {
+                last_ref = b->core.tid;
+                if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
+                    goto err;
+            }
+
+            if (accumulate_stats(args, amps, b) < 0)
+                goto err;
+        }
+
+        // -1 is the normal end of file; anything lower is a read error.
+        if (r < -1) {
+            print_error_errno("ampliconstats", "Fail reading record");
+            goto err;
+        }
+
+        sam_hdr_destroy(header);
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+
+        fp = NULL;
+        header = NULL;
+
+        if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
+            goto err;
+
+        if (append_stats(amps, nref) < 0)
+            goto err;
+
+        if (sname && sname != sname_)
+            free(sname);
+        sname = NULL;
+    }
+
+    if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    bam_destroy1(b);
+    if (ret) {
+        if (header)
+            sam_hdr_destroy(header);
+        if (fp)
+            sam_close(fp);
+    }
+    // amps is NULL when its allocation failed even though nref was
+    // already set from the header.
+    if (amps) {
+        for (i = 0; i < nref; i++) {
+            stats_free(amps[i].lstats);
+            stats_free(amps[i].gstats);
+            free(amps[i].amp);
+        }
+    }
+    free(amps);
+    free(pos2start);
+    free(pos2end);
+    if (ret) {
+        if (sname && sname != sname_)
+            free(sname);
+    }
+
+    return ret;
+}
+
+// Writes the ampliconstats help text to "fp" and hands back "exit_status"
+// unchanged so callers can "return usage(...)".  Defaults shown in square
+// brackets are read from "args" or from the compile-time constants.
+static int usage(astats_args_t *args, FILE *fp, int exit_status) {
+    // Synopsis banner; no conversions needed so fputs() is sufficient.
+    fputs("\n"
+          "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
+          "\n"
+          "Options:\n", fp);
+
+    // Flag filters.  Only the low 16 bits of flag_filter are shown, which
+    // hides the 0x10000 "default not yet replaced" marker bit.
+    fprintf(fp,
+            " -f, --required-flag STR|INT\n"
+            " Only include reads with all of the FLAGs present [0x%X]\n"
+            " -F, --filter-flag STR|INT\n"
+            " Only include reads with none of the FLAGs present [0x%X]\n",
+            args->flag_require, args->flag_filter & 0xffff);
+
+    // Amplicon table limits.
+    fprintf(fp,
+            " -a, --max-amplicons INT\n"
+            " Change the maximum number of amplicons permitted [%d]\n"
+            " -l, --max-amplicon-length INT\n"
+            " Change the maximum length of an individual amplicon [%d]\n",
+            MAX_AMP, MAX_AMP_LEN);
+
+    // Coverage depth and primer matching tolerance.
+    fprintf(fp,
+            " -d, --min-depth INT[,INT]...\n"
+            " Minimum base depth(s) to consider position covered [%d]\n"
+            " -m, --pos-margin INT\n"
+            " Margin of error for matching primer positions [%d]\n",
+            args->min_depth[0], args->max_delta);
+
+    // Options whose help text has no run-time default to substitute.
+    fputs(" -o, --output FILE\n"
+          " Specify output file [stdout if unset]\n"
+          " -s, --use-sample-name\n"
+          " Use the sample name from the first @RG header line\n"
+          " -t, --tlen-adjust INT\n"
+          " Add/subtract from TLEN; use when clipping but no fixmate step\n"
+          " -b, --tcoord-bin INT\n"
+          " Bin template start,end positions into multiples of INT[1]\n", fp);
+
+    fprintf(fp,
+            " -c, --tcoord-min-count INT\n"
+            " Minimum template start,end frequency for recording [%d]\n",
+            TCOORD_MIN_COUNT);
+
+    fputs(" -D, --depth-bin FRACTION\n"
+          " Merge FDP values within +/- FRACTION together\n"
+          " -S, --single-ref\n"
+          " Force single-ref (<=1.12) output format\n", fp);
+
+    // Shared samtools global options.
+    sam_global_opt_help(fp, "I.--.@");
+
+    return exit_status;
+}
+
+/*
+ * Entry point for "samtools ampliconstats".
+ *
+ * Parses the command line, loads the primer BED file named by the first
+ * non-option argument, then gathers statistics over the remaining
+ * arguments (or over "-" when only the BED file was given).
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int main_ampliconstats(int argc, char **argv) {
+    astats_args_t args = {
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .flag_require = 0,
+        // Bit 0x10000 marks "still the default"; the first -F clears the
+        // whole default before OR-ing in the user's flags.
+        .flag_filter = 0x10B04,
+        //.sites = BED_LIST_INIT,
+        .max_delta = 30, // large enough to cope with alt primers
+        .min_depth = {1},
+        .use_sample_name = 0,
+        .max_amp = MAX_AMP,
+        .max_amp_len = MAX_AMP_LEN,
+        .tlen_adj = 0,
+        .out_fp = stdout,
+        .tcoord_min_count = TCOORD_MIN_COUNT,
+        .tcoord_bin = 1,
+        .depth_bin = 0.01,
+        .multi_ref = 1
+    }, oargs = args; // oargs keeps the pristine defaults for usage()
+
+    static const struct option loptions[] =
+    {
+        SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
+        {"help", no_argument, NULL, 'h'},
+        {"flag-require", required_argument, NULL, 'f'},
+        {"flag-filter", required_argument, NULL, 'F'},
+        // Spellings advertised by usage(); the two above are kept for
+        // backwards compatibility.
+        {"required-flag", required_argument, NULL, 'f'},
+        {"filter-flag", required_argument, NULL, 'F'},
+        {"min-depth", required_argument, NULL, 'd'},
+        {"output", required_argument, NULL, 'o'},
+        {"pos-margin", required_argument, NULL, 'm'},
+        {"use-sample-name", no_argument, NULL, 's'},
+        {"max-amplicons", required_argument, NULL, 'a'},
+        {"max-amplicon-length", required_argument, NULL, 'l'},
+        {"tlen-adjust", required_argument, NULL, 't'},
+        {"tcoord-min-count", required_argument, NULL, 'c'},
+        {"tcoord-bin", required_argument, NULL, 'b'},
+        {"depth-bin", required_argument, NULL, 'D'},
+        {"single-ref", no_argument, NULL, 'S'},
+        {NULL, 0, NULL, 0}
+    };
+    int opt;
+
+    while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
+        switch (opt) {
+        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'F':
+            if (args.flag_filter & 0x10000)
+                args.flag_filter = 0; // strip default on first -F usage
+            args.flag_filter |= bam_str2flag(optarg); break;
+
+        case 'm': args.max_delta = atoi(optarg); break; // margin
+        case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
+        case 'd': {
+            // Comma separated list; at most MAX_DEPTH values are kept.
+            int d = 0;
+            char *cp = optarg, *ep;
+            do {
+                long n = strtol(cp, &ep, 10);
+                args.min_depth[d++] = n;
+                if (*ep != ',')
+                    break;
+                cp = ep+1;
+            } while (d < MAX_DEPTH);
+            break;
+        }
+
+        case 'a': args.max_amp = atoi(optarg)+1;break;
+        case 'l': args.max_amp_len = atoi(optarg)+1;break;
+
+        case 'c': args.tcoord_min_count = atoi(optarg);break;
+        case 'b':
+            args.tcoord_bin = atoi(optarg);
+            if (args.tcoord_bin < 1)
+                args.tcoord_bin = 1;
+            break;
+
+        case 't': args.tlen_adj = atoi(optarg);break;
+
+        case 's': args.use_sample_name = 1;break;
+
+        case 'o':
+            if (!(args.out_fp = fopen(optarg, "w"))) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        case 'S':
+            args.multi_ref = 0;
+            break;
+
+        case '?': return usage(&oargs, stderr, EXIT_FAILURE);
+        case 'h': return usage(&oargs, stdout, EXIT_SUCCESS);
+
+        default:
+            // A rejected global option must abort rather than print the
+            // usage text and carry on regardless.
+            if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
+                return usage(&oargs, stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (argc <= optind)
+        return usage(&oargs, stdout, EXIT_SUCCESS);
+    if (argc <= optind+1 && isatty(STDIN_FILENO))
+        return usage(&oargs, stderr, EXIT_FAILURE);
+
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    if (!bed_hash) {
+        print_error_errno("ampliconstats", "Could not allocate BED table");
+        return 1;
+    }
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+        print_error_errno("ampliconstats",
+                          "Could not read file \"%s\"", argv[optind]);
+        // NOTE(review): bed_hash is deliberately not destroyed here as it
+        // is not visible whether a failed load leaves it in a destroyable
+        // state -- confirm against load_bed_file_multi_ref().
+        return 1;
+    }
+
+    // Count the references that actually have BED entries.
+    khiter_t k, ref_count = 0;
+    for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
+        if (!kh_exist(bed_hash, k))
+            continue;
+        ref_count++;
+    }
+
+    int ret = 1;
+    if (ref_count == 0) {
+        print_error("ampliconstats",
+                    "No primer regions found in \"%s\"", argv[optind]);
+        goto done;
+    }
+    if (ref_count > 1 && args.multi_ref == 0) {
+        print_error("ampliconstats",
+                    "Single-ref mode is not permitted for BED files\n"
+                    "containing more than one reference.");
+        goto done;
+    }
+
+    args.argv = stringify_argv(argc, argv);
+    if (argc == ++optind) {
+        // Only the BED file was given: read alignments from stdin.
+        char *av = "-";
+        ret = amplicon_stats(&args, bed_hash, &av, 1);
+    } else {
+        ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
+    }
+
+ done:
+    free(args.argv); // still NULL if we bailed out before stringify_argv()
+    destroy_bed_hash(bed_hash);
+
+    // Close a -o output file so that buffered write errors (e.g. disk full)
+    // are reported instead of being silently lost.
+    if (args.out_fp != stdout && fclose(args.out_fp) != 0) {
+        print_error_errno("ampliconstats", "Error closing output file");
+        if (ret == 0)
+            ret = 1;
+    }
+
+    return ret;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* ampliconstats -- "samtools stats" style statistics for amplicon sequencing.
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+/*
+ * This tool is designed to give "samtools stats" style output, but dedicated
+ * to small amplicon sequencing projects. It gathers stats on the
+ * distribution of reads across amplicons.
+ */
+
+/*
+ * TODO:
+ * - Cope with multiple references. What do we do here? Just request one?
+ * - Permit regions rather than consuming whole file (maybe solves above).
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <math.h>
+
+#include <htslib/sam.h>
+#include <htslib/khash.h>
+
+#include "samtools.h"
+#include "sam_opts.h"
+#include "bam_ampliconclip.h"
+
+// tcoord: template coordinate -> count.  accumulate_stats() builds the key
+// as (start+1) in the low 32 bits and (end+1) in the high 32 bits, and
+// stores the count in the low 32 bits of the value with the amplicon
+// status OR-ed in from bit 32 upwards.
+KHASH_MAP_INIT_INT64(tcoord, int64_t)
+// qname: read name -> (start | end<<32) of the first read seen for that
+// name; used to trim the overlap when its mate arrives.
+KHASH_MAP_INIT_STR(qname, int64_t)
+
+// NB: these helper macros evaluate their arguments more than once, so do
+// not pass expressions with side effects.
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>=0?(a):-(a))
+#endif
+
+// Default minimum template start,end frequency for recording
+#define TCOORD_MIN_COUNT 10
+#define MAX_AMP 1000 // Default maximum number of amplicons
+#define MAX_AMP_LEN 1000 // Default maximum length of any single amplicon
+#define MAX_PRIMER_PER_AMPLICON 4 // Max primers per LEFT/RIGHT
+#define MAX_DEPTH 5 // Number of different depths permitted
+
+// Command line options and shared run-time state for ampliconstats.
+typedef struct {
+ // Standard samtools global options (input format, threads, ...)
+ sam_global_args ga;
+ // Reads must have all of these FLAG bits set to be counted
+ uint32_t flag_require;
+ // Reads with any of these FLAG bits set are counted as filtered
+ uint32_t flag_filter;
+ int max_delta; // Used for matching read to amplicon primer loc
+ int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
+ // Non-zero to take the sample name from the header rather than filename
+ int use_sample_name;
+ int max_amp; // Total number of amplicons
+ int max_amp_len; // Maximum length of an individual amplicon
+ double depth_bin;// aggregate depth within this fraction
+ int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate
+ // Destination for all report output
+ FILE *out_fp;
+ // Command line flattened into a single string
+ char *argv;
+ // Minimum template start,end frequency for recording
+ int tcoord_min_count;
+ // Template start,end positions are binned into multiples of this
+ int tcoord_bin;
+ // Non-zero for the multi-reference output format (extra REF column)
+ int multi_ref;
+} astats_args_t;
+
+// Statistics for one reference; used both for per-file ("local") and
+// across-file ("global") accumulation.  Allocated by stats_alloc(),
+// released by stats_free() and zeroed between files by stats_reset().
+typedef struct {
+ int nseq; // total sequence count
+ int nfiltered; // sequence filtered
+ int nfailprimer;// count of sequences not matching the primer locations
+
+ // Sizes of memory allocated below, to permit reset
+ int max_amp, max_amp_len, max_len;
+
+ // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
+ int64_t *nreads, *nreads2; // [max_amp]
+ double *nfull_reads; // [max_amp]; 0.5/read if paired.
+ double *nrperc, *nrperc2; // [max_amp]
+ int64_t *nbases, *nbases2; // [max_amp]
+ // Flattened 2D array: index is amplicon*max_amp_len + offset from the
+ // amplicon's outer left edge.
+ int64_t *coverage; // [max_amp][max_amp_len]
+ double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
+ double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
+ // Slot 0 holds templates with no amplicon assigned (amplicon -1);
+ // amplicon "a" lives in slot a+1.
+ khash_t(tcoord) **tcoord; // [max_amp+1]
+
+ // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
+ int (*amp_dist)[3]; // [max_amp][3];
+
+ // Per reference position depth; "valid" counts only templates whose
+ // two ends were assigned to the same amplicon.
+ int *depth_valid; // [max_len]
+ int *depth_all; // [max_len]
+ khash_t(qname) *qend; // queryname end, for overlap removal
+} astats_t;
+
+// We can have multiple primers for LEFT / RIGHT, so this
+// permits detection by any compatible combination.
+// One reference:
+typedef struct {
+ // BED end coordinate of each forward (+) strand primer
+ int64_t left[MAX_PRIMER_PER_AMPLICON];
+ int nleft;
+ // BED start coordinate of each reverse (-) strand primer
+ int64_t right[MAX_PRIMER_PER_AMPLICON];
+ int nright;
+ // The bounds below are filled in by bed2amplicon() from the primer
+ // coordinates, offset by one so that the primers themselves are excluded.
+ int64_t max_left, min_right; // inner dimensions
+ int64_t min_left, max_right; // outer dimensions
+} amplicon_t;
+
+// Multiple references, we have an array of amplicons_t - one per used ref.
+// We have per reference local and global stats here, as some of the stats
+// are coordinate based. However we report them combined together as a single
+// list across all references.
+// "namp" is the number of amplicons in this reference, but they're
+// numbered first_amp to first_amp+namp-1 inclusively.
+typedef struct {
+ int tid, namp;
+ // Reference length; checked against the length in each file's header
+ int64_t len;
+ // BED primer entries for this reference; NULL if the reference has
+ // none, in which case it is skipped throughout.
+ bed_entry_list_t *sites;
+ // Array of amplicons built from "sites" by bed2amplicon()
+ amplicon_t *amp;
+ astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
+ const char *ref; // ref name (pointer to the bed hash table key)
+ int first_amp; // first amplicon number for this ref
+} amplicons_t;
+
+// Reinitialised for each new reference/chromosome.
+// Indexed by 0-based reference position.  Each entry holds the 0-based
+// index (0 to namp-1) of the amplicon with a primer within max_delta of
+// that position, or -1 for no match.
+// NOTE(review): these are file-scope statics; if this code can be entered
+// more than once per process, whoever frees the tables must also reset the
+// pointers and pos2size -- confirm against the caller.
+static int *pos2start = NULL;
+static int *pos2end = NULL;
+static int pos2size = 0; // allocated size of pos2start/end
+
+// Lookup table to go from position to amplicon based on
+// read start / end.
+//
+// args  supplies max_delta, the +/- tolerance around each primer position.
+// amps  per-reference amplicon tables; amps[ref] supplies len, namp, amp.
+// ref   index into amps of the reference to build the tables for.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure (existing tables are left intact).
+static int initialise_amp_pos_lookup(astats_args_t *args,
+                                     amplicons_t *amps,
+                                     int ref) {
+    int64_t i, j;
+    amplicon_t *amp = amps[ref].amp;
+    int64_t max_len = amps[ref].len;
+    int namp = amps[ref].namp;
+
+    if (max_len+1 > pos2size) {
+        // Grow through a temporary so that a failed realloc neither leaks
+        // nor loses the old table, which the caller still frees.
+        int *tmp = realloc(pos2start, (max_len+1)*sizeof(*pos2start));
+        if (!tmp)
+            return -1;
+        pos2start = tmp;
+
+        tmp = realloc(pos2end, (max_len+1)*sizeof(*pos2end));
+        if (!tmp)
+            return -1;
+        pos2end = tmp;
+
+        // Record what was really allocated so that a later reference of
+        // the same length does not trigger a needless realloc.
+        pos2size = max_len+1;
+    }
+
+    // Default every position to "no amplicon".
+    for (i = 0; i < max_len; i++)
+        pos2start[i] = pos2end[i] = -1;
+
+    // Stamp each amplicon's index over the window around each of its
+    // primers.  Where windows overlap the later amplicon wins.
+    for (i = 0; i < namp; i++) {
+        for (j = 0; j < amp[i].nleft; j++) {
+            int64_t p;
+            for (p = amp[i].left[j] - args->max_delta;
+                 p <= amp[i].left[j] + args->max_delta; p++) {
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2start[p-1] = i;
+            }
+        }
+        for (j = 0; j < amp[i].nright; j++) {
+            int64_t p;
+            for (p = amp[i].right[j] - args->max_delta;
+                 p <= amp[i].right[j] + args->max_delta; p++) {
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2end[p-1] = i;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Counts amplicons.
+// Assumption: input BED file alternates between LEFT and RIGHT primers
+// per amplicon, thus we can count the number based on the switching
+// orientation.
+static int count_amplicon(bed_entry_list_t *sites) {
+    int idx;
+    int boundaries = 0;   // number of reverse -> forward strand switches
+    int prev_rev = 0;     // strand of the previous BED entry
+
+    for (idx = 0; idx < sites->length; idx++) {
+        int cur_rev = sites->bp[idx].rev;
+
+        // Leaving a reverse primer for a forward one starts a new amplicon.
+        if (prev_rev && cur_rev == 0)
+            boundaries++;
+
+        prev_rev = cur_rev;
+    }
+
+    // One more amplicon than there are boundaries between them.
+    return boundaries + 1;
+}
+
+// We're only interested in the internal part of the amplicon.
+// Our bed file has LEFT start/end followed by RIGHT start/end,
+// so collapse these to LEFT end / RIGHT start.
+//
+// Returns right most amplicon position on success,
+// < 0 on error
+//
+// As a side effect one "AMPLICON" line per amplicon is written to
+// args->out_fp, preceded by a header comment when do_title is set.
+//
+// args       supplies out_fp, multi_ref, max_amp and max_amp_len.
+// sites      BED entries for this reference, + strand primers first.
+// amp        output array; nleft/nright are only ever incremented here,
+//            so the caller must supply zeroed entries.
+// namp       set to the number of amplicons found.
+// ref        reference name, printed in multi-ref mode.
+// first_amp  number of amplicons on earlier references, so that amplicon
+//            numbers continue across references in multi-ref mode.
+static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
+                            amplicon_t *amp, int *namp, int do_title,
+                            const char *ref, int first_amp) {
+    int i, j;
+    int64_t max_right = 0;
+    FILE *ofp = args->out_fp;
+
+    *namp = 0;
+
+    // Assume all primers for the same amplicon are adjacent in BED
+    // with all + followed by all -.  Thus - to + signifies next primer set.
+    int last_rev = 0;
+    amp[0].max_left = 0;
+    amp[0].min_right = INT64_MAX;
+    amp[0].min_left = INT64_MAX;
+    amp[0].max_right = 0;
+    if (do_title) {
+        fprintf(ofp, "# Amplicon locations from BED file.\n");
+        fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
+                "comma-separated for alt-primers.\n");
+        if (args->multi_ref)
+            fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
+        else
+            fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
+    }
+    for (i = j = 0; i < sites->length; i++) {
+        if (i == 0 && sites->bp[i].rev != 0) {
+            fprintf(samtools_stderr, "[ampliconstats] error: BED file should start"
+                    " with the + strand primer\n");
+            return -1;
+        }
+        if (sites->bp[i].rev == 0 && last_rev) {
+            // Reverse -> forward switch: move on to the next amplicon.
+            j++;
+            if (j >= args->max_amp) {
+                fprintf(samtools_stderr, "[ampliconstats] error: too many amplicons"
+                        " (%d). Use -a option to raise this.\n", j);
+                return -1;
+            }
+            amp[j].max_left = 0;
+            amp[j].min_right = INT64_MAX;
+            amp[j].min_left = INT64_MAX;
+            amp[j].max_right = 0;
+        }
+        if (sites->bp[i].rev == 0) {
+            if (i == 0 || last_rev) {
+                if (j>0) fprintf(ofp, "\n");
+                if (args->multi_ref)
+                    fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
+                else
+                    fprintf(ofp, "AMPLICON\t%d", j+1);
+            }
+            if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
+                // Not a system call failure, so no errno text is wanted.
+                print_error("ampliconstats",
+                            "too many primers per amplicon (%d)",
+                            MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].left[amp[j].nleft++] = sites->bp[i].right;
+            if (amp[j].max_left < sites->bp[i].right+1)
+                amp[j].max_left = sites->bp[i].right+1;
+            if (amp[j].min_left > sites->bp[i].right+1)
+                amp[j].min_left = sites->bp[i].right+1;
+            // BED file, so left+1 as zero based.  right(+1-1) as
+            // BED goes one beyond end (and we want inclusive range).
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        } else {
+            if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
+                print_error("ampliconstats",
+                            "too many primers per amplicon (%d)",
+                            MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].right[amp[j].nright++] = sites->bp[i].left;
+            if (amp[j].min_right > sites->bp[i].left-1)
+                amp[j].min_right = sites->bp[i].left-1;
+            if (amp[j].max_right < sites->bp[i].left-1) {
+                amp[j].max_right = sites->bp[i].left-1;
+                if (amp[j].max_right - amp[j].min_left + 1 >=
+                    args->max_amp_len) {
+                    fprintf(samtools_stderr, "[ampliconstats] error: amplicon "
+                            "longer (%d) than max_amp_len option (%d)\n",
+                            (int)(amp[j].max_right - amp[j].min_left + 2),
+                            args->max_amp_len);
+                    return -1;
+                }
+                if (max_right < amp[j].max_right)
+                    max_right = amp[j].max_right;
+            }
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        }
+        last_rev = sites->bp[i].rev;
+    }
+    if (last_rev != 1) {
+        fprintf(ofp, "\n"); // useful if going to samtools_stdout
+        fprintf(samtools_stderr, "[ampliconstats] error: bed file does not end on"
+                " a reverse strand primer.\n");
+        return -1;
+    }
+    *namp = ++j;
+    if (j) fprintf(ofp, "\n");
+
+    if (j >= args->max_amp) {
+        fprintf(samtools_stderr, "[ampliconstats] error: "
+                "too many amplicons (%d). Use -a option to raise this.\n", j);
+        return -1;
+    }
+
+//    for (i = 0; i < *namp; i++) {
+//        fprintf(samtools_stdout, "%d\t%ld", i, amp[i].length);
+//        for (j = 0; j < amp[i].nleft; j++)
+//            fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].left[j]);
+//        for (j = 0; j < amp[i].nright; j++)
+//            fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].right[j]);
+//        fprintf(samtools_stdout, "\n");
+//    }
+
+    return max_right;
+}
+
+// Releases an astats_t and everything it owns.
+// Accepts NULL and also a partially constructed object, as produced when
+// stats_alloc() fails part way through.
+void stats_free(astats_t *st) {
+    if (!st)
+        return;
+
+    free(st->nreads);
+    free(st->nreads2);
+    free(st->nfull_reads);
+    free(st->nrperc);
+    free(st->nrperc2);
+    free(st->nbases);
+    free(st->nbases2);
+    free(st->coverage);
+    free(st->covered_perc);
+    free(st->covered_perc2);
+    free(st->amp_dist);
+
+    free(st->depth_valid);
+    free(st->depth_all);
+
+    if (st->tcoord) {
+        int i;
+        for (i = 0; i <= st->max_amp; i++) {
+            if (st->tcoord[i])
+                kh_destroy(tcoord, st->tcoord[i]);
+        }
+        free(st->tcoord);
+    }
+
+    // qend is created late in stats_alloc(), so it may still be NULL here;
+    // kh_begin/kh_end must not be applied to a NULL table.
+    if (st->qend) {
+        khiter_t k;
+        // The hash owns its key strings (strdup'ed on insertion).
+        for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+            if (kh_exist(st->qend, k))
+                free((void *)kh_key(st->qend, k));
+        kh_destroy(qname, st->qend);
+    }
+
+    free(st);
+}
+
+// Allocates a zeroed astats_t.
+//
+// max_len      number of entries in the per-position depth arrays.
+// max_amp      number of amplicon slots in the per-amplicon arrays.
+// max_amp_len  number of coverage entries per amplicon.
+//
+// Returns the new object on success (free with stats_free()),
+//         NULL on memory allocation failure.
+astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
+    astats_t *st = calloc(1, sizeof(*st));
+    if (!st)
+        return NULL;
+
+    // Remembered so that stats_reset() and stats_free() know the sizes.
+    st->max_amp = max_amp;
+    st->max_amp_len = max_amp_len;
+    st->max_len = max_len;
+
+    if (!(st->nreads = calloc(max_amp, sizeof(*st->nreads)))) goto err;
+    if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
+    if (!(st->nrperc = calloc(max_amp, sizeof(*st->nrperc)))) goto err;
+    if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
+    if (!(st->nbases = calloc(max_amp, sizeof(*st->nbases)))) goto err;
+    if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
+
+    if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
+        goto err;
+
+    // Both factors are user tunable ints; multiply as size_t so that the
+    // element count cannot overflow int before reaching calloc().
+    if (!(st->coverage = calloc((size_t)max_amp * max_amp_len,
+                                sizeof(*st->coverage))))
+        goto err;
+
+    if (!(st->covered_perc = calloc(max_amp, sizeof(*st->covered_perc))))
+        goto err;
+    if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
+        goto err;
+
+    // One extra slot: index 0 is for templates with no amplicon assigned.
+    if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
+    int i;
+    for (i = 0; i <= st->max_amp; i++)
+        if (!(st->tcoord[i] = kh_init(tcoord)))
+            goto err;
+
+    if (!(st->qend = kh_init(qname)))
+        goto err;
+
+    if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
+        goto err;
+    if (!(st->depth_all = calloc(max_len, sizeof(*st->depth_all))))
+        goto err;
+
+    if (!(st->amp_dist = calloc(max_amp, sizeof(*st->amp_dist)))) goto err;
+
+    return st;
+
+ err:
+    stats_free(st);
+    return NULL;
+}
+
+// Returns "st" to its freshly allocated state ready for the next file,
+// reusing the existing allocations.
+static void stats_reset(astats_t *st) {
+    st->nseq = 0;
+    st->nfiltered = 0;
+    st->nfailprimer = 0;
+
+    memset(st->nreads, 0, st->max_amp * sizeof(*st->nreads));
+    memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
+    memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
+
+    memset(st->nrperc, 0, st->max_amp * sizeof(*st->nrperc));
+    memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
+
+    memset(st->nbases, 0, st->max_amp * sizeof(*st->nbases));
+    memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
+
+    // Widen before multiplying: max_amp * max_amp_len can exceed INT_MAX.
+    memset(st->coverage, 0, (size_t)st->max_amp * st->max_amp_len
+           * sizeof(*st->coverage));
+    memset(st->covered_perc, 0, st->max_amp * sizeof(*st->covered_perc));
+    memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
+
+    // Keep the allocated entries as it's likely all files will share
+    // the same keys.  Instead we reset counters to zero for common ones
+    // and delete rare ones.
+    int i;
+    for (i = 0; i <= st->max_amp; i++) {
+        khiter_t k;
+        for (k = kh_begin(st->tcoord[i]);
+             k != kh_end(st->tcoord[i]); k++)
+            if (kh_exist(st->tcoord[i], k)) {
+                if (kh_value(st->tcoord[i], k) < 5)
+                    kh_del(tcoord, st->tcoord[i], k);
+                else
+                    kh_value(st->tcoord[i], k) = 0;
+            }
+    }
+
+    // Read names are owned by the hash, so free them before clearing it.
+    khiter_t k;
+    for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+        if (kh_exist(st->qend, k))
+            free((void *)kh_key(st->qend, k));
+    kh_clear(qname, st->qend);
+
+    memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
+    memset(st->depth_all, 0, st->max_len * sizeof(*st->depth_all));
+    memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist));
+}
+
+// Clears the per-file (local) statistics of every reference that has BED
+// entries; references without any are left untouched.
+static void amp_stats_reset(amplicons_t *amps, int nref) {
+    amplicons_t *cur = amps;
+    amplicons_t *const stop = amps + nref;
+
+    for (; cur < stop; cur++) {
+        if (cur->sites)
+            stats_reset(cur->lstats);
+    }
+}
+
+// Adds one alignment record to the per-file statistics of its reference.
+//
+// Relies on pos2start/pos2end having been built for b->core.tid by
+// initialise_amp_pos_lookup().  Records on references with no stats object
+// are ignored.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure.
+static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
+                            bam1_t *b) {
+    int ref = b->core.tid;
+    amplicon_t *amp = amps[ref].amp;
+    astats_t *stats = amps[ref].lstats;
+    int len = amps[ref].len;
+
+    if (!stats)
+        return 0;
+
+    stats->nseq++;
+    if ((b->core.flag & args->flag_require) != args->flag_require ||
+        (b->core.flag & args->flag_filter) != 0) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    int64_t start = b->core.pos, mstart = start; // modified start
+    int64_t end = bam_endpos(b), i;
+
+    // Compute all-template-depth and valid-template-depth.
+    // We track current end location per read name so we can remove overlaps.
+    // Potentially we could use this data for a better amplicon-depth
+    // count too, but for now it's purely for the per-base plots.
+    int ret;
+    khiter_t k;
+    int prev_start = 0, prev_end = 0;
+    if ((b->core.flag & BAM_FPAIRED)
+        && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
+        k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
+        if (ret < 0)
+            return -1; // hash could not grow; k is not a valid slot
+        if (ret == 0) {
+            // Mate already seen: start counting after its end.
+            prev_start = kh_value(stats->qend, k) & 0xffffffff;
+            prev_end = kh_value(stats->qend, k)>>32;
+            mstart = MAX(mstart, prev_end);
+            // Ideally we'd reuse strings so we don't thrash free/malloc.
+            // However let's see if the official way of doing that (malloc
+            // itself) is fast enough first.
+            free((void *)kh_key(stats->qend, k));
+            kh_del(qname, stats->qend, k);
+            //fprintf(samtools_stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
+        } else {
+            // First of the pair: the key must outlive "b", so copy it.
+            if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
+                return -1;
+
+            kh_value(stats->qend, k) = start | (end << 32);
+        }
+    }
+    for (i = mstart; i < end && i < len; i++)
+        stats->depth_all[i]++;
+    if (i < end) {
+        print_error("ampliconstats", "record %s overhangs end of reference",
+                    bam_get_qname(b));
+        // But keep going, as it's harmless.
+    }
+
+    // On single ended runs, eg ONT or PacBio, we just use the start/end
+    // of the template to assign.
+    int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
+        ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
+        : (start >= 0 && start < len ? pos2start[start] : -1);
+
+    // ivar sometimes soft-clips 100% of the bases.
+    // This is essentially unmapped
+    if (end == start && (args->flag_filter & BAM_FUNMAP)) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    if (anum == -1)
+        stats->nfailprimer++;
+
+    if (anum >= 0) {
+        int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
+        if (c > 0) {
+            stats->nreads[anum]++;
+            // NB: ref bases rather than read bases
+            stats->nbases[anum] += c;
+
+            int64_t i;
+            if (start < 0) start = 0;
+            if (end > len) end = len;
+
+            int64_t ostart = MAX(start, amp[anum].min_left-1);
+            int64_t oend = MIN(end, amp[anum].max_right);
+            int64_t offset = amp[anum].min_left-1;
+            // Index in 64-bit: anum*max_amp_len may not fit in an int.
+            for (i = ostart; i < oend; i++)
+                stats->coverage[(int64_t)anum*stats->max_amp_len + i-offset]++;
+        } else {
+            stats->nfailprimer++;
+        }
+    }
+
+    // Template length in terms of amplicon number to amplicon number.
+    // We expect left to right of same amplicon (len 0), but it may go
+    // to next amplicon (len 1) or prev (len -1), etc.
+    int64_t t_end;
+    int oth_anum = -1;
+
+    if (b->core.flag & BAM_FPAIRED) {
+        t_end = (b->core.flag & BAM_FREVERSE ? end : start)
+            + b->core.isize;
+
+        // If we've clipped the primers but not followed up with a fixmates
+        // then our start+TLEN will take us to a location which is
+        // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
+        //
+        // The correct solution is to run samtools fixmate so TLEN is correct.
+        // The hacky solution is to fudge the expected tlen by double the
+        // average primer length (e.g. 50).
+        t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
+
+        if (t_end > 0 && t_end < len && b->core.isize != 0)
+            oth_anum = (b->core.flag & BAM_FREVERSE)
+                ? pos2start[t_end]
+                : pos2end[t_end];
+    } else {
+        // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
+        // The lookup table only covers 0..len-1, so a record starting
+        // outside the reference simply has no amplicon.
+        oth_anum = (start >= 0 && start < len) ? pos2start[start] : -1;
+        t_end = end;
+    }
+
+    // We don't want to count our pairs twice.
+    // If both left/right are known, count it on left only.
+    // If only one is known, we'll only get to this code once
+    // so we can also count it.
+    int astatus = 2;
+    if (anum != -1 && oth_anum != -1) {
+        astatus = oth_anum == anum ? 0 : 1;
+        if (start <= t_end)
+            stats->amp_dist[anum][astatus]++;
+    } else if (anum >= 0) {
+        stats->amp_dist[anum][astatus = 2]++;
+    }
+
+    if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+        // depth_valid has "len" entries; records overhanging the reference
+        // end are tolerated above, so bound both loops by len as well.
+        if (prev_end && mstart > prev_end) {
+            // 2nd read with gap to 1st; undo previous increment.
+            for (i = prev_start; i < prev_end && i < len; i++)
+                stats->depth_valid[i]--;
+            stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        } else {
+            // 1st read, or 2nd read that overlaps 1st
+            for (i = mstart; i < end && i < len; i++)
+                stats->depth_valid[i]++;
+            stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        }
+    }
+
+    // Track template start,end frequencies, so we can give stats on
+    // amplicon primer usage.
+    if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
+        // left to right only, so we don't double count template positions.
+        return 0;
+
+    start = b->core.pos;
+    t_end = b->core.flag & BAM_FPAIRED
+        ? start + b->core.isize-1
+        : end;
+    // Key: 1-based start in the low 32 bits, 1-based end in the high 32.
+    uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
+    // Slot 0 is for "no amplicon" (anum == -1), hence anum+1.
+    k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
+    if (ret < 0)
+        return -1;
+    if (ret == 0)
+        kh_value(stats->tcoord[anum+1], k)++;
+    else
+        kh_value(stats->tcoord[anum+1], k)=1;
+    // Status rides along above the 32-bit count.
+    kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
+
+    return 0;
+}
+
+// Append file local stats to global stats
+// Folds one file's statistics (lstats) into the running totals (gstats)
+// for amplicons 0..namp-1.  all_nseq is the divisor used to turn this
+// file's per-amplicon read counts into percentages.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure.
+int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
+    int slot, pos;
+
+    gstats->nseq        += lstats->nseq;
+    gstats->nfiltered   += lstats->nfiltered;
+    gstats->nfailprimer += lstats->nfailprimer;
+
+    // tcoord slot 0 is the "no amplicon" table; slot s is amplicon s-1.
+    for (slot = 0; slot <= namp; slot++) {
+        khash_t(tcoord) *loc = lstats->tcoord[slot];
+        khash_t(tcoord) *glo = gstats->tcoord[slot];
+        khiter_t li;
+
+        // Add each non-zero local entry into the global table.
+        for (li = kh_begin(loc); li != kh_end(loc); li++) {
+            if (!kh_exist(loc, li))
+                continue;
+            if (kh_value(loc, li) == 0)
+                continue;
+
+            int ret;
+            khiter_t gi = kh_put(tcoord, glo, kh_key(loc, li), &ret);
+            if (ret < 0)
+                return -1;
+
+            // For an existing key keep only its low 32 bits before adding
+            // the local value; a new key starts from zero.
+            int64_t base = 0;
+            if (ret == 0)
+                base = kh_value(glo, gi) & 0xFFFFFFFF;
+            kh_value(glo, gi) = base + kh_value(loc, li);
+        }
+
+        if (slot == 0)
+            continue; // no per-amplicon arrays for "no amplicon"
+
+        int a = slot - 1;
+
+        gstats->nreads[a]      += lstats->nreads[a];
+        gstats->nreads2[a]     += lstats->nreads[a] * lstats->nreads[a];
+        gstats->nfull_reads[a] += lstats->nfull_reads[a];
+
+        // To get mean & sd for amplicon read percentage, we need
+        // to do the divisions here as nseq differs for each sample.
+        double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0;
+        gstats->nrperc[a]  += nrperc;
+        gstats->nrperc2[a] += nrperc*nrperc;
+
+        gstats->nbases[a]  += lstats->nbases[a];
+        gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
+
+        int d;
+        for (d = 0; d < MAX_DEPTH; d++) {
+            gstats->covered_perc[a][d]  += lstats->covered_perc[a][d];
+            gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
+                                         * lstats->covered_perc[a][d];
+        }
+
+        for (d = 0; d < 3; d++)
+            gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
+    }
+
+    // Per-position depths.
+    for (pos = 0; pos < lstats->max_len; pos++) {
+        gstats->depth_valid[pos] += lstats->depth_valid[pos];
+        gstats->depth_all[pos]   += lstats->depth_all[pos];
+    }
+
+    return 0;
+}
+
+// Merges the per-file statistics of every reference into its global
+// statistics.  Returns 0 on success, -1 on failure.
+int append_stats(amplicons_t *amps, int nref) {
+    int ref;
+    int matched_total = 0;
+
+    // First pass: sequences that passed filtering and matched a primer,
+    // summed over all references that have BED entries.
+    for (ref = 0; ref < nref; ref++) {
+        if (amps[ref].sites) {
+            astats_t *st = amps[ref].lstats;
+            matched_total += st->nseq - st->nfiltered - st->nfailprimer;
+        }
+    }
+
+    // Second pass: fold each reference's local stats into its global ones.
+    for (ref = 0; ref < nref; ref++) {
+        amplicons_t *cur = &amps[ref];
+        if (!cur->sites)
+            continue;
+        if (append_lstats(cur->lstats, cur->gstats, cur->namp,
+                          matched_total) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+// One template coordinate record: start and end position, how many times
+// that pair was seen, and the status the templates were classified with.
+typedef struct {
+    int32_t start, end;
+    uint32_t freq;
+    uint32_t status;
+} tcoord_t;
+
+// Sort tcoord by descending frequency and then ascending start and end.
+//
+// qsort() comparator.  Values are compared rather than subtracted: freq
+// is unsigned and start/end may be far apart, so a difference squeezed
+// into an int can wrap and report the wrong order.
+static int tcoord_freq_sort(const void *vp1, const void *vp2) {
+    const tcoord_t *t1 = (const tcoord_t *)vp1;
+    const tcoord_t *t2 = (const tcoord_t *)vp2;
+
+    if (t1->freq != t2->freq)
+        return t1->freq < t2->freq ? 1 : -1;   // most frequent first
+
+    if (t1->start != t2->start)
+        return t1->start < t2->start ? -1 : 1;
+
+    if (t1->end != t2->end)
+        return t1->end < t2->end ? -1 : 1;
+
+    return 0;
+}
+
+
+/*
+ * Merges tcoord start,end,freq,status tuples if their coordinates are
+ * close together. We aim to keep the start,end for the most frequent
+ * value and assume that is the correct coordinate and all others are
+ * minor fluctuations due to errors or variants.
+ *
+ * We sort by frequency first and then merge later items in the list into
+ * the earlier more frequent ones. It's O(N^2), but sufficient for now
+ * given current scale of projects.
+ *
+ * If we ever need to resolve that then consider sorting by start
+ * coordinate and scanning the list to find all items within X, find
+ * the most frequent of those, and then cluster that way. (I'd have
+ * done that had I thought of it at the time!)
+ */
+// args  supplies tcoord_bin, the clustering distance.
+// tpos  array of *np records; sorted and compacted in place.
+// np    in: number of records; out: number remaining after merging.
+static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
+    size_t n = *np, j, j2, j3, k;
+
+    // Sort by frequency and cluster infrequent coords into frequent
+    // ones provided they're close by.
+    // This is O(N^2), but we've already binned by tcoord_bin/2 so
+    // the list isn't intended to be vast at this point.
+    qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
+
+    // For frequency ties, find mid start coord, and then find mid end
+    // coord of those matching start.
+    // We make that the first item so we merge into that mid point.
+    for (j = 0; j < n; j++) {
+        for (j2 = j+1; j2 < n; j2++) {
+            if (tpos[j].freq != tpos[j2].freq)
+                break;
+            if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
+                break;
+        }
+
+        // j to j2 all within bin of a common start,
+        // m is the mid start.
+        if (j2-1 > j) {
+            size_t m = (j2-1 + j)/2;
+
+            // Find mid end for this same start.
+            // Walk back no further than j, the first record of this tie
+            // group; earlier records belong to more frequent groups and
+            // must not be swapped down into this one.
+            while (m > j && tpos[m].start == tpos[m-1].start)
+                m--;
+            for (j3 = m+1; j3 < j2; j3++) {
+                if (tpos[m].start != tpos[j3].start)
+                    break;
+                // NOTE(review): with ends sorted ascending this difference
+                // is never positive, so the test looks inverted -- confirm
+                // the intended bound before changing it.
+                if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
+                    break;
+            }
+            if (j3-1 > m)
+                m = (j3-1 + m)/2;
+
+            // Swap with first item.
+            tcoord_t tmp = tpos[j];
+            tpos[j] = tpos[m];
+            tpos[m] = tmp;
+            j = j2-1;
+        }
+    }
+
+    // Now merge in coordinates.
+    // This bit is O(N^2), so consider binning first to reduce the
+    // size of the list if we have excessive positional variation.
+    for (k = j = 0; j < n; j++) {
+        if (!tpos[j].freq)
+            continue; // already merged into an earlier record
+
+        if (k < j)
+            tpos[k] = tpos[j];
+
+        for (j2 = j+1; j2 < n; j2++) {
+            if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
+                ABS(tpos[j].end -tpos[j2].end) < args->tcoord_bin/2 &&
+                tpos[j].status == tpos[j2].status) {
+                tpos[k].freq += tpos[j2].freq;
+                tpos[j2].freq = 0;
+            }
+        }
+        k++;
+    }
+
+    *np = k;
+}
+
+/*
+ * Writes one complete report to args->out_fp: summary stats, per-amplicon
+ * read counts, read percentages, depths, percentage coverage, run-length
+ * encoded per-base depth, template coordinate distribution and amplicon
+ * classification.
+ *
+ * type   record prefix character; 'F' additionally emits the per-amplicon
+ *        coverage percentages and 'C' the MEAN / STDDEV rows.
+ * name   label printed on every record.
+ * nfile  divisor used for the MEAN / STDDEV rows.
+ * amps   per-reference amplicon data; entries without sites are skipped.
+ * nref   number of entries in amps.
+ * local  non-zero to report amps[r].lstats, zero for amps[r].gstats.
+ *
+ * Returns 0 on success, or -1 on allocation failure or when an amplicon
+ * is longer than stats->max_amp_len.
+ */
+int dump_stats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref, int local) {
+    int i, r;
+    FILE *ofp = args->out_fp;
+    tcoord_t *tpos = NULL;
+    size_t ntcoord = 0;
+
+    // summary stats for this sample (or for all samples)
+    fprintf(ofp, "# Summary stats.\n");
+    fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
+        // Room for "name\tref" plus the terminating NUL.
+        char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
+        if (!name_ref)
+            return -1;
+        if (args->multi_ref)
+            sprintf(name_ref, "%s\t%s", name, amps[r].ref);
+        else
+            sprintf(name_ref, "%s", name);
+        fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
+                type, name_ref, stats->nseq);
+        fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
+                type, name_ref, stats->nfiltered);
+        fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
+                type, name_ref, stats->nfailprimer);
+        fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
+                type, name_ref, nmatch);
+
+        int d = 0;
+        do {
+            // From first to last amplicon only, so not entire consensus.
+            // If contig length is known, maybe we want to add the missing
+            // count to < DEPTH figures?
+            int64_t start = 0, covered = 0, total = 0;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t j, offset = amp[i].min_left-1;
+                if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                    // Report the limit that was actually exceeded.
+                    fprintf(samtools_stderr, "[ampliconstats] error: "
+                            "Maximum amplicon length (%d) exceeded for '%s'\n",
+                            (int)stats->max_amp_len, name);
+                    free(name_ref);  // don't leak the label on failure
+                    return -1;
+                }
+                // MAX(start, ...) avoids counting a position twice where
+                // consecutive amplicons overlap.
+                for (j = MAX(start, amp[i].max_left-1);
+                     j < MAX(start, amp[i].min_right); j++) {
+                    if (stats->coverage[i*stats->max_amp_len + j-offset]
+                        >= args->min_depth[d])
+                        covered++;
+                    total++;
+                }
+                start = MAX(start, amp[i].min_right);
+            }
+            fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
+                    PRId64"\t%"PRId64"\n", type, name_ref,
+                    args->min_depth[d], args->min_depth[d],
+                    total-covered, covered);
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+        free(name_ref);
+    }
+
+    // Read count
+    fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cREADS\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    // Valid depth is the number of full length reads (already divided
+    // by the number we expect to cover), so +0.5 per read in pair.
+    // A.k.a "usable depth" in the plots.
+    fprintf(ofp, "%cVDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++)
+            fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CREADS\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CREADS\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                double n1 = stats->nreads[i];
+                fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
+                        ? sqrt(stats->nreads2[i]/(double)nfile
+                               - (n1/nfile)*(n1/nfile))
+                        : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
+    fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cRPERC\t%s", type, name);
+    // Total matching sequences over all references: the denominator for
+    // the per-amplicon percentages below.
+    int all_nseq = 0;
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer;
+    }
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            if (type == 'C') {
+                fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
+            } else {
+                fprintf(ofp, "\t%.3f",
+                        all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
+            }
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we compute mean and standard deviation too
+        fprintf(ofp, "CRPERC\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CRPERC\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                // variance = SUM(X^2)/N - (SUM(X)/N)^2
+                double n1 = stats->nrperc[i];
+                double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Base depth
+    fprintf(ofp, "# Read depth per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        amplicon_t *amp = amps[r].amp;
+        for (i = 0; i < amps[r].namp; i++) {
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            int64_t alen = amp[i].min_right - amp[i].max_left+1;
+            fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CDEPTH\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CDEPTH\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                double alen = amp[i].min_right - amp[i].max_left+1;
+                double n1 = stats->nbases[i] / alen;
+                double v = stats->nbases2[i] / (alen*alen) /nfile
+                    - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Percent Coverage
+    if (type == 'F') {
+        fprintf(ofp, "# Percentage coverage per amplicon\n");
+        fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
+        int d = 0;
+        do {
+            fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
+
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                amplicon_t *amp = amps[r].amp;
+                for (i = 0; i < amps[r].namp; i++) {
+                    int covered = 0;
+                    if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                        // Report the limit that was actually exceeded.
+                        fprintf(samtools_stderr, "[ampliconstats] error: "
+                                "Maximum amplicon length (%d) exceeded for '%s'\n",
+                                (int)stats->max_amp_len, name);
+                        return -1;
+                    }
+                    int64_t j, offset = amp[i].min_left-1;
+                    for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
+                        int apos = i*stats->max_amp_len + j-offset;
+                        if (stats->coverage[apos] >= args->min_depth[d])
+                            covered++;
+                    }
+                    int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                    // Stored as well as printed; the 'C' branch below
+                    // reads covered_perc back.
+                    stats->covered_perc[i][d] = 100.0 * covered / alen;
+                    fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+    } else if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        int d = 0;
+        do {
+            fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
+                }
+            }
+            fprintf(ofp, "\n");
+
+            fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    double n1 = stats->covered_perc[i][d] / nfile;
+                    double v = stats->covered_perc2[i][d] / nfile - n1*n1;
+                    fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+    }
+
+    // Plus base depth for all reads, irrespective of amplicon.
+    // This is post overlap removal, if reads in the read-pair overlap.
+    fprintf(ofp, "# Depth per reference base for ALL data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
+            type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_ALL\t%s", type, name);
+
+        for (i = 0; i < amps[r].len; i++) {
+            // Basic run-length encoding provided all values are within
+            // +- depth_bin fraction of the mid-point.
+            int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_all[j];
+                if (d < low || d > high)
+                    break;
+                // A new extreme moves the mid-point, so recompute the
+                // accepted band.
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            // Emit "depth,run-length" and continue after the run.
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // And depth for only reads matching to a single amplicon for full
+    // length.  This is post read overlap removal.
+    fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
+            "part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_VALID\t%s", type, name);
+
+        // Same run-length encoding as for DP_ALL above.
+        for (i = 0; i < amps[r].len; i++) {
+            int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_valid[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // TCOORD (start to end) distribution
+    fprintf(ofp, "# Distribution of aligned template coordinates.\n");
+    fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        // With a single reference the loop starts at -1, so tcoord[0]
+        // is reported as well.
+        for (i = 0 - (nref==1); i < amps[r].namp; i++) {
+            // Grow the scratch array to fit the largest table seen so far.
+            if (ntcoord < kh_size(stats->tcoord[i+1])) {
+                ntcoord = kh_size(stats->tcoord[i+1]);
+                tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
+                if (!tmp) {
+                    free(tpos);
+                    return -1;
+                }
+                tpos = tmp;
+            }
+
+            khiter_t k;
+            size_t n = 0, j;
+            for (k = kh_begin(stats->tcoord[i+1]);
+                 k != kh_end(stats->tcoord[i+1]); k++) {
+                // Skip empty buckets and entries whose frequency is zero.
+                if (!kh_exist(stats->tcoord[i+1], k) ||
+                    (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
+                    continue;
+                // Key is start,end in 32-bit quantities.
+                // Yes this limits us to 4Gb references, but just how
+                // many primers are we planning on making?  Not that many
+                // I hope.
+                tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
+
+                // Value is frequency (bottom 32 bits) and status (top 32).
+                tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
+                n++;
+            }
+
+            if (args->tcoord_bin > 1)
+                aggregate_tcoord(args, tpos, &n);
+
+            fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
+                    i+1+amps[r].first_amp); // per amplicon
+            for (j = 0; j < n; j++) {
+                if (tpos[j].freq < args->tcoord_min_count)
+                    continue;
+                fprintf(ofp, "\t%d,%d,%u,%u",
+                        tpos[j].start,
+                        tpos[j].end,
+                        tpos[j].freq,
+                        tpos[j].status);
+            }
+            fprintf(ofp, "\n");
+        }
+    }
+
+
+    // AMP length distribution.
+    // 0 = both ends in this amplicon
+    // 1 = ends in different amplicons
+    // 2 = other end matching an unknown amplicon site
+    //     (see tcoord for further analysis of where)
+    fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
+    fprintf(ofp, "# number with both primers from this amplicon, number with\n");
+    fprintf(ofp, "# primers from different amplicon, and number with a position\n");
+    fprintf(ofp, "# not matching any valid amplicon primer site\n");
+    fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
+
+    fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
+    int amp_dist[3] = {0};
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
+            amp_dist[0] += stats->amp_dist[i][0];
+            amp_dist[1] += stats->amp_dist[i][1];
+            amp_dist[2] += stats->amp_dist[i][2];
+        }
+    }
+    fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            // per amplicon
+            fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
+            fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
+                    stats->amp_dist[i][1], stats->amp_dist[i][2]);
+        }
+    }
+
+    free(tpos);
+    return 0;
+}
+
+// Report the per-file statistics: dump_stats with its "local" flag set.
+int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int use_local = 1;
+
+    return dump_stats(args, type, name, nfile, amps, nref, use_local);
+}
+
+// Report the accumulated statistics: dump_stats with its "local" flag clear.
+int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int use_local = 0;
+
+    return dump_stats(args, type, name, nfile, amps, nref, use_local);
+}
+
+// Look up the SM tag of an @RG header line and return it as a string, or
+// NULL if nothing was stored.  With RG non-NULL the line is selected by
+// ID == RG; with RG NULL no ID key is passed to the lookup.
+// The caller in this file releases the returned string with free().
+char const *get_sample_name(sam_hdr_t *header, char *RG) {
+    kstring_t sample = {0};
+    const char *id_key = NULL;
+
+    if (RG)
+        id_key = "ID";
+
+    sam_hdr_find_tag_id(header, "RG", id_key, RG, "SM", &sample);
+    return sample.s;
+}
+
+// With SQ given, return the length of that named reference, or -1 if
+// the header has no reference of that name.
+// With SQ NULL, return the longest reference length in the header
+// (0 when it has no references).
+int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
+    if (!SQ) {
+        int64_t longest = 0;
+        int count = sam_hdr_nref(header), t;
+
+        for (t = 0; t < count; t++) {
+            int64_t cur = sam_hdr_tid2len(header, t);
+            longest = cur > longest ? cur : longest;
+        }
+        return longest;
+    }
+
+    int tid = sam_hdr_name2tid(header, SQ);
+    if (tid < 0)
+        return -1;
+    return sam_hdr_tid2len(header, tid);
+}
+
+/*
+ * Runs the whole analysis.  Prints the "SS" header using the first input
+ * file, builds the per-reference amplicon tables from bed_hash, then reads
+ * each of filev[0..filec-1] in turn, printing its per-file ('F') report,
+ * and finally prints the combined ('C') report.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int amplicon_stats(astats_args_t *args,
+                          khash_t(bed_list_hash) *bed_hash,
+                          char **filev, int filec) {
+    int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
+    samFile *fp = NULL;
+    sam_hdr_t *header = NULL;
+    bam1_t *b = bam_init1();
+    FILE *ofp = args->out_fp;
+    char sname_[8192], *sname = NULL;
+    amplicons_t *amps = NULL;
+
+    // Report initial SS header.  We gather data from the bed_hash entries
+    // as well as from the first SAM header (with the requirement that all
+    // headers should be compatible).
+    if (filec) {
+        if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[0]);
+            goto err;
+        }
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (!amps) {
+            amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
+            if (!amps)
+                goto err;
+            fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
+            fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
+            fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
+            fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
+
+            // Note: order of hash entries will be different to order of
+            // BED file which may also differ to order of SQ headers.
+            // SQ header is canonical ordering (pos sorted file).
+            khiter_t k;
+            int bam_nref = sam_hdr_nref(header);
+            for (i = 0; i < bam_nref; i++) {
+                k = kh_get(bed_list_hash, bed_hash,
+                           sam_hdr_tid2name(header, i));
+                if (!kh_exist(bed_hash, k))
+                    continue;
+
+                bed_entry_list_t *sites = &kh_value(bed_hash, k);
+
+                ref = i;
+                amps[ref].ref = kh_key(bed_hash, k);
+                amps[ref].sites = sites;
+                amps[ref].namp = count_amplicon(sites);
+                amps[ref].amp  = calloc(sites->length,
+                                        sizeof(*amps[ref].amp));
+                if (!amps[ref].amp)
+                    goto err;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
+                            kh_key(bed_hash, k), amps[ref].namp);
+                else
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
+                            amps[ref].namp);
+
+                amps[ref].tid = ref;
+                if (ref_tid == -1)
+                    ref_tid = ref;
+
+                int64_t len = get_ref_len(header, kh_key(bed_hash, k));
+                amps[ref].len = len;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
+                            kh_key(bed_hash, k), len);
+                else
+                    fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
+                            len);
+
+                amps[ref].lstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                amps[ref].gstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                if (!amps[ref].lstats || !amps[ref].gstats)
+                    goto err;
+            }
+        }
+
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+    }
+    fprintf(ofp, "SS\tEnd of summary\n");
+
+    // Extract the bits of amplicon data we need from bed hash and turn
+    // it into a position-to-amplicon lookup table.
+    int offset = 0;
+    for (i = 0; i < nref; i++) {
+        if (!amps[i].sites)
+            continue;
+
+        amps[i].first_amp = offset;
+        // bed2amplicon is handed &amps[i].namp so it can write the
+        // amplicon count for this reference.
+        if (bed2amplicon(args, amps[i].sites, amps[i].amp,
+                         &amps[i].namp, i==0, amps[i].ref, offset) < 0)
+            goto err;
+
+        offset += amps[i].namp; // cumulative amplicon number across refs
+    }
+
+    // Now iterate over file contents, one at a time.
+    for (i = 0; i < filec; i++) {
+        char *nstart = filev[i];
+
+        fp = sam_open_format(filev[i], "r", &args->ga.in);
+        if (!fp) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[i]);
+            goto err;
+        }
+
+        if (args->ga.nthreads > 0)
+            hts_set_threads(fp, args->ga.nthreads);
+
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (nref != sam_hdr_nref(header)) {
+            print_error_errno("ampliconstats",
+                              "SAM headers are not consistent across input files");
+            goto err;
+        }
+        int r;
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].ref ||
+                strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
+                amps[r].len != sam_hdr_tid2len(header, r)) {
+                print_error_errno("ampliconstats",
+                                  "SAM headers are not consistent across "
+                                  "input files");
+                goto err;
+            }
+        }
+
+        if (args->use_sample_name)
+            sname = (char *)get_sample_name(header, NULL);
+
+        if (!sname) {
+            // Fall back to the file name: drop any leading directory and
+            // a trailing .bam / .sam / .cram, truncated to fit sname_.
+            sname = sname_;
+            char *nend = filev[i] + strlen(filev[i]), *cp;
+            if ((cp = strrchr(filev[i], '/')))
+                nstart = cp+1;
+            if ((cp = strrchr(nstart, '.')) &&
+                (strcmp(cp, ".bam") == 0 ||
+                 strcmp(cp, ".sam") == 0 ||
+                 strcmp(cp, ".cram") == 0))
+                nend = cp;
+            if (nend - nstart >= 8192) nend = nstart+8191;
+            memcpy(sname, nstart, nend-nstart);
+            sname[nend-nstart] = 0;
+        }
+
+        // Stats local to this sample only
+        amp_stats_reset(amps, nref);
+
+        int last_ref = -9;
+        while ((r = sam_read1(fp, header, b)) >= 0) {
+            // Other filter options useful here?
+            if (b->core.tid < 0)
+                continue;
+
+            // Rebuild the position lookup whenever the reference changes.
+            if (last_ref != b->core.tid) {
+                last_ref  = b->core.tid;
+                if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
+                    goto err;
+            }
+
+            if (accumulate_stats(args, amps, b) < 0)
+                goto err;
+        }
+
+        // -1 is the normal end of file; anything lower is a read error.
+        if (r < -1) {
+            print_error_errno("ampliconstats", "Fail reading record");
+            goto err;
+        }
+
+        // Clear header before sam_close can jump to err, otherwise the
+        // cleanup code would destroy it a second time.
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+
+        if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
+            goto err;
+
+        if (append_stats(amps, nref) < 0)
+            goto err;
+
+        if (sname && sname != sname_)
+            free(sname);
+        sname = NULL;
+    }
+
+    if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    bam_destroy1(b);
+    if (ret) {
+        if (header)
+            sam_hdr_destroy(header);
+        if (fp)
+            sam_close(fp);
+    }
+    // amps is NULL while nref is already non-zero if its calloc failed.
+    if (amps) {
+        for (i = 0; i < nref; i++) {
+            stats_free(amps[i].lstats);
+            stats_free(amps[i].gstats);
+            free(amps[i].amp);
+        }
+    }
+    free(amps);
+    free(pos2start);
+    free(pos2end);
+    if (ret) {
+        if (sname && sname != sname_)
+            free(sname);
+    }
+
+    return ret;
+}
+
+// Print the option summary for "samtools ampliconstats" to fp, taking the
+// displayed defaults from args and from the MAX_AMP, MAX_AMP_LEN and
+// TCOORD_MIN_COUNT constants.  Returns exit_status unchanged so callers
+// can write "return usage(...)".
+static int usage(astats_args_t *args, FILE *fp, int exit_status) {
+    fputs("\n"
+          "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
+          "\n"
+          "Options:\n", fp);
+
+    // Flag filters.
+    fprintf(fp,
+            " -f, --required-flag STR|INT\n"
+            " Only include reads with all of the FLAGs present [0x%X]\n",
+            args->flag_require);
+    fprintf(fp,
+            " -F, --filter-flag STR|INT\n"
+            " Only include reads with none of the FLAGs present [0x%X]\n",
+            args->flag_filter & 0xffff);
+
+    // Size limits.
+    fprintf(fp,
+            " -a, --max-amplicons INT\n"
+            " Change the maximum number of amplicons permitted [%d]\n",
+            MAX_AMP);
+    fprintf(fp,
+            " -l, --max-amplicon-length INT\n"
+            " Change the maximum length of an individual amplicon [%d]\n",
+            MAX_AMP_LEN);
+
+    // Depth and primer matching.
+    fprintf(fp,
+            " -d, --min-depth INT[,INT]...\n"
+            " Minimum base depth(s) to consider position covered [%d]\n",
+            args->min_depth[0]);
+    fprintf(fp,
+            " -m, --pos-margin INT\n"
+            " Margin of error for matching primer positions [%d]\n",
+            args->max_delta);
+
+    // Options whose help text has no substituted value.
+    fputs(" -o, --output FILE\n"
+          " Specify output file [samtools_stdout if unset]\n", fp);
+    fputs(" -s, --use-sample-name\n"
+          " Use the sample name from the first @RG header line\n", fp);
+    fputs(" -t, --tlen-adjust INT\n"
+          " Add/subtract from TLEN; use when clipping but no fixmate step\n", fp);
+    fputs(" -b, --tcoord-bin INT\n"
+          " Bin template start,end positions into multiples of INT[1]\n", fp);
+
+    fprintf(fp,
+            " -c, --tcoord-min-count INT\n"
+            " Minimum template start,end frequency for recording [%d]\n",
+            TCOORD_MIN_COUNT);
+
+    fputs(" -D, --depth-bin FRACTION\n"
+          " Merge FDP values within +/- FRACTION together\n", fp);
+    fputs(" -S, --single-ref\n"
+          " Force single-ref (<=1.12) output format\n", fp);
+
+    sam_global_opt_help(fp, "I.--.@");
+
+    return exit_status;
+}
+
+/*
+ * Entry point for "samtools ampliconstats".  Parses the command line,
+ * loads the primer BED file given as the first non-option argument and
+ * runs amplicon_stats over the remaining arguments, or over "-" when
+ * none are given.
+ *
+ * Returns the value of amplicon_stats, the usage() exit status when help
+ * is shown, or 1 for the other errors detected here.
+ */
+int main_ampliconstats(int argc, char **argv) {
+    astats_args_t args = {
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .flag_require = 0,
+        .flag_filter = 0x10B04,
+        //.sites = BED_LIST_INIT,
+        .max_delta = 30, // large enough to cope with alt primers
+        .min_depth = {1},
+        .use_sample_name = 0,
+        .max_amp = MAX_AMP,
+        .max_amp_len = MAX_AMP_LEN,
+        .tlen_adj = 0,
+        .out_fp = samtools_stdout,
+        .tcoord_min_count = TCOORD_MIN_COUNT,
+        .tcoord_bin = 1,
+        .depth_bin = 0.01,
+        .multi_ref = 1
+    }, oargs = args;  // oargs keeps the defaults for the usage text
+
+    static const struct option loptions[] =
+    {
+        SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
+        {"help", no_argument, NULL, 'h'},
+        {"flag-require", required_argument, NULL, 'f'},
+        {"flag-filter", required_argument, NULL, 'F'},
+        // The usage text spells these two the other way round; accept
+        // both spellings so the documented options work.
+        {"required-flag", required_argument, NULL, 'f'},
+        {"filter-flag", required_argument, NULL, 'F'},
+        {"min-depth", required_argument, NULL, 'd'},
+        {"output", required_argument, NULL, 'o'},
+        {"pos-margin", required_argument, NULL, 'm'},
+        {"use-sample-name", no_argument, NULL, 's'},
+        {"max-amplicons", required_argument, NULL, 'a'},
+        {"max-amplicon-length", required_argument, NULL, 'l'},
+        {"tlen-adjust", required_argument, NULL, 't'},
+        {"tcoord-min-count", required_argument, NULL, 'c'},
+        {"tcoord-bin", required_argument, NULL, 'b'},
+        {"depth-bin", required_argument, NULL, 'D'},
+        {"single-ref", no_argument, NULL, 'S'},
+        {NULL, 0, NULL, 0}
+    };
+    int opt;
+
+    while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
+        switch (opt) {
+        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'F':
+            // Bit 0x10000 marks the built-in default filter.
+            if (args.flag_filter & 0x10000)
+                args.flag_filter = 0; // strip default on first -F usage
+            args.flag_filter |= bam_str2flag(optarg); break;
+
+        case 'm': args.max_delta = atoi(optarg); break; // margin
+        case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
+        case 'd': {
+            // Comma separated list of up to MAX_DEPTH depths.
+            int d = 0;
+            char *cp = optarg, *ep;
+            do {
+                long n = strtol(cp, &ep, 10);
+                args.min_depth[d++] = n;
+                if (*ep != ',')
+                    break;
+                cp = ep+1;
+            } while (d < MAX_DEPTH);
+            break;
+        }
+
+        case 'a': args.max_amp = atoi(optarg)+1;break;
+        case 'l': args.max_amp_len = atoi(optarg)+1;break;
+
+        case 'c': args.tcoord_min_count = atoi(optarg);break;
+        case 'b':
+            args.tcoord_bin = atoi(optarg);
+            if (args.tcoord_bin < 1)
+                args.tcoord_bin = 1;
+            break;
+
+        case 't': args.tlen_adj = atoi(optarg);break;
+
+        case 's': args.use_sample_name = 1;break;
+
+        case 'o':
+            if (!(args.out_fp = fopen(optarg, "w"))) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        case 'S':
+            args.multi_ref = 0;
+            break;
+
+        case '?': return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+        case 'h': return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+
+        default:
+            // A rejected global option is fatal: stop instead of carrying
+            // on with a partly parsed command line.
+            if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
+                return usage(&oargs,samtools_stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (argc <= optind)
+        return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+    if (argc <= optind+1 && isatty(STDIN_FILENO))
+        return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+        print_error_errno("ampliconstats",
+                          "Could not read file \"%s\"", argv[optind]);
+        return 1;
+
+    }
+
+    // Count the references present in the BED file.
+    khiter_t k, ref_count = 0;
+    for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
+        if (!kh_exist(bed_hash, k))
+            continue;
+        ref_count++;
+    }
+    if (ref_count == 0) {
+        destroy_bed_hash(bed_hash);
+        return 1;
+    }
+    if (ref_count > 1 && args.multi_ref == 0) {
+        print_error("ampliconstats",
+                    "Single-ref mode is not permitted for BED files\n"
+                    "containing more than one reference.");
+        destroy_bed_hash(bed_hash);
+        return 1;
+    }
+
+    args.argv = stringify_argv(argc, argv);
+    int ret;
+    if (argc == ++optind) {
+        // No alignment files named: read from "-".
+        char *av = "-";
+        ret = amplicon_stats(&args, bed_hash, &av, 1);
+    } else {
+        ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
+    }
+
+    free(args.argv);
+    destroy_bed_hash(bed_hash);
+
+    // Close a file opened with -o so buffered output is written out and
+    // write errors are reported rather than lost.
+    if (args.out_fp != samtools_stdout && fclose(args.out_fp) != 0) {
+        print_error_errno("ampliconstats", "Error closing output file");
+        if (ret == 0)
+            ret = 1;
+    }
+
+    return ret;
+}
/* bam.c -- BAM format.
- Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
uint8_t *seq, *qual, *p;
// test if removal is necessary
if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
- cigar = bam1_cigar(b);
+ cigar = bam_get_cigar(b);
for (k = 0; k < b->core.n_cigar; ++k)
if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
if (k == b->core.n_cigar) return 0; // no 'B'
if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
// allocate memory for the new CIGAR
- if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
- b->m_data = b->data_len + b->core.n_cigar * 4;
+ if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+ b->m_data = b->l_data + b->core.n_cigar * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
- cigar = bam1_cigar(b); // after realloc, cigar may be changed
+ cigar = bam_get_cigar(b); // after realloc, cigar may be changed
}
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
// the core loop
- seq = bam1_seq(b); qual = bam1_qual(b);
+ seq = bam_get_seq(b); qual = bam_get_qual(b);
no_qual = (qual[0] == 0xff); // test whether base quality is available
i = j = 0; end_j = -1;
for (k = l = 0; k < b->core.n_cigar; ++k) {
if (i != j) { // no need to copy if i == j
int u, c, c0;
for (u = 0; u < len; ++u) { // construct the consensus
- c = bam1_seqi(seq, i+u);
+ c = bam_seqi(seq, i+u);
if (j + u < end_j) { // in an overlap
- c0 = bam1_seqi(seq, j+u);
+ c0 = bam_seqi(seq, j+u);
if (c != c0) { // a mismatch; choose the better base
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
bam1_seq_seti(seq, j+u, c);
p = b->data + b->core.l_qname + l * 4;
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
memmove(p, qual, j); p += j; // set QUAL
- memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
+ memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
- b->data_len = p - b->data; // update record length
+ b->l_data = p - b->data; // update record length
return 0;
rmB_err:
/* bam.c -- BAM format.
- Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
uint8_t *seq, *qual, *p;
// test if removal is necessary
if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
- cigar = bam1_cigar(b);
+ cigar = bam_get_cigar(b);
for (k = 0; k < b->core.n_cigar; ++k)
if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
if (k == b->core.n_cigar) return 0; // no 'B'
if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
// allocate memory for the new CIGAR
- if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
- b->m_data = b->data_len + b->core.n_cigar * 4;
+ if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+ b->m_data = b->l_data + b->core.n_cigar * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
- cigar = bam1_cigar(b); // after realloc, cigar may be changed
+ cigar = bam_get_cigar(b); // after realloc, cigar may be changed
}
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
// the core loop
- seq = bam1_seq(b); qual = bam1_qual(b);
+ seq = bam_get_seq(b); qual = bam_get_qual(b);
no_qual = (qual[0] == 0xff); // test whether base quality is available
i = j = 0; end_j = -1;
for (k = l = 0; k < b->core.n_cigar; ++k) {
if (i != j) { // no need to copy if i == j
int u, c, c0;
for (u = 0; u < len; ++u) { // construct the consensus
- c = bam1_seqi(seq, i+u);
+ c = bam_seqi(seq, i+u);
if (j + u < end_j) { // in an overlap
- c0 = bam1_seqi(seq, j+u);
+ c0 = bam_seqi(seq, j+u);
if (c != c0) { // a mismatch; choose the better base
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
bam1_seq_seti(seq, j+u, c);
p = b->data + b->core.l_qname + l * 4;
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
memmove(p, qual, j); p += j; // set QUAL
- memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
+ memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
- b->data_len = p - b->data; // update record length
+ b->l_data = p - b->data; // update record length
return 0;
rmB_err:
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.10"
+#define BAM_VERSION "1.13"
#include <stdint.h>
#include <stdlib.h>
#define BAM_OFHEX 1
#define BAM_OFSTR 2
-/*! @abstract defautl mask for pileup */
+/*! @abstract default mask for pileup */
#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
/*! @typedef
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
+ if (qend < qbeg) {
+ fprintf(stderr, "Impossible data in bcf_call_gap_prep\n");
+ exit(1);
+ }
qq = calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
if (bq) ++bq; // skip type
{ // do realignment; this is the bottleneck
const uint8_t *qual = bam_get_qual(p->b), *bq;
uint8_t *qq;
+ if (qend < qbeg) {
+ fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n");
+ samtools_exit(1);
+ }
qq = calloc(qend - qbeg, 1);
bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
if (bq) ++bq; // skip type
/* bam2depth.c -- depth subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
+ Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
- Author: Heng Li <lh3@sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
DEALINGS IN THE SOFTWARE. */
/* This program demonstrates how to generate pileup from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
* To compile this program separately, you may:
*
* gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
#include "samtools.h"
#include "bedidx.h"
#include "sam_opts.h"
+#include "htslib/khash.h"
-#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1)
+// From bam_plcmd.c
+int read_file_list(const char *file_list, int *n, char **argv[]);
-typedef struct { // auxiliary data structure
- samFile *fp; // the file handle
- sam_hdr_t *hdr; // the file header
- hts_itr_t *iter; // NULL if a region not specified
- int min_mapQ, min_len; // mapQ filter; length filter
- uint32_t flags; // read filtering flags
-} aux_t;
+// Depth accumulator shared by all input files.
+//
+// We accumulate to hist[pos & (size-1)].  This is a ring-buffer.
+// We track where we last got to in output and what the biggest value
+// we've written to so far (in absolute unmasked coordinates) in
+// "last_output" and "end_pos" respectively.
+// For each new record we just flush anything we haven't written yet
+// already, between "last_output" and this read's start position, and
+// initialise any newly seen positions between "end_pos" and this read's
+// end position.
+typedef struct {
+    size_t size;            // slots per ring buffer; starts at 0 and is grown
+                            // by add_depth() (2048, then doubling), so
+                            // size-1 is usable as an index mask
+    int **hist;             // hist[nfiles][size]: per-file depth counters
+    hts_pos_t *end_pos;     // end_pos[nfiles]: per file, 0-based coordinate
+                            // one past the last position holding valid data
+    hts_pos_t last_output;  // 0-based coordinate of the next row to print
+    int last_ref;           // tid currently being accumulated; negative
+                            // until the first read has been added
+    int nfiles;             // number of input files (columns of depth)
+    const char *ref;        // name of last_ref, from sam_hdr_tid2name()
+    kstring_t ks;           // output line buffer; holds the "name\t" prefix
+                            // that each printed row is appended to
+    hts_pos_t beg, end;     // limit to region; -1 when no region is set
+    int tid;                // tid of the requested region (0 if none)
+} depth_hist;
-// This function reads a BAM alignment from one BAM file.
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
-{
- aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
- int ret;
- while (1)
- {
- ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
- if ( ret<0 ) break;
- if ( b->core.flag & aux->flags) continue;
- if ( (int)b->core.qual < aux->min_mapQ ) continue;
- if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
- break;
+// Command line options for "samtools depth", filled in by main_depth().
+typedef struct {
+    int header;           // -H: print a "#CHROM\tPOS\t<file>..." header line
+    int flag;             // reads with any of these FLAG bits set are skipped
+                          // (-G adds bits, -g removes bits)
+    int min_qual;         // -q: minimum base quality for a base to count
+    int min_mqual;        // -Q: minimum mapping quality for a read to count
+    int min_len;          // -l: minimum qlen_used() for a read to count
+    int skip_del;         // 1 (default): deletions do not add depth;
+                          // 0 (-J): deletions are counted
+    int all_pos;          // number of -a flags given (0, 1 or 2)
+    int remove_overlaps;  // -s: clip the second read of an overlapping pair
+    FILE *out;            // -o: output stream, stdout by default
+    char *reg;            // -r: region string, NULL if none
+    void *bed;            // -b: handle returned by bed_read(), NULL if none
+} depth_opt;
+
+// Prints one all-zero depth row per position of reference "name" in the
+// half-open, 0-based range [start, end).  Positions are printed 1-based.
+//
+// The range is first clipped to dh->beg / dh->end when those are set
+// (>= 0), and positions not covered by opt->bed (if any) are skipped.
+//
+// Side effect: dh->ks is overwritten with "name\t" and its length is left
+// pointing just past that prefix on return.
+static void zero_region(depth_opt *opt, depth_hist *dh,
+                        const char *name, hts_pos_t start, hts_pos_t end) {
+    hts_pos_t i;
+    kstring_t *ks = &dh->ks;
+
+    // Build the constant "name\t" prefix once and remember where it ends.
+    kputs(name, ks_clear(ks));
+    kputc('\t', ks);
+    size_t cur_l = ks->l;
+    if (dh->beg >= 0 && start < dh->beg)
+        start = dh->beg;
+    if (dh->end >= 0 && end > dh->end)
+        end = dh->end;
+
+    for (i = start; i < end; i++) {
+        // Could be optimised, but needs better API to skip to next
+        // bed region.
+        if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0)
+            continue;
+
+        // Rewind to the end of the prefix, then append "pos" and one
+        // "\t0" column per input file.
+        ks->l = cur_l;
+        kputll(i+1, ks);
+        int n;
+        for (n = 0; n < dh->nfiles; n++) {
+            kputc_('\t', ks);
+            kputc_('0', ks);
+        }
+        kputc('\n', ks);
+        fputs(ks->s, opt->out);
     }
-    return ret;
+    ks->l = cur_l;
 }
-int read_file_list(const char *file_list,int *n,char **argv[]);
-
-static int usage() {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(stderr, "Options:\n");
- fprintf(stderr, " -a output all positions (including zero depth)\n");
- fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
- fprintf(stderr, " -b <bed> list of positions or regions\n");
- fprintf(stderr, " -X use customized index files\n");
- fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -H print a file header\n");
- fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
- fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]. If 0, depth is set to the maximum\n"
- " integer value, effectively removing any depth limit.\n"); // the htslib's default
- fprintf(stderr, " -o FILE where to write output to [stdout]\n");
- fprintf(stderr, " -q <int> base quality threshold [0]\n");
- fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
- fprintf(stderr, " -r <chr:from-to> region\n");
- fprintf(stderr, " -g <flags> include reads that have any of the specified flags set [0]\n");
- fprintf(stderr, " -G <flags> filter out reads that have any of the specified flags set"
- " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
-
- sam_global_opt_help(stderr, "-.--.--.");
-
- fprintf(stderr, "\n");
- fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
- fprintf(stderr, "position, and coverage depth. Note that positions with zero coverage may be\n");
- fprintf(stderr, "omitted by default; see the -a option.\n");
- fprintf(stderr, "\n");
-
- return EXIT_FAILURE;
+// Number of query bases taking part in the alignment: like
+// bam_cigar2qlen(), but soft-clipped bases are not counted.  Insertions
+// are counted, as they are anchored by aligned bases on either side.
+hts_pos_t qlen_used(bam1_t *b) {
+    const uint32_t *ops = bam_get_cigar(b);
+    int nops = b->core.n_cigar;
+    hts_pos_t used;
+
+    if (!b->core.l_qseq) {
+        // Unknown SEQ ("*"): sum every query-consuming op except
+        // soft-clips, which needs a pass over the whole CIGAR.
+        static int consumes_query[16] = {
+          //M I D N  S H P =  X B ? ?  ? ? ? ?
+            1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0
+        };
+        int idx;
+
+        used = 0;
+        for (idx = 0; idx < nops; idx++) {
+            if (consumes_query[bam_cigar_op(ops[idx])])
+                used += bam_cigar_oplen(ops[idx]);
+        }
+        return used;
+    }
+
+    // Known SEQ: start from l_qseq and subtract the soft-clips found at
+    // each end.  This avoids scanning excessively long CIGAR strings.
+    used = b->core.l_qseq;
+
+    int left = 0;
+    while (left < nops && bam_cigar_op(ops[left]) == BAM_CSOFT_CLIP) {
+        used -= bam_cigar_oplen(ops[left]);
+        left++;
+    }
+
+    // Stop before reaching the first non-clip op found from the left.
+    int right = nops - 1;
+    while (right > left && bam_cigar_op(ops[right]) == BAM_CSOFT_CLIP) {
+        used -= bam_cigar_oplen(ops[right]);
+        right--;
+    }
+
+    return used;
 }
-int main_depth(int argc, char *argv[])
-{
- int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0;
- hts_pos_t beg, end, pos, last_pos = -1;
- int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
- const bam_pileup1_t **plp;
- char *reg = 0; // specified region
- void *bed = 0; // BED data structure
- char *file_list = NULL, **fn = NULL;
- sam_hdr_t *h = NULL; // BAM header of the 1st input
- aux_t **data;
- bam_mplp_t mplp;
- int last_tid = -1, ret;
- int print_header = 0;
- char *output_file = NULL;
- FILE *file_out = stdout;
- uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
- int tflags = 0;
+// Adds the depth for a single read to a depth_hist struct.
+// For just one file, this is easy.  We just have a circular buffer
+// where we increment values for bits that overlap existing data
+// and initialise values for coordinates which we're seeing for the first
+// time.  This is tracked by "end_pos" to know where we've got to.
+//
+// As the input is sorted, we can flush output from "last_output" to
+// b->core.pos.
+//
+// With multiple files, we must feed data in sorted order as if all files
+// are merged, but track depth per file.  This also means "end_pos" is per
+// file too, but "last_output" is global as it corresponds to rows printed.
+//
+// Parameters:
+//   opt          - command line options (filters, output stream, bed).
+//   dh           - the accumulator to update.
+//   h            - header used to look up reference names and lengths.
+//   b            - the next record in sorted order, or NULL to flush
+//                  everything still pending at end of input.
+//   overlap_clip - when non-zero, reference positions before this
+//                  coordinate are not counted for this read (used for
+//                  mate overlap removal).
+//   file         - index of the input file that "b" came from.
+//
+// Returns 0 on success, -1 on failure (unsorted input, out of memory or
+// an unsupported CIGAR operation).
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
+                     int overlap_clip, int file) {
+    hts_pos_t i;
+    size_t hmask = dh->size-1;
+    int n;
-    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
-        { NULL, 0, NULL, 0 }
-    };
+    if (!b || b->core.tid != dh->last_ref) {
+        // New ref
+        if (dh->last_ref >= 0) {
+            // do end
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            i = dh->last_output;
+            // Keep printing while at least one file still has data at i.
+            for (i = dh->last_output; nf; i++) {
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
+                }
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
-    // parse the command line
-    while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) {
-        switch (n) {
-            case 'l': min_len = atoi(optarg); break; // minimum query length
-            case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
-            case 'b':
-                bed = bed_read(optarg); // BED or position list file can be parsed now
-                if (!bed) {
-                    print_error_errno("depth", "Could not read file \"%s\"", optarg);
-                    return EXIT_FAILURE;
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                 }
-                break;
-            case 'X': has_index_file = 1; break;
-            case 'q': baseQ = atoi(optarg); break; // base quality threshold
-            case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
-            case 'f': file_list = optarg; break;
-            case 'a': all++; break;
-            case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
-            case 'H': print_header = 1; break;
-            case 'o': output_file = optarg; break;
-            case 'g':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos) {
+                // End of last ref
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, dh->last_ref),
+                            i, sam_hdr_tid2len(h, dh->last_ref));
+            }
+            dh->ks.l = cur_l;
+        }
+
+        if (opt->all_pos > 1 && !opt->reg) {
+            // Any previous unused refs
+            int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1;
+            int rr = b ? b->core.tid : sam_hdr_nref(h), r;
+            for (r = lr; r < rr; r++)
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, r),
+                            0, sam_hdr_tid2len(h, r));
+        }
+
+        if (!b) {
+            // we're just flushing to end of file
+            if (opt->all_pos && opt->reg && dh->last_ref < 0)
+                // -a or -aa without a single read being output yet
+                zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg,
+                            MIN(dh->end, sam_hdr_tid2len(h, dh->tid)));
+
+            return 0;
+        }
+
+        // Reset the per-file state for the new reference.
+        for (n = 0; dh->end_pos && n < dh->nfiles; n++)
+            dh->end_pos[n] = 0;
+        dh->last_output = dh->beg >= 0
+            ? MAX(b->core.pos, dh->beg)
+            : b->core.pos;
+        dh->last_ref = b->core.tid;
+        dh->ref = sam_hdr_tid2name(h, b->core.tid);
+        kputs(dh->ref, ks_clear(&dh->ks));
+        kputc('\t', &dh->ks);
+
+        if (opt->all_pos)
+            // Start of ref
+            zero_region(opt, dh, dh->ref, 0, b->core.pos);
+    } else {
+        if (dh->last_output < b->core.pos) {
+            // Flush any depth outputs up to start of new read
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            for (i = dh->last_output; i < b->core.pos; i++) {
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
                 }
-                flags &= ~tflags;
-                break;
-            case 'G':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
+
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                 }
-                flags |= tflags;
-                break;
-            default:  if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
-                      /* else fall-through */
-            case '?': return usage();
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos && i < b->core.pos)
+                // Hole in middle of ref
+                zero_region(opt, dh, dh->ref, i, b->core.pos);
+
+            dh->ks.l = cur_l;
+            dh->last_output = b->core.pos;
         }
     }
-    if (optind == argc && !file_list)
-        return usage();
-
-    /* output file provided by user */
-    if (output_file != NULL && strcmp(output_file,"-")!=0) {
-        file_out = fopen( output_file, "w" );
-        if (file_out == NULL) {
-            print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file);
-            return EXIT_FAILURE;
-        }
+
+    hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end.
+    //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos);
+
+    if (b->core.tid < dh->last_ref ||
+        (dh->last_ref == b->core.tid && end_pos < dh->last_output)) {
+        // Not an errno failure, so report it without a strerror suffix.
+        print_error("depth", "Data is not position sorted");
+        return -1;
     }
+    // If needed, grow the circular buffer.
+    if (end_pos+1 - b->core.pos >= dh->size) {
+        size_t old_size = dh->size;
+        size_t old_hmask = hmask;
+        while (end_pos+1 - b->core.pos >= dh->size)
+            dh->size = dh->size ? 2*dh->size : 2048;
+        hmask = dh->size-1;
+        if (!dh->hist) {
+            dh->hist = calloc(dh->nfiles, sizeof(*dh->hist));
+            dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos));
+            if (!dh->hist || !dh->end_pos)
+                return -1;
+        }
+        for (n = 0; n < dh->nfiles; n++) {
+            int *hist = calloc(dh->size, sizeof(*dh->hist[n]));
+            if (!hist)
+                return -1;
-    // initialize the auxiliary data structures
-    if (file_list)
-    {
-        if (has_index_file) {
-            print_error("depth", "The -f option cannot be combined with -X");
-            return 1;
+            // Simple approach for now; copy over old histogram verbatim.
+            for (i = dh->last_output; i < dh->last_output + old_size; i++)
+                hist[i & hmask] = dh->hist[n][i & old_hmask];
+            free(dh->hist[n]);
+            dh->hist[n] = hist;
         }
-        if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE;
-        n = nfiles;
-        argv = fn;
-        optind = 0;
     }
-    else if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind) % 2 != 0) {
-            fprintf(stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
-        }
-        n = (argc - optind) / 2;
+
+    // Accumulate depth, based on CIGAR
+    uint32_t *cig = bam_get_cigar(b);
+    int ncig = b->core.n_cigar, j, k, spos = 0;
+
+    // Zero new (previously unseen) coordinates so increment works later.
+    hts_pos_t end = MAX(dh->end_pos[file], b->core.pos);
+    if (end_pos > end && (end & hmask) < (end_pos & hmask)) {
+        memset(&dh->hist[file][end & hmask], 0,
+               sizeof(**dh->hist) * (end_pos - end));
     } else {
-        n = argc - optind;
+        for (i = end; i < end_pos; i++)
+            dh->hist[file][i & hmask] = 0;
     }
-    data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
-    reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region
-
-    for (i = 0; i < n; ++i) {
-        int rf;
-        data[i] = calloc(1, sizeof(aux_t));
-        data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
-        if (data[i]->fp == NULL) {
-            print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
-        if (baseQ) rf |= SAM_QUAL;
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        data[i]->min_mapQ = mapQ; // set the mapQ filter
-        data[i]->min_len = min_len; // set the qlen filter
-        data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
-        if (data[i]->hdr == NULL) {
-            print_error_errno("depth", "Couldn't read header for \"%s\"",
-                              argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (reg) { // if a region is specified
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
-            if (has_index_file) {
-                idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index
+
+    i = b->core.pos;
+    uint8_t *qual = bam_get_qual(b);
+    int min_qual = opt->min_qual;
+    // i walks the reference, spos walks the query (index into qual).
+    for (j = 0; j < ncig; j++) {
+        int op = bam_cigar_op(cig[j]);
+        int oplen = bam_cigar_oplen(cig[j]);
+
+        switch (op) {
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            if (op != BAM_CDEL || opt->skip_del) {
+                // don't increment reference location
+                if (i + oplen >= dh->end_pos[file]) {
+                    for (k = 0; k < oplen; k++, i++) {
+                        if (i >= dh->end_pos[file])
+                            // redundant due to zero new elements above?
+                            dh->hist[file][i & hmask] = 0;
+                    }
+                } else {
+                    i += oplen;
+                }
+            } else { // op == BAM_CDEL and we count them (-J option),
+                // We don't incr spos here, but we still use qual.
+                // This doesn't make much sense, but it's for compatibility
+                // with the old code.  Arguably DEL shouldn't have a min
+                // qual and should always pass (as we've explicitly asked to
+                // include them).
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        k = overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+
+                // Question: should we even check quality values for DEL?
+                // We've explicitly asked to include them, and the quality
+                // is wrong anyway (it's the neighbouring base).  We do this
+                // for now for compatibility with the old depth command.
+
+                if (spos < b->core.l_qseq)
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]+=qual[spos]>=min_qual;
+                else
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]++;
+            }
+            break;
+
+        case BAM_CMATCH:
+        case BAM_CEQUAL:
+        case BAM_CDIFF:
+            if ((i & hmask) < ((i+oplen) & hmask)) {
+                // Optimisation when not wrapping around
+
+                // Unrolling doesn't help clang, but helps gcc,
+                // especially when not using -O3.
+                int *hist = &dh->hist[file][i & hmask];
+                if (min_qual || overlap_clip) {
+                    k = 0;
+                    if (overlap_clip) {
+                        if (i+oplen < overlap_clip) {
+                            i += oplen;
+                            spos += oplen;
+                            break;
+                        } else if (i < overlap_clip) {
+                            oplen -= overlap_clip - i;
+                            spos += overlap_clip - i;
+                            hist += overlap_clip - i;
+                            i = overlap_clip;
+                        }
+                    }
+
+                    // approx 50% of this func cpu time in this loop
+                    for (; k < (oplen & ~7); k+=8) {
+                        hist[k+0]+=qual[spos+0]>=min_qual;
+                        hist[k+1]+=qual[spos+1]>=min_qual;
+                        hist[k+2]+=qual[spos+2]>=min_qual;
+                        hist[k+3]+=qual[spos+3]>=min_qual;
+                        hist[k+4]+=qual[spos+4]>=min_qual;
+                        hist[k+5]+=qual[spos+5]>=min_qual;
+                        hist[k+6]+=qual[spos+6]>=min_qual;
+                        hist[k+7]+=qual[spos+7]>=min_qual;
+                        spos += 8;
+                    }
+                } else {
+                    // easier to vectorize when no min_qual
+                    for (k = 0; k < (oplen & ~7); k+=8) {
+                        hist[k+0]++;
+                        hist[k+1]++;
+                        hist[k+2]++;
+                        hist[k+3]++;
+                        hist[k+4]++;
+                        hist[k+5]++;
+                        hist[k+6]++;
+                        hist[k+7]++;
+                    }
+                    spos += k;
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, spos++)
+                    hist[k]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, spos++)
+                    hist[k]++;
+                i += oplen;
             } else {
-                idx = sam_index_load(data[i]->fp, argv[optind+i]);
+                // Simple to understand case, but slower.
+                // We use this only for reads with wrap-around.
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        // The whole op is clipped, but it still consumes
+                        // query bases: keep spos in step with i, as the
+                        // non-wrapping path above does.
+                        i += oplen;
+                        spos += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        oplen -= overlap_clip - i;
+                        spos += overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
+                    hist[i & hmask]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, i++, spos++)
+                    hist[i & hmask]++;
             }
-            if (idx == NULL) {
-                print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
-                status = EXIT_FAILURE;
-                goto depth_end;
+            break;
+
+        case BAM_CINS:
+        case BAM_CSOFT_CLIP:
+            spos += oplen;
+            break;
+
+        case BAM_CPAD:
+        case BAM_CHARD_CLIP:
+            // ignore
+            break;
+
+        default:
+            print_error("depth", "Unsupported cigar op '%d'", op);
+            return -1;
+        }
+    }
+
+    // Record how far this file's data now extends, limited to the region.
+    if (dh->end >= 0 && end_pos > dh->end)
+        end_pos = dh->end;
+    if (dh->end_pos[file] < end_pos)
+        dh->end_pos[file] = end_pos;
+
+    return 0;
+}
+
+// Hash on name -> alignment end pos.  This permits a naive overlap removal.
+// Note it cannot analyse the overlapping sequence and qualities, so the
+// interaction of basecalls/qualities and the -Q parameter cannot be
+// applied here (unlike the full mpileup algorithm).
+//
+// Keys are heap copies of the read name made by fastdepth_core(), which
+// also frees them; values are the bam_endpos() of the first read seen.
+KHASH_MAP_INIT_STR(olap_hash, hts_pos_t)
+typedef khash_t(olap_hash) olap_hash_t;
+
+// Merges the records of all input files in (tid, pos) order and feeds
+// them one at a time to add_depth(), which prints the depth table.
+//
+// Parameters:
+//   opt    - command line options (filters, output stream, ...).
+//   nfiles - number of input files; length of fn, fp, itr and h.
+//   fn     - file names, used only for the optional header line.
+//   fp     - open input files.
+//   itr    - per-file region iterators.  May be NULL, and individual
+//            entries may be NULL, in which case the file is read
+//            sequentially with sam_read1().
+//   h      - per-file headers.
+//
+// Returns 0 on success and a negative value on failure.  All memory
+// allocated here is released on every exit path.
+static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn,
+                          samFile **fp, hts_itr_t **itr, sam_hdr_t **h) {
+    int ret = -1, err = 1, i;
+    olap_hash_t **overlaps = NULL;
+    depth_hist dh = {0};
+
+    // An array of bam structs, one per input file, to hold the next entry
+    bam1_t **b = calloc(nfiles, sizeof(*b));
+    int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles;
+    if (!b || !finished)
+        goto err;
+
+    for (i = 0; i < nfiles; i++)
+        if (!(b[i] = bam_init1()))
+            goto err;
+
+    // Do we need one overlap hash per file? Or shared?
+    if (opt->remove_overlaps) {
+        // Fail via "err" so that b[], finished and any hashes already
+        // created are released.
+        if (!(overlaps = calloc(nfiles, sizeof(*overlaps))))
+            goto err;
+        for (i = 0; i < nfiles; i++) {
+            if (!(overlaps[i] = kh_init(olap_hash)))
+                goto err;
+        }
+    }
+
+    // Create the initial histogram
+    dh.nfiles = nfiles;
+    dh.size = 0;
+    dh.hist = NULL;
+    dh.last_ref = -99;
+    dh.end_pos = NULL;
+    dh.last_output = itr && itr[0] ? itr[0]->beg : 0;
+    ks_initialize(&dh.ks);
+
+    // Clip results to region if specified
+    dh.beg = -1;
+    dh.end = -1;
+    dh.tid = 0;
+    if (itr && itr[0]) {
+        dh.tid = itr[0]->tid;
+        dh.beg = itr[0]->beg;
+        dh.end = itr[0]->end;
+    }
+
+    if (opt->header) {
+        fprintf(opt->out, "#CHROM\tPOS");
+        for (i = 0; i < nfiles; i++)
+            fprintf(opt->out, "\t%s", fn[i]);
+        fputc('\n', opt->out);
+    }
+
+    // Populate first record per file
+    for (i = 0; i < nfiles; i++) {
+        for(;;) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1)
+                goto err;
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
             }
-            data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
-            hts_idx_destroy(idx); // the index is not needed any more; free the memory
-            if (data[i]->iter == NULL) {
-                print_error("depth", "can't parse region \"%s\"", reg);
-                status = EXIT_FAILURE;
-                goto depth_end;
+
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            // Original samtools depth used the total sequence (l_qseq)
+            // including soft-clips.  This doesn't feel like a useful metric
+            // to be filtering on.  We now only count sequence bases that
+            // form the used part of the alignment.
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
             }
+
+            break;
         }
-        data[i]->flags = flags;
     }
-    if (print_header) {
-        fputs("#CHROM\tPOS", file_out);
-        for (i = 0; i < n; ++i) {
-            fputc('\t', file_out);
-            fputs(argv[optind+i], file_out);
+
+    // Loop through input files, merging in order so we're
+    // always adding the next record in sequence
+    while (to_go) {
+        // Find next record in file list
+        int best_tid = INT_MAX, best_file = 0;
+        hts_pos_t best_pos = HTS_POS_MAX;
+
+        for (i = 0; i < nfiles; i++) {
+            if (finished[i])
+                continue;
+            if (best_tid > b[i]->core.tid) {
+                best_tid = b[i]->core.tid;
+                best_pos = b[i]->core.pos;
+                best_file = i;
+            } else if (best_tid == b[i]->core.tid &&
+                       best_pos > b[i]->core.pos) {
+                best_pos = b[i]->core.pos;
+                best_file = i;
             }
-        fputc('\n', file_out);
         }
-    h = data[0]->hdr; // easy access to the header of the 1st BAM
-    if (reg) {
-        beg = data[0]->iter->beg; // and to the parsed region coordinates
-        end = data[0]->iter->end;
-        reg_tid = data[0]->iter->tid;
-    }
+        i = best_file;
-    // the core multi-pileup loop
-    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
-    if (0 < max_depth)
-        bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth
-    else if (!max_depth)
-        bam_mplp_set_maxcnt(mplp,INT_MAX);
-    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
-    plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
-    while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
-        if (pos < beg || pos >= end) continue; // out of range; skip
-        if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file?
-        if (all) {
-            while (tid > last_tid) {
-                if (last_tid >= 0 && !reg) {
-                    // Deal with remainder or entirety of last tid.
-                    while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                        // Horribly inefficient, but the bed API is an obfuscated black box.
-                        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                            continue;
-                        fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                        fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                        for (i = 0; i < n; i++)
-                            fputc('\t', file_out), fputc('0', file_out);
-                        fputc('\n', file_out);
-                    }
+        hts_pos_t clip = 0;
+        if (overlaps && (b[i]->core.flag & BAM_FPAIRED) &&
+            !(b[i]->core.flag & BAM_FMUNMAP)) {
+            khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i]));
+            if (k == kh_end(overlaps[i])) {
+                // not seen before
+                hts_pos_t endpos = bam_endpos(b[i]);
+
+                // Don't add if mate location is known and can't overlap.
+                if (b[i]->core.mpos == -1 ||
+                    (b[i]->core.tid == b[i]->core.mtid &&
+                     b[i]->core.mpos <= endpos)) {
+                    // Copy the name before inserting, so the table never
+                    // holds a key pointing into b[i] or a NULL key.
+                    char *name = strdup(bam_get_qname(b[i]));
+                    if (!name) {
+                        ret = -1;
+                        goto err;
+                    }
+                    k = kh_put(olap_hash, overlaps[i], name, &ret);
+                    if (ret < 0) {
+                        free(name);
+                        ret = -1;
+                        goto err;
+                    }
+                    kh_value(overlaps[i], k) = endpos;
                 }
-                last_tid++;
-                last_pos = -1;
-                if (all < 2)
-                    break;
+            } else {
+                // seen before
+                clip = kh_value(overlaps[i], k);
+                free((char *)kh_key(overlaps[i], k));
+                kh_del(olap_hash, overlaps[i], k);
             }
+        }
-            // Deal with missing portion of current tid
-            while (++last_pos < pos) {
-                if (last_pos < beg) continue; // out of range; skip
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        // Add the next merged BAM record to the depth plot
+        if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) {
+            ret = -1;
+            goto err;
+        }
+
+        // Populate next record from this file
+        for(;!finished[i];) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1) {
+                ret = -1;
+                goto err;
+            }
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
             }
-            last_tid = tid;
-            last_pos = pos;
-        }
-        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue;
-        fputs(sam_hdr_tid2name(h, tid), file_out);
-        fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster
-        for (i = 0; i < n; ++i) { // base level filters have to go here
-            int j, m = 0;
-            for (j = 0; j < n_plp[i]; ++j) {
-                const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
-                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                         bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
             }
-            fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            break;
         }
-        fputc('\n', file_out);
     }
-    if (ret < 0) status = EXIT_FAILURE;
-    free(n_plp); free(plp);
-    bam_mplp_destroy(mplp);
-
-    if (all) {
-        // Handle terminating region
-        if (last_tid < 0 && reg) {
-            last_tid = reg_tid;
-            last_pos = beg-1;
+
+    // Tidy up end.
+    ret = add_depth(opt, &dh, h[0], NULL, 0, 0);
+    err = 0;
+
+ err:
+    if (ret == 0 && err)
+        ret = -1;
+
+    for (i = 0; i < nfiles; i++) {
+        // b itself is NULL if its allocation was what failed.
+        if (b && b[i])
+            bam_destroy1(b[i]);
+        if (dh.hist && dh.hist[i])
+            free(dh.hist[i]);
+    }
+    free(b);
+    free(finished);
+    ks_free(&dh.ks);
+    free(dh.hist);
+    free(dh.end_pos);
+    if (overlaps) {
+        khiter_t k;
+        for (i = 0; i < nfiles; i++) {
+            if (!overlaps[i])
+                continue;
+            for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++)
+                if (kh_exist(overlaps[i], k))
+                    free((char *)kh_key(overlaps[i], k));
+            kh_destroy(olap_hash, overlaps[i]);
         }
-        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
-            while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                if (last_pos >= end) break;
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        free(overlaps);
+    }
+
+    return ret;
+}
+
+// Prints the "samtools depth" usage text to fp and terminates the
+// process with exit_status.  This function does not return.
+static void usage_exit(FILE *fp, int exit_status)
+{
+    fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n");
+    fprintf(fp, "\nOptions:\n");
+    fprintf(fp, " -a Output all positions (including zero depth)\n");
+    fprintf(fp, " -a -a, -aa Output absolutely all positions, including unused ref seqs\n");
+    fprintf(fp, " -r REG Specify a region in chr or chr:from-to syntax\n");
+    fprintf(fp, " -b FILE Use bed FILE for list of regions\n");
+    fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n");
+    fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n");
+    fprintf(fp, " -g INT Remove specified flags from default flag filter\n");
+    fprintf(fp, " -G INT Add specified flags to the default flag filter\n");
+    fprintf(fp, " -H Print a file header line\n");
+    fprintf(fp, " -l INT Minimum read length [0]\n");
+    fprintf(fp, " -o FILE Write output to FILE [stdout]\n");
+    fprintf(fp, " -q INT Minimum base quality [0]\n");
+    fprintf(fp, " -Q INT Minimum mapping quality [0]\n");
+    // -H is already listed above; it used to be printed a second time here.
+    fprintf(fp, " -J Include reads with deletions in depth computation\n");
+    fprintf(fp, " -s Do not count overlapping reads within a template\n");
+    sam_global_opt_help(fp, "-.---@-.");
+    exit(exit_status);
+}
+
+/*
+ * Entry point for the "samtools depth" sub-command.
+ *
+ * Parses the command line into a depth_opt, opens every input file (from
+ * argv or from the -f file list), reads each header, optionally builds one
+ * region iterator per file (-r), and hands everything to fastdepth_core().
+ *
+ * Returns 0 on success and non-zero on failure.  Usage errors terminate
+ * the process through usage_exit().
+ */
+int main_depth(int argc, char *argv[])
+{
+    int nfiles, i;
+    samFile **fp;
+    sam_hdr_t **header;
+    int c, has_index_file = 0;
+    char *file_list = NULL, **fn = NULL;
+    depth_opt opt = {
+        .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+        .min_qual = 0,
+        .min_mqual = 0,
+        .skip_del = 1,
+        .header = 0,
+        .min_len = 0,
+        .out = stdout,
+        .all_pos = 0,
+        .remove_overlaps = 0,
+        .reg = NULL,
+        .bed = NULL,
+    };
+
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s",
+                            lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'a':
+            opt.all_pos++;
+            break;
+
+        case 'b':
+            opt.bed = bed_read(optarg);
+            if (!opt.bed) {
+                print_error_errno("depth", "Could not read file \"%s\"",
+                                  optarg);
+                return 1;
             }
-            last_tid++;
-            last_pos = -1;
-            if (all < 2 || reg)
+            break;
+
+        case 'f':
+            file_list = optarg;
+            break;
+
+        case 'd':
+        case 'm':
+            // depth limit - now ignored
+            break;
+
+        case 'g':
+            opt.flag &= ~bam_str2flag(optarg);
+            break;
+        case 'G':
+            opt.flag |= bam_str2flag(optarg);
+            break;
+
+        case 'l':
+            opt.min_len = atoi(optarg);
+            break;
+
+        case 'H':
+            opt.header = 1;
+            break;
+
+        case 'q':
+            opt.min_qual = atoi(optarg);
+            break;
+        case 'Q':
+            opt.min_mqual = atoi(optarg);
+            break;
+
+        case 'J':
+            opt.skip_del = 0;
+            break;
+
+        case 'o':
+            // Only the first -o takes effect; later ones are ignored.
+            if (opt.out != stdout)
                 break;
+            opt.out = fopen(optarg, "w");
+            if (!opt.out) {
+                print_error_errno("depth", "Cannot open \"%s\" for writing.",
+                                  optarg);
+                return EXIT_FAILURE;
+            }
+            break;
+
+        case 'r':
+            opt.reg = optarg;
+            break;
+
+        case 's':
+            opt.remove_overlaps = 1;
+            break;
+
+        case 'X':
+            has_index_file = 1;
+            break;
+
+        default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?':
+            usage_exit(stderr, EXIT_FAILURE);
         }
     }
-depth_end:
-    if (((file_out != stdout)? fclose(file_out) : fflush(file_out)) != 0) {
-        if (status == EXIT_SUCCESS) {
-            if (file_out != stdout)
-                print_error_errno("depth", "error on closing \"%s\"", output_file);
-            else
-                print_error_errno("depth", "error on flushing standard output");
-            status = EXIT_FAILURE;
+    if (argc < optind+1 && !file_list) {
+        if (argc == optind)
+            usage_exit(stdout, EXIT_SUCCESS);
+        else
+            usage_exit(stderr, EXIT_FAILURE);
+    }
+
+    if (file_list) {
+        if (has_index_file) {
+            print_error("depth", "The -f option cannot be combined with -X");
+            return 1;
+        }
+        if (read_file_list(file_list, &nfiles, &fn))
+            return 1;
+        // From here on the file list stands in for the command line.
+        argv = fn;
+        argc = nfiles;
+        optind = 0;
+    } else {
+        nfiles = argc - optind;
+    }
+
+    if (has_index_file) {
+        // With -X the arguments are N alignment files followed by N index
+        // files, so the total must be even.  (This used to test nfiles%1,
+        // which is always zero.)
+        if (nfiles%2) {
+            print_error("depth", "-X needs one index specified per bam file");
+            return 1;
         }
+        nfiles /= 2;
+    }
+    fp = malloc(nfiles * sizeof(*fp));
+    header = malloc(nfiles * sizeof(*header));
+    if (!fp || !header) {
+        print_error_errno("depth", "Out of memory");
+        return 1;
     }
-    for (i = 0; i < n && data[i]; ++i) {
-        sam_hdr_destroy(data[i]->hdr);
-        if (data[i]->fp) sam_close(data[i]->fp);
-        hts_itr_destroy(data[i]->iter);
-        free(data[i]);
+    hts_itr_t **itr = NULL;
+    if (opt.reg) {
+        itr = calloc(nfiles, sizeof(*itr));
+        if (!itr) {
+            print_error_errno("depth", "Out of memory");
+            return 1;
+        }
     }
-    free(data); free(reg);
-    if (bed) bed_destroy(bed);
-    if ( file_list )
-    {
-        for (i=0; i<n; i++) free(fn[i]);
+
+    for (i = 0; i < nfiles; i++, optind++) {
+        fp[i] = sam_open_format(argv[optind], "r", &ga.in);
+        if (fp[i] == NULL) {
+            print_error_errno("depth",
+                              "Cannot open input file \"%s\"", argv[optind]);
+            return 1;
+        }
+
+        if (ga.nthreads > 0)
+            hts_set_threads(fp[i], ga.nthreads);
+
+        // Only ask CRAM to decode the fields the chosen options need.
+        if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS,
+                        SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR
+                        | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT
+                                               : 0)
+                        | (opt.min_mqual ? SAM_MAPQ : 0)
+                        | (opt.min_len ? SAM_SEQ : 0)
+                        | (opt.min_qual ? SAM_QUAL : 0))) {
+            fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+            return 1;
+        }
+
+        if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) {
+            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+            return 1;
+        }
+
+        // FIXME: what if headers differ?
+        header[i] = sam_hdr_read(fp[i]);
+        // Test this file's header, not the array pointer (which is never
+        // NULL at this point).
+        if (header[i] == NULL) {
+            fprintf(stderr, "Failed to read header for \"%s\"\n",
+                    argv[optind]);
+            return 1;
+        }
+
+        if (opt.reg) {
+            hts_idx_t *idx = has_index_file
+                ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles])
+                : sam_index_load(fp[i], argv[optind]);
+            if (!idx) {
+                print_error("depth", "cannot load index for \"%s\"",
+                            argv[optind]);
+                return 1;
+            }
+            if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) {
+                print_error("depth", "cannot parse region \"%s\"", opt.reg);
+                return 1;
+            }
+            hts_idx_destroy(idx);
+        }
+    }
+
+    // The loop above advanced optind past the nfiles alignment files, so
+    // argv[optind-nfiles] is the first of them.  argv[argc-nfiles] is the
+    // same element except under -X, where it would be the first index file
+    // and the -H header line would show index names.
+    int ret = fastdepth_core(&opt, nfiles, &argv[optind-nfiles], fp, itr, header)
+        ? 1 : 0;
+
+    for (i = 0; i < nfiles; i++) {
+        sam_hdr_destroy(header[i]);
+        sam_close(fp[i]);
+        if (itr && itr[i])
+            hts_itr_destroy(itr[i]);
+    }
+    free(header);
+    free(fp);
+    free(itr);
+    if (file_list) {
+        for (i=0; i<nfiles; i++)
+            free(fn[i]);
         free(fn);
     }
+    if (opt.bed)
+        bed_destroy(opt.bed);
     sam_global_args_free(&ga);
-    return status;
+    // Buffered write errors (e.g. a full disk) only surface on close/flush,
+    // so check them instead of discarding the result.
+    if (opt.out != stdout) {
+        if (fclose(opt.out) != 0) {
+            print_error_errno("depth", "error on closing output file");
+            ret = 1;
+        }
+    } else if (fflush(stdout) != 0) {
+        print_error_errno("depth", "error on flushing standard output");
+        ret = 1;
+    }
+    return ret;
 }
#ifdef _MAIN_BAM2DEPTH
/* bam2depth.c -- depth subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
+ Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
- Author: Heng Li <lh3@sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
DEALINGS IN THE SOFTWARE. */
/* This program demonstrates how to generate pileup from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
* To compile this program separately, you may:
*
* gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
#include "samtools.h"
#include "bedidx.h"
#include "sam_opts.h"
+#include "htslib/khash.h"
-#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1)
+// From bam_plcmd.c
+int read_file_list(const char *file_list, int *n, char **argv[]);
-typedef struct { // auxiliary data structure
- samFile *fp; // the file handle
- sam_hdr_t *hdr; // the file header
- hts_itr_t *iter; // NULL if a region not specified
- int min_mapQ, min_len; // mapQ filter; length filter
- uint32_t flags; // read filtering flags
-} aux_t;
+// We accumulate to hist[pos & (size-1)]. This is a ring-buffer.
+// We track where we last got to in output and what the biggest value
+// we've written to so far (in absolute unmasked coordinates) in
+// "last_output" and "end_pos" respectively.
+// For each new record we just flush anything we haven't written yet
+// already, between "last_output" and this read's start position, and
+// initialise any newly seen positions between "end_pos" and this read's
+// end position.
+typedef struct {
+    size_t size;            // ring-buffer length; add_depth() masks with size-1 and grows it by doubling from 2048
+    int **hist;             // hist[nfiles][size]
+    hts_pos_t *end_pos;     // end_pos[nfiles]
+    hts_pos_t last_output;  // first position not yet written out
+    int last_ref;           // tid of the reference being accumulated; negative before the first read
+    int nfiles;             // number of input files (one output column each)
+    const char *ref;        // name of the reference identified by last_ref
+    kstring_t ks;           // output line buffer; holds the "<ref>\t" prefix between rows
+    hts_pos_t beg, end;     // limit to region
+    int tid;                // tid of the requested region; 0 when no region iterator is in use
+} depth_hist;
-// This function reads a BAM alignment from one BAM file.
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
-{
- aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
- int ret;
- while (1)
- {
- ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
- if ( ret<0 ) break;
- if ( b->core.flag & aux->flags) continue;
- if ( (int)b->core.qual < aux->min_mapQ ) continue;
- if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
- break;
+typedef struct { // options controlling the depth calculation and its output
+    int header;           // non-zero: write a "#CHROM\tPOS\t<file>..." header line first
+    int flag;             // reads with any of these FLAG bits set are skipped
+    int min_qual;         // bases with quality below this are not counted
+    int min_mqual;        // reads with mapping quality below this are skipped
+    int min_len;          // if non-zero, reads whose qlen_used() is below this are skipped
+    int skip_del;         // non-zero: deleted bases (CIGAR D) do not add to depth
+    int all_pos;          // non-zero: also emit zero-depth rows; >1 additionally covers references with no reads
+    int remove_overlaps;  // non-zero: track read names so a mate's overlapping part is counted once
+    FILE *out;            // destination for the depth table
+    char *reg;            // region string, or NULL when no region was requested
+    void *bed;            // opaque handle passed to bed_overlap(); NULL when no BED filter is in use
+} depth_opt;
+
+// Writes one all-zero row ("<name>\t<pos>\t0[\t0...]\n", pos being 1-based)
+// to opt->out for every position in the 0-based half-open range
+// [start, end).  The range is first clipped to dh->beg / dh->end when those
+// are non-negative, and positions outside the BED filter (if any) are
+// skipped.  On return dh->ks again holds just the "<name>\t" prefix.
+static void zero_region(depth_opt *opt, depth_hist *dh,
+                        const char *name, hts_pos_t start, hts_pos_t end) {
+    kstring_t *line = &dh->ks;
+    hts_pos_t pos;
+    size_t prefix_len;
+    int f;
+
+    // Build the reference-name prefix once; each row rewinds to it.
+    ks_clear(line);
+    kputs(name, line);
+    kputc('\t', line);
+    prefix_len = line->l;
+
+    // Clip to the requested region, if one is set.
+    if (dh->beg >= 0 && dh->beg > start)
+        start = dh->beg;
+    if (dh->end >= 0 && dh->end < end)
+        end = dh->end;
+
+    for (pos = start; pos < end; pos++) {
+        // Could be optimised, but needs better API to skip to next
+        // bed region.
+        if (!opt->bed || bed_overlap(opt->bed, name, pos, pos+1) != 0) {
+            line->l = prefix_len;
+            kputll(pos+1, line);
+            for (f = dh->nfiles; f > 0; f--) {
+                kputc_('\t', line);
+                kputc_('0', line);
+            }
+            kputc('\n', line);
+            fputs(line->s, opt->out);
+        }
     }
-    return ret;
+    line->l = prefix_len;
 }
-int read_file_list(const char *file_list,int *n,char **argv[]);
-
-static int usage() {
- fprintf(samtools_stderr, "\n");
- fprintf(samtools_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
- fprintf(samtools_stderr, "Options:\n");
- fprintf(samtools_stderr, " -a output all positions (including zero depth)\n");
- fprintf(samtools_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
- fprintf(samtools_stderr, " -b <bed> list of positions or regions\n");
- fprintf(samtools_stderr, " -X use customized index files\n");
- fprintf(samtools_stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(samtools_stderr, " -H print a file header\n");
- fprintf(samtools_stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
- fprintf(samtools_stderr, " -d/-m <int> maximum coverage depth [8000]. If 0, depth is set to the maximum\n"
- " integer value, effectively removing any depth limit.\n"); // the htslib's default
- fprintf(samtools_stderr, " -o FILE where to write output to [samtools_stdout]\n");
- fprintf(samtools_stderr, " -q <int> base quality threshold [0]\n");
- fprintf(samtools_stderr, " -Q <int> mapping quality threshold [0]\n");
- fprintf(samtools_stderr, " -r <chr:from-to> region\n");
- fprintf(samtools_stderr, " -g <flags> include reads that have any of the specified flags set [0]\n");
- fprintf(samtools_stderr, " -G <flags> filter out reads that have any of the specified flags set"
- " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
-
- sam_global_opt_help(samtools_stderr, "-.--.--.");
-
- fprintf(samtools_stderr, "\n");
- fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
- fprintf(samtools_stderr, "position, and coverage depth. Note that positions with zero coverage may be\n");
- fprintf(samtools_stderr, "omitted by default; see the -a option.\n");
- fprintf(samtools_stderr, "\n");
-
- return EXIT_FAILURE;
+// A variation of bam_cigar2qlen which doesn't count soft-clips in to the
+// equation. Basically it's the number of bases in query that are aligned
+// in some way to the reference (including insertions, which are considered
+// to be aligned by dint of being anchored either side).
+//
+// b is the alignment record to measure.  Returns the number of query bases
+// consumed by M, I, = and X operations.
+//
+// Hard clips may sit outside the soft clips at either end (for example
+// 5H3S10M3S5H), so the short cut below steps over them; stopping at the
+// hard clip would leave the soft-clipped bases counted and disagree with
+// the full CIGAR scan used when SEQ is absent.
+hts_pos_t qlen_used(bam1_t *b) {
+    int n_cigar = b->core.n_cigar;
+    const uint32_t *cigar = bam_get_cigar(b);
+
+    hts_pos_t l;
+
+    if (b->core.l_qseq) {
+        // Known SEQ permits of short cut of l_qseq minus CSOFT_CLIPs.
+        // Full scan not needed, which helps on excessively long CIGARs.
+        l = b->core.l_qseq;
+        int kl, kr;
+        // Leading clips: subtract soft clips, pass over hard clips.
+        for (kl = 0; kl < n_cigar; kl++) {
+            int op = bam_cigar_op(cigar[kl]);
+            if (op == BAM_CSOFT_CLIP)
+                l -= bam_cigar_oplen(cigar[kl]);
+            else if (op != BAM_CHARD_CLIP)
+                break;
+        }
+
+        // Trailing clips; kr > kl stops an all-clip CIGAR being counted twice.
+        for (kr = n_cigar-1; kr > kl; kr--) {
+            int op = bam_cigar_op(cigar[kr]);
+            if (op == BAM_CSOFT_CLIP)
+                l -= bam_cigar_oplen(cigar[kr]);
+            else if (op != BAM_CHARD_CLIP)
+                break;
+        }
+    } else {
+        // Unknown SEQ ("*") needs a full scan through the CIGAR string.
+        static const int query[16] = {
+          //M I D N  S H P =  X B ? ?  ? ? ? ?
+            1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0
+        };
+        int k;
+        for (k = l = 0; k < n_cigar; k++)
+            if (query[bam_cigar_op(cigar[k])])
+                l += bam_cigar_oplen(cigar[k]);
+    }
+    return l;
 }
-int main_depth(int argc, char *argv[])
-{
- int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0;
- hts_pos_t beg, end, pos, last_pos = -1;
- int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
- const bam_pileup1_t **plp;
- char *reg = 0; // specified region
- void *bed = 0; // BED data structure
- char *file_list = NULL, **fn = NULL;
- sam_hdr_t *h = NULL; // BAM header of the 1st input
- aux_t **data;
- bam_mplp_t mplp;
- int last_tid = -1, ret;
- int print_header = 0;
- char *output_file = NULL;
- FILE *file_out = samtools_stdout;
- uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
- int tflags = 0;
+// Adds the depth for a single read to a depth_hist struct.
+// For just one file, this is easy. We just have a circular buffer
+// where we increment values for bits that overlap existing data
+// and initialise values for coordinates which we're seeing for the first
+// time. This is tracked by "end_pos" to know where we've got to.
+//
+// As the input is sorted, we can flush output from "last_output" to
+// b->core.pos.
+//
+// With multiple files, we must feed data in sorted order as if all files
+// are merged, but track depth per file. This also means "end_pos" is per
+// file too, but "last_output" is global as it corresponds to rows printed.
+//
+// Parameters:
+//   opt          - filtering and output options
+//   dh           - histogram state, updated in place
+//   h            - header used to look up reference names and lengths
+//   b            - the read to add, or NULL to flush everything still pending
+//   overlap_clip - if non-zero, reference positions below this value are
+//                  not counted for this read (overlap with its mate)
+//   file         - index of the input file that b came from
+//
+// Returns 0 on success, -1 on error (unsorted input, out of memory or an
+// unsupported CIGAR operation).
+//
+// NOTE(review): overlap_clip is an int while the caller passes an hts_pos_t,
+// so clip coordinates beyond INT_MAX would be truncated - confirm whether
+// such references need supporting.
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
+                     int overlap_clip, int file) {
+    hts_pos_t i;
+    size_t hmask = dh->size-1;
+    int n;
-    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
-        { NULL, 0, NULL, 0 }
-    };
+    if (!b || b->core.tid != dh->last_ref) {
+        // New ref
+        if (dh->last_ref >= 0) {
+            // do end
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            i = dh->last_output;
+            for (i = dh->last_output; nf; i++) {
+                // nf counts the files that still have data at position i.
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
+                }
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
-    // parse the command line
-    while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) {
-        switch (n) {
-            case 'l': min_len = atoi(optarg); break; // minimum query length
-            case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
-            case 'b':
-                bed = bed_read(optarg); // BED or position list file can be parsed now
-                if (!bed) {
-                    print_error_errno("depth", "Could not read file \"%s\"", optarg);
-                    return EXIT_FAILURE;
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                 }
-                break;
-            case 'X': has_index_file = 1; break;
-            case 'q': baseQ = atoi(optarg); break; // base quality threshold
-            case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold
-            case 'f': file_list = optarg; break;
-            case 'a': all++; break;
-            case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
-            case 'H': print_header = 1; break;
-            case 'o': output_file = optarg; break;
-            case 'g':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos) {
+                // End of last ref
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, dh->last_ref),
+                            i, sam_hdr_tid2len(h, dh->last_ref));
+            }
+            dh->ks.l = cur_l;
+        }
+
+        if (opt->all_pos > 1 && !opt->reg) {
+            // Any previous unused refs
+            int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1;
+            int rr = b ? b->core.tid : sam_hdr_nref(h), r;
+            for (r = lr; r < rr; r++)
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, r),
+                            0, sam_hdr_tid2len(h, r));
+        }
+
+        if (!b) {
+            // we're just flushing to end of file
+            if (opt->all_pos && opt->reg && dh->last_ref < 0)
+                // -a or -aa without a single read being output yet
+                zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg,
+                            MIN(dh->end, sam_hdr_tid2len(h, dh->tid)));
+
+            return 0;
+        }
+
+        // Start afresh on the new reference.
+        for (n = 0; dh->end_pos && n < dh->nfiles; n++)
+            dh->end_pos[n] = 0;
+        dh->last_output = dh->beg >= 0
+            ? MAX(b->core.pos, dh->beg)
+            : b->core.pos;
+        dh->last_ref = b->core.tid;
+        dh->ref = sam_hdr_tid2name(h, b->core.tid);
+        kputs(dh->ref, ks_clear(&dh->ks));
+        kputc('\t', &dh->ks);
+
+        if (opt->all_pos)
+            // Start of ref
+            zero_region(opt, dh, dh->ref, 0, b->core.pos);
+    } else {
+        if (dh->last_output < b->core.pos) {
+            // Flush any depth outputs up to start of new read
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            for (i = dh->last_output; i < b->core.pos; i++) {
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
                 }
-                flags &= ~tflags;
-                break;
-            case 'G':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
+
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                 }
-                flags |= tflags;
-                break;
-            default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
-                /* else fall-through */
-            case '?': return usage();
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos && i < b->core.pos)
+                // Hole in middle of ref
+                zero_region(opt, dh, dh->ref, i, b->core.pos);
+
+            dh->ks.l = cur_l;
+            dh->last_output = b->core.pos;
         }
     }
-    if (optind == argc && !file_list)
-        return usage();
-
-    /* output file provided by user */
-    if (output_file != NULL && strcmp(output_file,"-")!=0) {
-        file_out = fopen( output_file, "w" );
-        if (file_out == NULL) {
-            print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file);
-            return EXIT_FAILURE;
-        }
+
+    hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end.
+    //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos);
+
+    if (b->core.tid < dh->last_ref ||
+        (dh->last_ref == b->core.tid && end_pos < dh->last_output)) {
+        // Not a system-call failure, so errno has nothing useful to add.
+        print_error("depth", "Data is not position sorted");
+        return -1;
     }
+    // If needed, grow the circular buffer.
+    if (end_pos+1 - b->core.pos >= dh->size) {
+        size_t old_size = dh->size;
+        size_t old_hmask = hmask;
+        while (end_pos+1 - b->core.pos >= dh->size)
+            dh->size = dh->size ? 2*dh->size : 2048;
+        hmask = dh->size-1;
+        if (!dh->hist) {
+            dh->hist = calloc(dh->nfiles, sizeof(*dh->hist));
+            dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos));
+            if (!dh->hist || !dh->end_pos)
+                return -1;
+        }
+        for (n = 0; n < dh->nfiles; n++) {
+            int *hist = calloc(dh->size, sizeof(*dh->hist[n]));
+            if (!hist)
+                return -1;
-    // initialize the auxiliary data structures
-    if (file_list)
-    {
-        if (has_index_file) {
-            print_error("depth", "The -f option cannot be combined with -X");
-            return 1;
+            // Simple approach for now; copy over old histogram verbatim.
+            for (i = dh->last_output; i < dh->last_output + old_size; i++)
+                hist[i & hmask] = dh->hist[n][i & old_hmask];
+            free(dh->hist[n]);
+            dh->hist[n] = hist;
         }
-        if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE;
-        n = nfiles;
-        argv = fn;
-        optind = 0;
     }
-    else if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind) % 2 != 0) {
-            fprintf(samtools_stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
-        }
-        n = (argc - optind) / 2;
+
+    // Accumulate depth, based on CIGAR
+    uint32_t *cig = bam_get_cigar(b);
+    int ncig = b->core.n_cigar, j, k, spos = 0;
+
+    // Zero new (previously unseen) coordinates so increment works later.
+    hts_pos_t end = MAX(dh->end_pos[file], b->core.pos);
+    if (end_pos > end && (end & hmask) < (end_pos & hmask)) {
+        memset(&dh->hist[file][end & hmask], 0,
+               sizeof(**dh->hist) * (end_pos - end));
     } else {
-        n = argc - optind;
+        for (i = end; i < end_pos; i++)
+            dh->hist[file][i & hmask] = 0;
     }
-    data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
-    reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region
-
-    for (i = 0; i < n; ++i) {
-        int rf;
-        data[i] = calloc(1, sizeof(aux_t));
-        data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
-        if (data[i]->fp == NULL) {
-            print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
-        if (baseQ) rf |= SAM_QUAL;
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        data[i]->min_mapQ = mapQ; // set the mapQ filter
-        data[i]->min_len = min_len; // set the qlen filter
-        data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header
-        if (data[i]->hdr == NULL) {
-            print_error_errno("depth", "Couldn't read header for \"%s\"",
-                              argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (reg) { // if a region is specified
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
-            if (has_index_file) {
-                idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index
+
+    i = b->core.pos;
+    uint8_t *qual = bam_get_qual(b);
+    int min_qual = opt->min_qual;
+    for (j = 0; j < ncig; j++) {
+        int op = bam_cigar_op(cig[j]);
+        int oplen = bam_cigar_oplen(cig[j]);
+
+        switch (op) {
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            if (op != BAM_CDEL || opt->skip_del) {
+                // don't increment reference location
+                if (i + oplen >= dh->end_pos[file]) {
+                    for (k = 0; k < oplen; k++, i++) {
+                        if (i >= dh->end_pos[file])
+                            // redundant due to zero new elements above?
+                            dh->hist[file][i & hmask] = 0;
+                    }
+                } else {
+                    i += oplen;
+                }
+            } else { // op == BAM_CDEL and we count them (-J option),
+                // We don't incr spos here, but we still use qual.
+                // This doesn't make much sense, but it's for compatibility
+                // with the old code. Arguably DEL shouldn't have a min
+                // qual and should always pass (as we've explicitly asked to
+                // include them).
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        k = overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+
+                // Question: should we even check quality values for DEL?
+                // We've explicitly asked to include them, and the quality
+                // is wrong anyway (it's the neighbouring base). We do this
+                // for now for compatibility with the old depth command.
+
+                if (spos < b->core.l_qseq)
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]+=qual[spos]>=min_qual;
+                else
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]++;
+            }
+            break;
+
+        case BAM_CMATCH:
+        case BAM_CEQUAL:
+        case BAM_CDIFF:
+            if ((i & hmask) < ((i+oplen) & hmask)) {
+                // Optimisation when not wrapping around
+
+                // Unrolling doesn't help clang, but helps gcc,
+                // especially when not using -O3.
+                int *hist = &dh->hist[file][i & hmask];
+                if (min_qual || overlap_clip) {
+                    k = 0;
+                    if (overlap_clip) {
+                        if (i+oplen < overlap_clip) {
+                            i += oplen;
+                            spos += oplen;
+                            break;
+                        } else if (i < overlap_clip) {
+                            oplen -= overlap_clip - i;
+                            spos += overlap_clip - i;
+                            hist += overlap_clip - i;
+                            i = overlap_clip;
+                        }
+                    }
+
+                    // approx 50% of this func cpu time in this loop
+                    // Only take the unrolled path while all 8 qualities
+                    // exist; records without SEQ/QUAL (l_qseq == 0) fall
+                    // through to the bounds-checked loops below instead
+                    // of reading past the end of qual[].
+                    for (; k < (oplen & ~7) && spos+8 <= b->core.l_qseq; k+=8) {
+                        hist[k+0]+=qual[spos+0]>=min_qual;
+                        hist[k+1]+=qual[spos+1]>=min_qual;
+                        hist[k+2]+=qual[spos+2]>=min_qual;
+                        hist[k+3]+=qual[spos+3]>=min_qual;
+                        hist[k+4]+=qual[spos+4]>=min_qual;
+                        hist[k+5]+=qual[spos+5]>=min_qual;
+                        hist[k+6]+=qual[spos+6]>=min_qual;
+                        hist[k+7]+=qual[spos+7]>=min_qual;
+                        spos += 8;
+                    }
+                } else {
+                    // easier to vectorize when no min_qual
+                    for (k = 0; k < (oplen & ~7); k+=8) {
+                        hist[k+0]++;
+                        hist[k+1]++;
+                        hist[k+2]++;
+                        hist[k+3]++;
+                        hist[k+4]++;
+                        hist[k+5]++;
+                        hist[k+6]++;
+                        hist[k+7]++;
+                    }
+                    spos += k;
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, spos++)
+                    hist[k]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, spos++)
+                    hist[k]++;
+                i += oplen;
             } else {
-                idx = sam_index_load(data[i]->fp, argv[optind+i]);
+                // Simple to understand case, but slower.
+                // We use this only for reads with wrap-around.
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen;
+                        // Keep the query offset in step with the bases
+                        // skipped, as the non-wrapping path does, so that
+                        // later ops look up the right qualities.
+                        spos += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        oplen -= overlap_clip - i;
+                        spos += overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
+                    hist[i & hmask]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, i++, spos++)
+                    hist[i & hmask]++;
             }
-            if (idx == NULL) {
-                print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
-                status = EXIT_FAILURE;
-                goto depth_end;
+            break;
+
+        case BAM_CINS:
+        case BAM_CSOFT_CLIP:
+            spos += oplen;
+            break;
+
+        case BAM_CPAD:
+        case BAM_CHARD_CLIP:
+            // ignore
+            break;
+
+        default:
+            print_error("depth", "Unsupported cigar op '%d'", op);
+            return -1;
+        }
+    }
+
+    // Remember how far this file's data now extends, clipped to the region.
+    if (dh->end >= 0 && end_pos > dh->end)
+        end_pos = dh->end;
+    if (dh->end_pos[file] < end_pos)
+        dh->end_pos[file] = end_pos;
+
+    return 0;
+}
+
+// Hash on name -> alignment end pos. This permits a naive overlap removal.
+// Note it cannot analyse the overlapping sequence and qualities, so the
+// interaction of basecalls/qualities and the -Q parameter cannot be
+// applied here (unlike the full mpileup algorithm).
+KHASH_MAP_INIT_STR(olap_hash, hts_pos_t) // key: read name; value: bam_endpos() of the first-seen read of that name
+typedef khash_t(olap_hash) olap_hash_t; // fastdepth_core() keeps one of these per input file
+
+// Computes and writes the depth table for a set of already-opened files.
+//
+// Reads are taken from all files in merged (tid, pos) order and passed one
+// at a time to add_depth(), which prints rows as soon as they are complete.
+// Reads that are unplaced, match opt->flag, fall below opt->min_mqual or are
+// shorter than opt->min_len are skipped.
+//
+// Parameters:
+//   opt    - filtering and output options
+//   nfiles - number of input files; length of fn, fp, h (and itr if set)
+//   fn     - file names, used only for the optional header line
+//   fp     - open alignment files
+//   itr    - per-file region iterators, or NULL to read whole files
+//   h      - per-file headers
+//
+// Returns 0 on success and non-zero on failure.  Every exit after the first
+// allocation goes through the "err" label so nothing is leaked.
+static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn,
+                          samFile **fp, hts_itr_t **itr, sam_hdr_t **h) {
+    int ret = -1, err = 1, i;
+    olap_hash_t **overlaps = NULL;
+    depth_hist dh = {0};
+
+    // An array of bam structs, one per input file, to hold the next entry
+    bam1_t **b = calloc(nfiles, sizeof(*b));
+    int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles;
+    if (!b || !finished)
+        goto err;
+
+    for (i = 0; i < nfiles; i++)
+        if (!(b[i] = bam_init1()))
+            goto err;
+
+    // Do we need one overlap hash per file? Or shared?
+    if (opt->remove_overlaps) {
+        // Failures here used to return directly, leaking b[] and finished.
+        if (!(overlaps = calloc(nfiles, sizeof(*overlaps))))
+            goto err;
+        for (i = 0; i < nfiles; i++) {
+            if (!(overlaps[i] = kh_init(olap_hash)))
+                goto err;
+        }
+    }
+
+    // Create the initial histogram
+    dh.nfiles = nfiles;
+    dh.size = 0;
+    dh.hist = NULL;
+    dh.last_ref = -99;
+    dh.end_pos = NULL;
+    dh.last_output = itr && itr[0] ? itr[0]->beg : 0;
+    ks_initialize(&dh.ks);
+
+    // Clip results to region if specified
+    dh.beg = -1;
+    dh.end = -1;
+    dh.tid = 0;
+    if (itr && itr[0]) {
+        dh.tid = itr[0]->tid;
+        dh.beg = itr[0]->beg;
+        dh.end = itr[0]->end;
+    }
+
+    if (opt->header) {
+        fprintf(opt->out, "#CHROM\tPOS");
+        for (i = 0; i < nfiles; i++)
+            fprintf(opt->out, "\t%s", fn[i]);
+        fputc('\n', opt->out);
+    }
+
+    // Populate first record per file
+    for (i = 0; i < nfiles; i++) {
+        for(;;) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1)
+                goto err;
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
             }
-            data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
-            hts_idx_destroy(idx); // the index is not needed any more; free the memory
-            if (data[i]->iter == NULL) {
-                print_error("depth", "can't parse region \"%s\"", reg);
-                status = EXIT_FAILURE;
-                goto depth_end;
+
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            // Original samtools depth used the total sequence (l_qseq)
+            // including soft-clips. This doesn't feel like a useful metric
+            // to be filtering on. We now only count sequence bases that
+            // form the used part of the alignment.
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
             }
+
+            break;
         }
-        data[i]->flags = flags;
     }
-    if (print_header) {
-        fputs("#CHROM\tPOS", file_out);
-        for (i = 0; i < n; ++i) {
-            fputc('\t', file_out);
-            fputs(argv[optind+i], file_out);
+
+    // Loop through input files, merging in order so we're
+    // always adding the next record in sequence
+    while (to_go) {
+        // Find next record in file list
+        int best_tid = INT_MAX, best_file = 0;
+        hts_pos_t best_pos = HTS_POS_MAX;
+
+        for (i = 0; i < nfiles; i++) {
+            if (finished[i])
+                continue;
+            if (best_tid > b[i]->core.tid) {
+                best_tid = b[i]->core.tid;
+                best_pos = b[i]->core.pos;
+                best_file = i;
+            } else if (best_tid == b[i]->core.tid &&
+                       best_pos > b[i]->core.pos) {
+                best_pos = b[i]->core.pos;
+                best_file = i;
             }
-            fputc('\n', file_out);
         }
-    h = data[0]->hdr; // easy access to the header of the 1st BAM
-    if (reg) {
-        beg = data[0]->iter->beg; // and to the parsed region coordinates
-        end = data[0]->iter->end;
-        reg_tid = data[0]->iter->tid;
-    }
+        i = best_file;
-    // the core multi-pileup loop
-    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
-    if (0 < max_depth)
-        bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth
-    else if (!max_depth)
-        bam_mplp_set_maxcnt(mplp,INT_MAX);
-    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
-    plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
-    while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
-        if (pos < beg || pos >= end) continue; // out of range; skip
-        if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file?
-        if (all) {
-            while (tid > last_tid) {
-                if (last_tid >= 0 && !reg) {
-                    // Deal with remainder or entirety of last tid.
-                    while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                        // Horribly inefficient, but the bed API is an obfuscated black box.
-                        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                            continue;
-                        fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                        fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                        for (i = 0; i < n; i++)
-                            fputc('\t', file_out), fputc('0', file_out);
-                        fputc('\n', file_out);
-                    }
+        hts_pos_t clip = 0;
+        if (overlaps && (b[i]->core.flag & BAM_FPAIRED) &&
+            !(b[i]->core.flag & BAM_FMUNMAP)) {
+            khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i]));
+            if (k == kh_end(overlaps[i])) {
+                // not seen before
+                hts_pos_t endpos = bam_endpos(b[i]);
+
+                // Don't add if mate location is known and can't overlap.
+                if (b[i]->core.mpos == -1 ||
+                    (b[i]->core.tid == b[i]->core.mtid &&
+                     b[i]->core.mpos <= endpos)) {
+                    // The record buffer is reused for the next read, so the
+                    // hash must own a copy of the name.  Copy first so an
+                    // allocation failure cannot leave a NULL key behind.
+                    char *name = strdup(bam_get_qname(b[i]));
+                    if (!name) {
+                        ret = -1;
+                        goto err;
+                    }
+                    k = kh_put(olap_hash, overlaps[i], name, &ret);
+                    if (ret < 0) {
+                        free(name);
+                        ret = -1;
+                        goto err;
+                    }
+                    kh_value(overlaps[i], k) = endpos;
                 }
-                last_tid++;
-                last_pos = -1;
-                if (all < 2)
-                    break;
+            } else {
+                // seen before
+                clip = kh_value(overlaps[i], k);
+                free((char *)kh_key(overlaps[i], k));
+                kh_del(olap_hash, overlaps[i], k);
             }
+        }
-            // Deal with missing portion of current tid
-            while (++last_pos < pos) {
-                if (last_pos < beg) continue; // out of range; skip
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        // Add the next merged BAM record to the depth plot
+        if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) {
+            ret = -1;
+            goto err;
+        }
+
+        // Populate next record from this file
+        for(;!finished[i];) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1) {
+                ret = -1;
+                goto err;
+            }
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
             }
-            last_tid = tid;
-            last_pos = pos;
-        }
-        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue;
-        fputs(sam_hdr_tid2name(h, tid), file_out);
-        fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized fprintf(samtools_stdout, ) would be faster
-        for (i = 0; i < n; ++i) { // base level filters have to go here
-            int j, m = 0;
-            for (j = 0; j < n_plp[i]; ++j) {
-                const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
-                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                         bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
             }
-            fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            break;
         }
-        fputc('\n', file_out);
     }
-    if (ret < 0) status = EXIT_FAILURE;
-    free(n_plp); free(plp);
-    bam_mplp_destroy(mplp);
-
-    if (all) {
-        // Handle terminating region
-        if (last_tid < 0 && reg) {
-            last_tid = reg_tid;
-            last_pos = beg-1;
+
+    // Tidy up end.
+    ret = add_depth(opt, &dh, h[0], NULL, 0, 0);
+    err = 0;
+
+ err:
+    if (ret == 0 && err)
+        ret = -1;
+
+    for (i = 0; i < nfiles; i++) {
+        // b itself is NULL when its allocation failed.
+        if (b && b[i])
+            bam_destroy1(b[i]);
+        if (dh.hist && dh.hist[i])
+            free(dh.hist[i]);
+    }
+    free(b);
+    free(finished);
+    ks_free(&dh.ks);
+    free(dh.hist);
+    free(dh.end_pos);
+    if (overlaps) {
+        khiter_t k;
+        for (i = 0; i < nfiles; i++) {
+            if (!overlaps[i])
+                continue;
+            // Keys are strdup()ed copies owned by the hash.
+            for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++)
+                if (kh_exist(overlaps[i], k))
+                    free((char *)kh_key(overlaps[i], k));
+            kh_destroy(olap_hash, overlaps[i]);
         }
-        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
-            while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                if (last_pos >= end) break;
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        free(overlaps);
+    }
+
+    return ret;
+}
+
+/*
+ * Print the "samtools depth" usage summary to fp, followed by the shared
+ * global-option help, then terminate via samtools_exit(exit_status).
+ * Does not return to the caller.
+ *
+ * Each option is listed exactly once (the -H entry used to appear twice).
+ */
+static void usage_exit(FILE *fp, int exit_status)
+{
+    fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n");
+    fprintf(fp, "\nOptions:\n");
+    fprintf(fp, " -a Output all positions (including zero depth)\n");
+    fprintf(fp, " -a -a, -aa Output absolutely all positions, including unused ref seqs\n");
+    fprintf(fp, " -r REG Specify a region in chr or chr:from-to syntax\n");
+    fprintf(fp, " -b FILE Use bed FILE for list of regions\n");
+    fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n");
+    fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n");
+    fprintf(fp, " -g INT Remove specified flags from default flag filter\n");
+    fprintf(fp, " -G INT Add specified flags to the default flag filter\n");
+    fprintf(fp, " -H Print a file header line\n");
+    fprintf(fp, " -l INT Minimum read length [0]\n");
+    fprintf(fp, " -o FILE Write output to FILE [samtools_stdout]\n");
+    fprintf(fp, " -q INT Minimum base quality [0]\n");
+    fprintf(fp, " -Q INT Minimum mapping quality [0]\n");
+    fprintf(fp, " -J Include reads with deletions in depth computation\n");
+    fprintf(fp, " -s Do not count overlapping reads within a template\n");
+    sam_global_opt_help(fp, "-.---@-.");
+    samtools_exit(exit_status);
+}
+
+/*
+ * Entry point for "samtools depth".
+ *
+ * Parses the command line into a depth_opt, opens every input file (from
+ * argv or from the -f list), reads its header, builds a region iterator per
+ * file when -r is given, and hands everything to fastdepth_core().
+ *
+ * Returns 0 on success and non-zero on any failure.  Early error returns do
+ * not release resources acquired so far.
+ */
+int main_depth(int argc, char *argv[])
+{
+ int nfiles, i;
+ samFile **fp;
+ sam_hdr_t **header;
+ int c, has_index_file = 0;
+ char *file_list = NULL, **fn = NULL;
+ depth_opt opt = {
+ .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+ .min_qual = 0,
+ .min_mqual = 0,
+ .skip_del = 1,
+ .header = 0,
+ .min_len = 0,
+ .out = samtools_stdout,
+ .all_pos = 0,
+ .remove_overlaps = 0,
+ .reg = NULL,
+ .bed = NULL,
+ };
+
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ static const struct option lopts[] = {
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+ {NULL, 0, NULL, 0}
+ };
+
+ while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s",
+ lopts, NULL)) >= 0) {
+ switch (c) {
+ case 'a':
+ opt.all_pos++;
+ break;
+
+ case 'b':
+ opt.bed = bed_read(optarg);
+ if (!opt.bed) {
+ print_error_errno("depth", "Could not read file \"%s\"",
+ optarg);
+ return 1;
 }
- last_tid++;
- last_pos = -1;
- if (all < 2 || reg)
+ break;
+
+ case 'f':
+ file_list = optarg;
+ break;
+
+ case 'd':
+ case 'm':
+ // depth limit - now ignored
+ break;
+
+ case 'g':
+ opt.flag &= ~bam_str2flag(optarg);
+ break;
+ case 'G':
+ opt.flag |= bam_str2flag(optarg);
+ break;
+
+ case 'l':
+ opt.min_len = atoi(optarg);
+ break;
+
+ case 'H':
+ opt.header = 1;
+ break;
+
+ case 'q':
+ opt.min_qual = atoi(optarg);
+ break;
+ case 'Q':
+ opt.min_mqual = atoi(optarg);
+ break;
+
+ case 'J':
+ opt.skip_del = 0;
+ break;
+
+ case 'o':
+ // Only the first -o takes effect; later ones are ignored.
+ if (opt.out != samtools_stdout)
 break;
+ opt.out = fopen(optarg, "w");
+ if (!opt.out) {
+ print_error_errno("depth", "Cannot open \"%s\" for writing.",
+ optarg);
+ return EXIT_FAILURE;
+ }
+ break;
+
+ case 'r':
+ opt.reg = optarg;
+ break;
+
+ case 's':
+ opt.remove_overlaps = 1;
+ break;
+
+ case 'X':
+ has_index_file = 1;
+ break;
+
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?':
+ usage_exit(samtools_stderr, EXIT_FAILURE);
 }
 }
-depth_end:
- if (((file_out != samtools_stdout)? fclose(file_out) : fflush(file_out)) != 0) {
- if (status == EXIT_SUCCESS) {
- if (file_out != samtools_stdout)
- print_error_errno("depth", "error on closing \"%s\"", output_file);
- else
- print_error_errno("depth", "error on flushing standard output");
- status = EXIT_FAILURE;
+ if (argc < optind+1 && !file_list) {
+ if (argc == optind)
+ usage_exit(samtools_stdout, EXIT_SUCCESS);
+ else
+ usage_exit(samtools_stderr, EXIT_FAILURE);
+ }
+
+ // With -f the file list replaces argv entirely, so indexing restarts at 0.
+ if (file_list) {
+ if (has_index_file) {
+ print_error("depth", "The -f option cannot be combined with -X");
+ return 1;
+ }
+ if (read_file_list(file_list, &nfiles, &fn))
+ return 1;
+ argv = fn;
+ argc = nfiles;
+ optind = 0;
+ } else {
+ nfiles = argc - optind;
+ }
+
+ if (has_index_file) {
+ // -X lists N data files followed by N index files, so the count of
+ // positional arguments must be even.
+ if (nfiles%2) {
+ print_error("depth", "-X needs one index specified per bam file");
+ return 1;
 }
+ nfiles /= 2;
+ }
+ fp = malloc(nfiles * sizeof(*fp));
+ header = malloc(nfiles * sizeof(*header));
+ if (!fp || !header) {
+ print_error_errno("depth", "Out of memory");
+ return 1;
 }
- for (i = 0; i < n && data[i]; ++i) {
- sam_hdr_destroy(data[i]->hdr);
- if (data[i]->fp) sam_close(data[i]->fp);
- hts_itr_destroy(data[i]->iter);
- free(data[i]);
+ hts_itr_t **itr = NULL;
+ if (opt.reg) {
+ itr = calloc(nfiles, sizeof(*itr));
+ if (!itr)
+ return 1;
 }
- free(data); free(reg);
- if (bed) bed_destroy(bed);
- if ( file_list )
- {
- for (i=0; i<n; i++) free(fn[i]);
+
+ for (i = 0; i < nfiles; i++, optind++) {
+ fp[i] = sam_open_format(argv[optind], "r", &ga.in);
+ if (fp[i] == NULL) {
+ print_error_errno("depth",
+ "Cannot open input file \"%s\"", argv[optind]);
+ return 1;
+ }
+
+ if (ga.nthreads > 0)
+ hts_set_threads(fp[i], ga.nthreads);
+
+ // Ask the CRAM decoder only for the fields the chosen options need.
+ if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS,
+ SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR
+ | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT
+ : 0)
+ | (opt.min_mqual ? SAM_MAPQ : 0)
+ | (opt.min_len ? SAM_SEQ : 0)
+ | (opt.min_qual ? SAM_QUAL : 0))) {
+ fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+ return 1;
+ }
+
+ if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ return 1;
+ }
+
+ // FIXME: what if headers differ?
+ header[i] = sam_hdr_read(fp[i]);
+ // Test this file's header, not the (already checked) array pointer.
+ if (header[i] == NULL) {
+ fprintf(samtools_stderr, "Failed to read header for \"%s\"\n",
+ argv[optind]);
+ return 1;
+ }
+
+ if (opt.reg) {
+ // With -X the index for file i sits nfiles slots after the file.
+ hts_idx_t *idx = has_index_file
+ ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles])
+ : sam_index_load(fp[i], argv[optind]);
+ if (!idx) {
+ print_error("depth", "cannot load index for \"%s\"",
+ argv[optind]);
+ return 1;
+ }
+ if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) {
+ print_error("depth", "cannot parse region \"%s\"", opt.reg);
+ return 1;
+ }
+ hts_idx_destroy(idx);
+ }
+ }
+
+ // NOTE(review): with -X, argc-nfiles indexes the first index file rather
+ // than the first data file - confirm what fastdepth_core uses the names for.
+ int ret = fastdepth_core(&opt, nfiles, &argv[argc-nfiles], fp, itr, header)
+ ? 1 : 0;
+
+ for (i = 0; i < nfiles; i++) {
+ sam_hdr_destroy(header[i]);
+ sam_close(fp[i]);
+ if (itr && itr[i])
+ hts_itr_destroy(itr[i]);
+ }
+ free(header);
+ free(fp);
+ free(itr);
+ if (file_list) {
+ for (i=0; i<nfiles; i++)
+ free(fn[i]);
 free(fn);
 }
+ if (opt.bed)
+ bed_destroy(opt.bed);
 sam_global_args_free(&ga);
- return status;
+ if (opt.out != samtools_stdout) fclose(opt.out);
+ return ret;
 }
#ifdef _MAIN_BAM2DEPTH
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited.
+ Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
rg_mode mode;
sam_global_args ga;
htsThreadPool p;
+ int uncompressed;
+ int overwrite_hdr_rg;
};
struct state;
static void usage(FILE *fp)
{
fprintf(fp,
- "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+ "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
"\n"
"Options:\n"
" -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
" -o FILE Where to write output to [stdout]\n"
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
+ " -u Output uncompressed data\n"
+ " -w Overwrite an existing @RG line\n"
" --no-PG Do not add a PG line\n"
);
sam_global_opt_help(fp, "..O..@..");
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
case 1:
retval->no_pg = 1;
break;
+ case 'u':
+ retval->uncompressed = 1;
+ break;
+ case 'w':
+ retval->overwrite_hdr_rg = 1;
+ break;
case '?':
usage(stderr);
free(retval);
}
static bool init(const parsed_opts_t* opts, state_t** state_out) {
- char output_mode[8] = "w";
+ char output_mode[9] = "w";
state_t* retval = (state_t*) calloc(1, sizeof(state_t));
if (retval == NULL) {
retval->input_header = sam_hdr_read(retval->input_file);
retval->output_header = sam_hdr_dup(retval->input_header);
+
+ if (opts->uncompressed)
+ strcat(output_mode, "0");
if (opts->output_name) // File format auto-detection
- sam_open_mode(output_mode + 1, opts->output_name, NULL);
+ sam_open_mode(output_mode + strlen(output_mode),
+ opts->output_name, NULL);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
if (retval->output_file == NULL) {
// Check does not already exist
kstring_t hdr_line = { 0, 0, NULL };
if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
- fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
- free(hdr_line.s);
- return false;
+ if (opts->overwrite_hdr_rg) {
+ if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
+ fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
+ ks_free(&hdr_line);
+ return false;
+ }
+ } else {
+ fprintf(stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
+ ks_free(&hdr_line);
+ return false;
+ }
}
+ ks_free(&hdr_line);
+
if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
return false;
return false;
}
retval->rg_id = strdup(opts->rg_id);
- free(hdr_line.s);
+ ks_free(&hdr_line);
} else {
kstring_t rg_id = { 0, 0, NULL };
if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited.
+ Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
rg_mode mode;
sam_global_args ga;
htsThreadPool p;
+ int uncompressed;
+ int overwrite_hdr_rg;
};
struct state;
static void usage(FILE *fp)
{
fprintf(fp,
- "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+ "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
"\n"
"Options:\n"
" -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
" -o FILE Where to write output to [samtools_stdout]\n"
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
+ " -u Output uncompressed data\n"
+ " -w Overwrite an existing @RG line\n"
" --no-PG Do not add a PG line\n"
);
sam_global_opt_help(fp, "..O..@..");
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
case 1:
retval->no_pg = 1;
break;
+ case 'u':
+ retval->uncompressed = 1;
+ break;
+ case 'w':
+ retval->overwrite_hdr_rg = 1;
+ break;
case '?':
usage(samtools_stderr);
free(retval);
}
static bool init(const parsed_opts_t* opts, state_t** state_out) {
- char output_mode[8] = "w";
+ char output_mode[9] = "w";
state_t* retval = (state_t*) calloc(1, sizeof(state_t));
if (retval == NULL) {
retval->input_header = sam_hdr_read(retval->input_file);
retval->output_header = sam_hdr_dup(retval->input_header);
+
+ if (opts->uncompressed)
+ strcat(output_mode, "0");
if (opts->output_name) // File format auto-detection
- sam_open_mode(output_mode + 1, opts->output_name, NULL);
+ sam_open_mode(output_mode + strlen(output_mode),
+ opts->output_name, NULL);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
if (retval->output_file == NULL) {
// Check does not already exist
kstring_t hdr_line = { 0, 0, NULL };
if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
- fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
- free(hdr_line.s);
- return false;
+ if (opts->overwrite_hdr_rg) {
+ if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
+ fprintf(samtools_stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
+ ks_free(&hdr_line);
+ return false;
+ }
+ } else {
+ fprintf(samtools_stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
+ ks_free(&hdr_line);
+ return false;
+ }
}
+ ks_free(&hdr_line);
+
if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
return false;
return false;
}
retval->rg_id = strdup(opts->rg_id);
- free(hdr_line.s);
+ ks_free(&hdr_line);
} else {
kstring_t rg_id = { 0, 0, NULL };
if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
--- /dev/null
+/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
+ from the 5' end.
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Authors: Andrew Whitwham <aw7@sanger.ac.uk>
+ Rob Davies <rmd+git@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "htslib/thread_pool.h"
+#include "sam_opts.h"
+#include <htslib/hts.h>
+#include "htslib/hfile.h"
+#include "htslib/kstring.h"
+#include "htslib/sam.h"
+#include "samtools.h"
+#include "bam_ampliconclip.h"
+
+// How trimmed bases are represented in the output record.
+// soft_clip: SEQ/QUAL are copied in full and the removed span becomes an
+// S (BAM_CSOFT_CLIP) CIGAR operation.
+// hard_clip: the removed bases are dropped from SEQ/QUAL and folded into an
+// H (BAM_CHARD_CLIP) CIGAR operation.
+typedef enum {
+ soft_clip,
+ hard_clip
+} clipping_type;
+
+// Run-time options for amplicon clipping.
+// NOTE(review): only add_pg, use_strand, both, oa_tag, tol and arg_list are
+// used by the code visible in this part of the file; the meaning of the
+// remaining fields should be confirmed against the option parser.
+typedef struct {
+ int add_pg; // non-zero: add an @PG line to the output header
+ int use_strand; // non-zero: read the BED strand column and match it against read strand
+ int write_clipped;
+ int mark_fail;
+ int both; // zero: clip only at the end selected by the read's strand
+ int fail_len;
+ int filter_len;
+ int unmapped;
+ int oa_tag; // non-zero: record the pre-clip alignment in an OA tag
+ int del_tag;
+ int tol; // slack, in bases, allowed when matching a read end to a site
+ char *arg_list; // when non-NULL, written as the CL value of the @PG line
+ char *stats_file;
+ char *rejects_file;
+} cl_param_t;
+
+
+/* qsort() comparator: orders bed_entry_t records by ascending right coordinate. */
+static int bed_entry_sort(const void *av, const void *bv) {
+    const bed_entry_t *lhs = av;
+    const bed_entry_t *rhs = bv;
+
+    if (lhs->right < rhs->right)
+        return -1;
+    if (lhs->right == rhs->right)
+        return 0;
+    return 1;
+}
+
+
+/*
+ * Read intervals from the BED file `infile` into `bed_lists`, a hash keyed by
+ * reference name whose values are growable arrays of bed_entry_t.
+ *
+ * get_strand   non-zero: a sixth column holding '+' or '-' is required and is
+ *              stored in each entry's rev flag (0 for '+', 1 for '-').
+ *              Zero: the strand column is not read and rev is set to 0.
+ * sort_by_pos  non-zero: every per-reference array is sorted by right
+ *              coordinate once the whole file has been read.
+ *
+ * Empty lines and lines starting with '#', "track " or "browser " are
+ * skipped.  Each list's `longest` holds the largest right - left seen.
+ *
+ * Returns 0 on success; 1 if the file cannot be opened or parsed, memory runs
+ * out, or no intervals were loaded.  Entries stored before a failure are left
+ * in bed_lists for the caller to release.
+ */
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+    hFILE *fp;
+    int line_count = 0, ret;
+    int64_t left, right;
+    kstring_t line = KS_INITIALIZE;
+    bed_entry_list_t *list;
+    khiter_t bed_itr;
+
+    if ((fp = hopen(infile, "r")) == NULL) {
+        print_error_errno("amplicon", "unable to open file %s.", infile);
+        return 1;
+    }
+
+    char ref[1024];
+
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        line_count++;
+        int hret;
+        char strand;
+
+        if (line.l == 0 || *line.s == '#') continue;
+        if (strncmp(line.s, "track ", 6) == 0) continue;
+        if (strncmp(line.s, "browser ", 8) == 0) continue;
+
+        if (get_strand) {
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
+                       ref, &left, &right, &strand) != 4) {
+                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                        line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
+                       ref, &left, &right) != 3) {
+                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                        line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        bed_itr = kh_get(bed_list_hash, bed_lists, ref);
+
+        if (bed_itr == kh_end(bed_lists)) { // new ref entry
+            char *ref_name = strdup(ref); // need a copy for the hash key
+
+            if (!ref_name) {
+                fprintf(stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
+                ret = 1;
+                goto error;
+            }
+
+            bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
+
+            if (hret <= 0) {
+                // The key was not stored, so the table does not own the copy.
+                free(ref_name);
+                fprintf(stderr, "[amplicon] error: ref hashing failure.\n");
+                ret = 1;
+                goto error;
+            }
+
+            list = &kh_val(bed_lists, bed_itr);
+
+            // initialise the new hash entry
+            list->longest = 0;
+            list->size = 0;
+            list->length = 0;
+            list->bp = NULL;
+        } else { // existing ref
+            list = &kh_val(bed_lists, bed_itr);
+        }
+
+        if (list->length == list->size) {
+            bed_entry_t *tmp;
+
+            // Grow by half plus a fixed step.  The new capacity is recorded
+            // only once realloc has succeeded, so size always matches bp.
+            if ((tmp = realloc(list->bp, (list->size + list->size / 2 + 256) * sizeof(bed_entry_t))) == NULL) {
+                fprintf(stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
+                ret = 1;
+                goto error;
+            }
+
+            list->size += list->size / 2 + 256;
+            list->bp = tmp;
+        }
+
+        list->bp[list->length].left  = left;
+        list->bp[list->length].right = right;
+
+        if (get_strand) {
+            if (strand == '+') {
+                list->bp[list->length].rev = 0;
+            } else if (strand == '-') {
+                list->bp[list->length].rev = 1;
+            } else {
+                fprintf(stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
+                        line_count, strand);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            // realloc'd storage is uninitialised; give rev a defined value.
+            list->bp[list->length].rev = 0;
+        }
+
+        if (right - left > list->longest)
+            list->longest = right - left;
+
+        list->length++;
+    }
+
+    if (sort_by_pos) {
+        for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
+            if (kh_exist(bed_lists, bed_itr)) {
+                list = &kh_val(bed_lists, bed_itr);
+                qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
+            }
+        }
+    }
+
+    // A file that yields no intervals at all counts as a failure.
+    if (kh_size(bed_lists) > 0) {// any entries
+        ret = 0;
+    } else {
+        ret = 1;
+    }
+
+error:
+    ks_free(&line);
+
+    if (hclose(fp) != 0) {
+        fprintf(stderr, "[amplicon] warning: failed to close %s\n", infile);
+    }
+
+    return ret;
+}
+
+
+/*
+ * Free everything owned by a BED list hash: each occupied bucket's entry
+ * array and its key string, then the table itself.
+ */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
+    khiter_t slot = kh_begin(hash);
+
+    while (slot != kh_end(hash)) {
+        if (kh_exist(hash, slot)) {
+            free(kh_val(hash, slot).bp);
+            free((char *)kh_key(hash, slot));
+            kh_key(hash, slot) = NULL;
+        }
+        ++slot;
+    }
+
+    kh_destroy(bed_list_hash, hash);
+}
+
+
+/*
+ * Work out how many bases to clip from a read whose relevant end lies at
+ * `pos`, given the BED sites for its reference.
+ *
+ * sites       interval list for the reference; the binary search and early
+ *             exit below assume it is ordered by ascending right coordinate.
+ * pos         read end being tested.
+ * is_rev      non-zero when the read is on the reverse strand.
+ * use_strand  non-zero: ignore sites whose rev flag differs from is_rev.
+ * longest     length of the longest site in the list.
+ * param       supplies the tolerance (param->tol), which widens each site on
+ *             the right for reverse reads and on the left for forward reads.
+ *
+ * Returns the largest clip length over all sites containing pos (distance
+ * from pos to the site's left edge for reverse reads, to its right edge for
+ * forward reads), or 0 when nothing matches.
+ */
+static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
+                              int is_rev, int use_strand, int64_t longest,
+                              cl_param_t *param) {
+    int i, size; // may need this to be variable
+    int tol = param->tol;
+    int l = 0, mid = sites->length / 2, r = sites->length;
+    // Kept as hts_pos_t: an int would truncate positions beyond 2^31 - 1.
+    hts_pos_t pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
+
+    // Binary search for a starting index: sites before l end at or before
+    // pos_tol.
+    while (r - l > 1) {
+        if (sites->bp[mid].right <= pos_tol) {
+            l = mid;
+        } else {
+            r = mid;
+        }
+        mid = (l + r) / 2;
+    }
+
+    size = 0;
+
+    for (i = l; i < sites->length; i++) {
+        hts_pos_t mod_left, mod_right;
+
+        if (use_strand && is_rev != sites->bp[i].rev)
+            continue;
+
+        if (is_rev) {
+            mod_left = sites->bp[i].left;
+            mod_right = sites->bp[i].right + tol;
+        } else {
+            if (sites->bp[i].left > tol) {
+                mod_left = sites->bp[i].left - tol;
+            } else {
+                mod_left = 0;
+            }
+            mod_right = sites->bp[i].right;
+        }
+
+        // This site ends further past pos than the longest site (plus
+        // tolerance) could reach back, so stop scanning.
+        if (pos + longest + tol < mod_right)
+            break;
+
+        if (pos >= mod_left && pos <= mod_right) {
+            if (is_rev) {
+                if (size < pos - sites->bp[i].left) {
+                    size = pos - sites->bp[i].left;
+                }
+            } else {
+                if (size < sites->bp[i].right - pos) {
+                    size = sites->bp[i].right - pos;
+                }
+            }
+        }
+    }
+
+    return size;
+}
+
+
+/*
+ * Clip `bases` reference bases from the left (5' in reference order) end of
+ * `rec`, writing the result to `rec_out`.  `rec` itself is not modified.
+ *
+ * rec       record to clip.
+ * rec_out   destination record; its data buffer is grown if needed.
+ * bases     number of reference-consuming bases to remove.
+ * clipping  soft_clip keeps SEQ/QUAL and adds an S op; hard_clip removes the
+ *           clipped bases from SEQ/QUAL and adds an H op (merged with any
+ *           existing leading hard clip).
+ *
+ * The output position is advanced past the removed reference bases.  When
+ * hard clipping at least l_qseq bases the output has no CIGAR and no
+ * sequence, only the name and aux data.
+ *
+ * Returns 0 on success, 1 if the output buffer could not be allocated.
+ */
+static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                         clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    uint32_t i, j;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    hts_pos_t new_pos = rec->core.pos;
+    uint32_t cig_type, cig_op;
+
+    // The CIGAR can gain at most two ops (8 bytes) relative to the input.
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+        return 0;
+    }
+
+    // Modify CIGAR
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Consume whole operations from the left until ref_remove reference bases
+    // are used up; stop at the first reference-consuming op that is longer
+    // than what is still to be removed.
+    for (i = 0; i < rec->core.n_cigar; i++) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;
+                }
+                new_pos += bam_cigar_oplen(orig_cigar[i]);
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    if (i < rec->core.n_cigar) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+
+        // account for the last operation
+        if (cig_type & 2) {
+            new_pos += ref_remove;
+        }
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+    } else {
+        // Every operation was consumed: the whole query is clipped.
+        qry_removed = rec->core.l_qseq;
+    }
+
+    j = 0;
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+        }
+        if (qry_removed > 0) {
+            new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+        }
+    }
+
+    if (i < rec->core.n_cigar
+        && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
+        // Keep what is left of the partially clipped operation.
+        new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+
+        // fill in the rest of the cigar
+        i++;
+
+        for (; i < rec->core.n_cigar; i++) {
+            new_cigar[j++] = orig_cigar[i];
+        }
+    }
+
+    rec_out->core.n_cigar = j;
+
+    if (clipping == soft_clip) {
+        qry_removed = 0; // Copy all the sequence and confidence values
+    }
+
+    // SEQ is packed two bases per byte, so round the remaining length up.
+    new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+    // Copy remaining SEQ
+    if ((qry_removed & 1) == 0) {
+        // Byte-aligned start.  The + 1 keeps the final half-filled byte when
+        // an odd number of bases remain, for hard clipping as well as soft.
+        memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
+               (rec->core.l_qseq - qry_removed + 1) / 2);
+    } else {
+        // Odd offset: shift every base along by one nibble.
+        uint8_t *in = orig_seq + qry_removed / 2;
+        uint8_t *out = bam_get_seq(rec_out);
+        uint32_t k;
+        for (k = qry_removed; k < rec->core.l_qseq - 1; k += 2) {
+            *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
+            in++;
+        }
+        if (k < rec->core.l_qseq) {
+            *out++ = (in[0] & 0x0f) << 4;
+        }
+        assert(out == new_qual);
+    }
+
+    // Copy remaining QUAL, skipping the qualities of the removed bases
+    // (qry_removed is 0 here for soft clipping, so everything is copied).
+    memmove(new_qual, orig_qual + qry_removed, rec->core.l_qseq - qry_removed);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Move AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    // put in new pos
+    rec_out->core.pos = new_pos;
+
+    return 0;
+}
+
+
+// Clip `bases` reference bases from the right-hand end of `rec`, writing the
+// result to `rec_out`; `rec` itself is not modified.
+// With soft_clip, SEQ/QUAL are copied in full and an S op is added; with
+// hard_clip the clipped bases are dropped and folded into a trailing H op.
+// The alignment start position is copied unchanged from `rec`.
+// Returns 0 on success, 1 if the output buffer could not be allocated.
+static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+ clipping_type clipping) {
+ uint32_t *orig_cigar = bam_get_cigar(rec);
+ uint8_t *orig_seq = bam_get_seq(rec);
+ uint8_t *orig_qual = bam_get_qual(rec);
+ uint8_t *orig_aux = bam_get_aux(rec);
+ uint32_t *new_cigar;
+ uint32_t new_n_cigar = 0;
+ uint8_t *new_qual;
+ size_t orig_l_aux = bam_get_l_aux(rec);
+ int32_t i;
+ int32_t j;
+ uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+ uint32_t cig_type, cig_op;
+
+ // Make sure the output buffer can hold the input plus 8 spare bytes.
+ if (rec->l_data + 8 > rec_out->m_data) {
+ uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+ if (!new_data) {
+ fprintf(stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+ return 1;
+ }
+ rec_out->data = new_data;
+ rec_out->m_data = rec->l_data + 8;
+ }
+
+ // Copy core data & name
+ memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+ memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+ // Hard clipping at least the whole read: emit name and aux data only.
+ if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+ return 0;
+ }
+
+ // Modify CIGAR here
+ new_cigar = bam_get_cigar(rec_out);
+
+ // Walk the CIGAR backwards, consuming whole operations until ref_remove
+ // reference bases are used up; stop at the first reference-consuming op
+ // that is longer than what is still to be removed.
+ for (i = rec->core.n_cigar - 1; i >= 0; --i) {
+ cig_op = bam_cigar_op(orig_cigar[i]);
+ cig_type = bam_cigar_type(cig_op);
+
+ if (cig_op == BAM_CHARD_CLIP) {
+ hardclip += bam_cigar_oplen(orig_cigar[i]);
+ } else {
+ if (cig_type & 2) {
+ if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+ ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+ } else {
+ break;
+ }
+ }
+ if (cig_type & 1) {
+ qry_removed += bam_cigar_oplen(orig_cigar[i]);
+ }
+ }
+ }
+
+ // Choose j, the slot in new_cigar that receives the last clip operation.
+ // Operation i (shortened) and everything before it are then written
+ // backwards from there.
+ if (i >= 0) {
+ cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+ if (cig_type & 1) {
+ qry_removed += ref_remove;
+ }
+ j = i;
+ if (qry_removed > 0) j++;
+ if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
+ } else {
+ // Every operation was consumed: the whole query is clipped.
+ qry_removed = rec->core.l_qseq;
+ j = 0;
+ if (hardclip > 0 && clipping == soft_clip) j++;
+ }
+
+ if (clipping == hard_clip && hardclip + qry_removed > 0) {
+ new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+ new_n_cigar++;
+ }
+ if (clipping == soft_clip) {
+ if (hardclip > 0) {
+ new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+ new_n_cigar++;
+ // The soft clip goes in the slot just before the hard clip.
+ if (qry_removed > 0) --j;
+ }
+ if (qry_removed > 0) {
+ new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+ new_n_cigar++;
+ }
+ }
+
+ // Write what is left of the partially clipped operation.
+ if (j > 0) {
+ new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+ new_n_cigar++;
+ }
+
+ // fill in the rest of the cigar
+ while (j > 0) {
+ new_cigar[--j] = orig_cigar[--i];
+ new_n_cigar++;
+ }
+
+ rec_out->core.n_cigar = new_n_cigar;
+
+ if (clipping == soft_clip)
+ qry_removed = 0; // Copy all the sequence and confidence values
+
+ // SEQ is packed two bases per byte, hence the rounded-up half length.
+ new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+ // Copy remaining SEQ
+ memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
+
+ // Copy remaining QUAL
+ memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+
+ // Set new l_qseq
+ rec_out->core.l_qseq -= qry_removed;
+
+ // Copy AUX
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ // Set new l_data
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+}
+
+
+/*
+ * Sum the lengths of all query-consuming CIGAR operations in `b`, leaving
+ * out soft clips.
+ */
+static hts_pos_t active_query_len(bam1_t *b) {
+    const uint32_t *ops = bam_get_cigar(b);
+    hts_pos_t total = 0;
+    uint32_t n;
+
+    for (n = 0; n < b->core.n_cigar; n++) {
+        const uint32_t op = bam_cigar_op(ops[n]);
+
+        if (op == BAM_CSOFT_CLIP)
+            continue;               // clipped bases are not counted
+        if (!(bam_cigar_type(op) & 1))
+            continue;               // operation does not consume query
+
+        total += bam_cigar_oplen(ops[n]);
+    }
+
+    return total;
+}
+
+
+/* Exchange the two record pointers that `a` and `b` refer to. */
+static inline void swap_bams(bam1_t **a, bam1_t **b) {
+    bam1_t *const first = *a;
+    bam1_t *const second = *b;
+
+    *a = second;
+    *b = first;
+}
+
+
+/*
+ * Build the text of an OA:Z tag describing the alignment of `orig`.
+ * Format: (RNAME,POS,strand,CIGAR,MAPQ,NM;)+
+ *
+ * `oa_tag` is cleared first.  If `orig` already has an OA tag its value is
+ * copied in and the new entry appended to it.  NM is left empty when `orig`
+ * has no NM tag.
+ *
+ * Returns 0 on success, non-zero if any append to `oa_tag` failed.
+ *
+ * NOTE(review): the first field is written from bam_get_qname(), i.e. the
+ * read name, although the format calls for the reference name.  Fixing that
+ * needs the header, which this function is not given - confirm and address
+ * at the call site.
+ */
+static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
+    char strand;
+    uint8_t *nm_tag, *old_oa_tag;
+    uint32_t *cigar;
+    int64_t nm = 0;
+    int i, res = 0;
+
+    ks_clear(oa_tag);
+
+    // if there is an existing OA tag the new one gets appended to it
+    if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
+        res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0;
+    }
+
+    if (orig->core.flag & BAM_FREVERSE)
+        strand = '-';
+    else
+        strand = '+';
+
+    if ((nm_tag = bam_aux_get(orig, "NM"))) {
+        nm = bam_aux2i(nm_tag);
+    }
+
+    res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
+
+    for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
+        res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
+        res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
+    }
+
+    // Both branches start with the comma that separates CIGAR from MAPQ.
+    if (nm_tag) {
+        res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
+    } else {
+        res |= ksprintf(oa_tag, ",%d,;", orig->core.qual) < 0;
+    }
+
+    return res;
+}
+
+
+/*
+ * Clip reads whose ends fall inside the regions listed in a BED file.
+ *
+ * Reads the header and every record from `in`.  Mapped, non-QCFAIL records on
+ * a reference that has BED entries are clipped at their 5' end (or at both
+ * ends when param->both is set) and written to `out`; records that end up
+ * filtered are written to `reject` when it is non-NULL.  Run statistics go to
+ * param->stats_file, or to stderr when no file was requested or it cannot be
+ * opened.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
+                    clipping_type clipping, cl_param_t *param) {
+    int ret = 1, r, file_open = 0;
+
+    bam_hdr_t *header = NULL;
+    bam1_t *b = NULL, *b_tmp = NULL;
+    long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
+    long filtered = 0, written = 0, failed = 0;
+    kstring_t str = KS_INITIALIZE;
+    kstring_t oat = KS_INITIALIZE;
+    bed_entry_list_t *sites = NULL;
+    FILE *stats_fp = stderr;
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+
+    // Nothing has been allocated yet, so a plain return is enough here.
+    if (!bed_hash) {
+        fprintf(stderr, "[ampliconclip] error: out of memory when trying to create bed hash.\n");
+        return 1;
+    }
+
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+        fprintf(stderr, "[ampliconclip] error: unable to load bed file.\n");
+        goto fail;
+    }
+
+    if ((header = sam_hdr_read(in)) == NULL) {
+        fprintf(stderr, "[ampliconclip] error: could not read header\n");
+        goto fail;
+    }
+
+    // changing pos can ruin coordinate sort order
+    if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
+        const char *new_order = "unknown";
+
+        if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
+            fprintf(stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
+            goto fail;
+        }
+    }
+
+    ks_free(&str);
+
+    if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
+                        param->arg_list ? "CL" : NULL,
+                        param->arg_list ? param->arg_list : NULL,
+                        NULL) != 0) {
+        fprintf(stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
+    }
+    if (sam_hdr_write(out, header) < 0) {
+        fprintf(stderr, "[ampliconclip] error: could not write header.\n");
+        goto fail;
+    }
+
+    if (reject) {
+        if (sam_hdr_write(reject, header) < 0) {
+            fprintf(stderr, "[ampliconclip] error: could not write header to rejects file.\n");
+            goto fail;
+        }
+    }
+
+    // b holds the current record, b_tmp receives the clipped copy; the two
+    // are swapped after every successful trim.
+    b = bam_init1();
+    b_tmp = bam_init1();
+    if (!b || !b_tmp) {
+        fprintf(stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
+        goto fail;
+    }
+
+    int32_t last_tid = -1;
+    int ref_found = 0;
+
+    while ((r = sam_read1(in, header, b)) >= 0) {
+        hts_pos_t pos;
+        int is_rev;
+        int p_size;
+        int been_clipped = 0, filter = 0;
+        int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
+        khiter_t itr;
+
+        l_count++;
+
+        // Only look the reference up again when it differs from the
+        // previous record's.
+        if (b->core.tid != last_tid) {
+            const char *ref_name;
+
+            ref_found = 0;
+            last_tid = b->core.tid;
+
+            if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
+                itr = kh_get(bed_list_hash, bed_hash, ref_name);
+
+                if (itr != kh_end(bed_hash)) {
+                    sites = &kh_val(bed_hash, itr);
+                    ref_found = 1;
+                }
+            }
+        }
+
+        if (!(b->core.flag & exclude) && ref_found) {
+            // Capture the pre-clip alignment before anything is modified.
+            if (param->oa_tag)
+                if (tag_original_data(b, &oat))
+                    goto fail;
+
+            if (!param->both) {
+                // Clip a single end: the alignment end for reverse reads,
+                // the alignment start otherwise.
+                if (bam_is_rev(b)) {
+                    pos = bam_endpos(b);
+                    is_rev = 1;
+                } else {
+                    pos = b->core.pos;
+                    is_rev = 0;
+                }
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (is_rev) {
+                        if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        r_count++;
+                    } else {
+                        if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        f_count++;
+                    }
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        uint8_t *tag;
+
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+
+                    been_clipped = 1;
+                } else {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            } else {
+                int left = 0, right = 0;
+
+                // left first
+                pos = b->core.pos;
+                is_rev = 0;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    f_count++;
+                    left = 1;
+                    been_clipped = 1;
+                }
+
+                // the right
+                pos = bam_endpos(b);
+                is_rev = 1;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    r_count++;
+                    right = 1;
+                    been_clipped = 1;
+                }
+
+                if (left || right) {
+                    uint8_t *tag;
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+                }
+
+                if (left && right) {
+                    b_count++;
+                } else if (!left && !right) {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            }
+
+            // Length based QC-fail / filtering; a negative threshold
+            // switches the corresponding check off.
+            if (param->fail_len >= 0 || param->filter_len >= 0) {
+                hts_pos_t aql = active_query_len(b);
+
+                if (param->fail_len >= 0 && aql <= param->fail_len) {
+                    b->core.flag |= BAM_FQCFAIL;
+                }
+
+                if (param->filter_len >= 0 && aql <= param->filter_len) {
+                    filter = 1;
+                }
+            }
+
+            if (b->core.flag & BAM_FQCFAIL) {
+                failed++;
+            }
+
+            if (param->write_clipped && !been_clipped) {
+                filter = 1;
+            }
+
+        } else {
+            l_exclude++;
+
+            if (param->unmapped) {
+                filter = 1;
+            }
+        }
+
+        if (!filter) {
+            if (sam_write1(out, header, b) < 0) {
+                fprintf(stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
+                goto fail;
+            }
+
+            written++;
+        } else {
+            if (reject) {
+                if (sam_write1(reject, header, b) < 0) {
+                    fprintf(stderr, "[ampliconclip] error: could not write to reject file %s\n",
+                            param->rejects_file);
+                    goto fail;
+                }
+            }
+
+            filtered++;
+        }
+    }
+
+    // sam_read1() stopped the loop: -1 is treated as end of input, anything
+    // lower as a read error.
+    if (r < -1) {
+        fprintf(stderr, "[ampliconclip] error: failed to read input.\n");
+        goto fail;
+    }
+
+    if (param->stats_file) {
+        // Open into a temporary so that a failed fopen() leaves stats_fp
+        // pointing at stderr instead of NULL.
+        FILE *fp = fopen(param->stats_file, "w");
+
+        if (fp == NULL) {
+            fprintf(stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
+        } else {
+            stats_fp = fp;
+            file_open = 1;
+        }
+    }
+
+    // arg_list may be NULL; never hand NULL to %s.
+    fprintf(stats_fp, "COMMAND: %s\n"
+                      "TOTAL READS: %ld\n"
+                      "TOTAL CLIPPED: %ld\n"
+                      "FORWARD CLIPPED: %ld\n"
+                      "REVERSE CLIPPED: %ld\n"
+                      "BOTH CLIPPED: %ld\n"
+                      "NOT CLIPPED: %ld\n"
+                      "EXCLUDED: %ld\n"
+                      "FILTERED: %ld\n"
+                      "FAILED: %ld\n"
+                      "WRITTEN: %ld\n", param->arg_list ? param->arg_list : "", l_count, f_count + r_count,
+                      f_count, r_count, b_count, n_count, l_exclude,
+                      filtered, failed, written);
+
+    if (file_open) {
+        fclose(stats_fp);
+    }
+
+    ret = 0;
+
+fail:
+    destroy_bed_hash(bed_hash);
+    ks_free(&str);   // still owned here if the sort order update failed
+    ks_free(&oat);
+    sam_hdr_destroy(header);
+    bam_destroy1(b);
+    bam_destroy1(b_tmp);
+    return ret;
+}
+
+
+/* Print the ampliconclip command line help to stderr. */
+static void usage(void) {
+    // One entry per option line, emitted in order.
+    static const char *const option_help[] = {
+        " -b FILE BED file of regions (eg amplicon primers) to be removed.\n",
+        " -o FILE output file name (default stdout).\n",
+        " -f FILE write stats to file name (default stderr)\n",
+        " -u Output uncompressed data\n",
+        " --soft-clip soft clip amplicon primers from reads (default)\n",
+        " --hard-clip hard clip amplicon primers from reads.\n",
+        " --both-ends clip on both 5' and 3' ends.\n",
+        " --strand use strand data from BED file to match read direction.\n",
+        " --clipped only output clipped reads.\n",
+        " --fail mark unclipped, mapped reads as QCFAIL.\n",
+        " --filter-len INT do not output reads INT size or shorter.\n",
+        " --fail-len INT mark as QCFAIL reads INT size or shorter.\n",
+        " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n",
+        " --rejects-file FILE file to write filtered reads.\n",
+        " --original for clipped entries add an OA tag with original data.\n",
+        " --keep-tag for clipped entries keep the old NM and MD tags.\n",
+        " --tolerance match region within this number of bases, default 5.\n",
+        " --no-PG do not add an @PG line.\n"
+    };
+    size_t n_lines = sizeof(option_help) / sizeof(option_help[0]);
+    size_t k;
+
+    fputs("Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n", stderr);
+    fputs("Option: \n", stderr);
+
+    for (k = 0; k < n_lines; k++) {
+        fputs(option_help[k], stderr);
+    }
+
+    // Help for the shared samtools options comes after the tool's own.
+    sam_global_opt_help(stderr, "-.O..@-.");
+
+    fputs("\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
+          "Default clipping is only on the 5' end.\n\n", stderr);
+}
+
+
+/*
+ * Entry point for "samtools ampliconclip".
+ *
+ * Parses the command line, opens the input, output and optional rejects
+ * files, sets up an optional thread pool and hands over to bam_clip().
+ * Every exit after option parsing goes through one clean-up path so that
+ * files, the thread pool and the global option state are always released.
+ *
+ * Returns 0 on success and 1 on failure; an unrecognised option prints the
+ * usage text and calls exit(1).
+ */
+int amplicon_clip_main(int argc, char **argv) {
+    int c, ret = 1;
+    char wmode[4] = {'w', 'b', 0, 0};
+    char *bedfile = NULL, *fnout = "-";
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool p = {NULL, 0};
+    samFile *in = NULL, *out = NULL, *reject = NULL;
+    clipping_type clipping = soft_clip;
+    // Initialiser order follows the cl_param_t field order.
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+        {"no-PG", no_argument, NULL, 1002},
+        {"soft-clip", no_argument, NULL, 1003},
+        {"hard-clip", no_argument, NULL, 1004},
+        {"strand", no_argument, NULL, 1005},
+        {"clipped", no_argument, NULL, 1006},
+        {"fail", no_argument, NULL, 1007},
+        {"both-ends", no_argument, NULL, 1008},
+        {"filter-len", required_argument, NULL, 1009},
+        {"fail-len", required_argument, NULL, 1010},
+        {"no-excluded", no_argument, NULL, 1011},
+        {"rejects-file", required_argument, NULL, 1012},
+        {"original", no_argument, NULL, 1013},
+        {"keep-tag", no_argument, NULL, 1014},
+        {"tolerance", required_argument, NULL, 1015},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
+        switch (c) {
+            case 'b': bedfile = optarg; break;
+            case 'o': fnout = optarg; break;
+            case 'f': param.stats_file = optarg; break;
+            case 'u': wmode[2] = '0'; break;
+            case 1002: param.add_pg = 0; break;
+            case 1003: clipping = soft_clip; break;
+            case 1004: clipping = hard_clip; break;
+            case 1005: param.use_strand = 1; break;
+            case 1006: param.write_clipped = 1; break;
+            case 1007: param.mark_fail = 1; break;
+            case 1008: param.both = 1; break;
+            case 1009: param.filter_len = atoi(optarg); break;
+            case 1010: param.fail_len = atoi(optarg); break;
+            case 1011: param.unmapped = 1; break;
+            case 1012: param.rejects_file = optarg; break;
+            case 1013: param.oa_tag = 1; break;
+            case 1014: param.del_tag = 0; break;
+            case 1015: param.tol = atoi(optarg); break;
+            default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                      /* else fall-through */
+            case '?': usage(); exit(1);
+        }
+    }
+
+    // A BED file and one input file are both mandatory.
+    if (!bedfile || optind + 1 > argc) {
+        usage();
+        goto cleanup;
+    }
+
+    if (param.tol < 0) {
+        fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d,"
+                        " reseting tolerance to default of 5.\n", param.tol);
+        param.tol = 5;
+    }
+
+    if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open input file");
+        goto cleanup;
+    }
+
+    sam_open_mode(wmode+1, fnout, NULL);
+
+    if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open output file");
+        goto cleanup;
+    }
+
+    if (param.rejects_file) {
+        sam_open_mode(wmode+1, param.rejects_file, NULL);
+
+        if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
+            print_error_errno("ampliconclip", "cannot open rejects file");
+            goto cleanup;
+        }
+    }
+
+    if (ga.nthreads > 0) {
+        if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+            fprintf(stderr, "[ampliconclip] error: cannot create thread pool.\n");
+            goto cleanup;
+        }
+        hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+        hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+
+        if (reject) {
+            hts_set_opt(reject, HTS_OPT_THREAD_POOL, &p);
+        }
+    }
+
+    // NOTE(review): argc + 1 / argv - 1 presumably undoes the shift applied
+    // by the samtools command dispatcher -- confirm against the caller.
+    param.arg_list = stringify_argv(argc + 1, argv - 1);
+
+    ret = bam_clip(in, out, reject, bedfile, clipping, &param);
+
+cleanup:
+    if (in) {
+        sam_close(in);
+    }
+
+    if (out) {
+        if (sam_close(out) < 0) {
+            // Report the real output name; argv[optind+1] is not the output
+            // file and may not even exist.
+            fprintf(stderr, "[ampliconclip] error: error while closing output file %s.\n", fnout);
+            ret = 1;
+        }
+    }
+
+    if (reject) {
+        if (sam_close(reject) < 0) {
+            fprintf(stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
+            ret = 1;
+        }
+    }
+
+    if (p.pool) hts_tpool_destroy(p.pool);
+
+    sam_global_args_free(&ga);
+    free(param.arg_list);
+
+    return ret;
+}
+
--- /dev/null
+#include "samtools.pysam.h"
+
+/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
+ from the 5' end.
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Authors: Andrew Whitwham <aw7@sanger.ac.uk>
+ Rob Davies <rmd+git@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "htslib/thread_pool.h"
+#include "sam_opts.h"
+#include <htslib/hts.h>
+#include "htslib/hfile.h"
+#include "htslib/kstring.h"
+#include "htslib/sam.h"
+#include "samtools.h"
+#include "bam_ampliconclip.h"
+
+typedef enum { // how matched bases are removed by bam_trim_left()/bam_trim_right()
+ soft_clip, // keep SEQ/QUAL as they are and add a soft clip (S) CIGAR op
+ hard_clip // drop the clipped bases from SEQ/QUAL and add a hard clip (H) CIGAR op
+} clipping_type;
+
+typedef struct { // run-time options for ampliconclip, read by bam_clip() and matching_clip_site()
+ int add_pg; // non-zero: add an @PG line to the output header
+ int use_strand; // non-zero: load the BED strand column and only match regions on the read's strand
+ int write_clipped; // non-zero: filter out reads that were not clipped
+ int mark_fail; // non-zero: set BAM_FQCFAIL on reads that matched no region
+ int both; // non-zero: try to clip both ends of every read instead of one end
+ int fail_len; // >= 0: set BAM_FQCFAIL when the active query length is <= this; negative disables
+ int filter_len; // >= 0: filter reads whose active query length is <= this; negative disables
+ int unmapped; // non-zero: filter out excluded reads (unmapped, QCFAIL, or on a reference without BED entries)
+ int oa_tag; // non-zero: add an OA tag describing the original alignment to clipped reads
+ int del_tag; // non-zero: delete the NM and MD tags from clipped reads
+ int tol; // tolerance in bases applied when matching a read end to a region
+ char *arg_list; // command line text used for the @PG CL field and the stats COMMAND line; may be NULL
+ char *stats_file; // file name for the statistics; NULL means the default stream
+ char *rejects_file; // file name used in rejects-file error messages; NULL when no rejects file was requested
+} cl_param_t;
+
+
+/* qsort() comparator: orders BED entries by ascending right-hand coordinate. */
+static int bed_entry_sort(const void *av, const void *bv) {
+    const bed_entry_t *lhs = (const bed_entry_t *) av;
+    const bed_entry_t *rhs = (const bed_entry_t *) bv;
+
+    if (lhs->right < rhs->right) {
+        return -1;
+    }
+
+    return lhs->right == rhs->right ? 0 : 1;
+}
+
+
+/*
+ * Load a BED file into a hash of per-reference region lists.
+ *
+ *   infile       path of the BED file, opened with hopen()
+ *   get_strand   non-zero: also parse the strand from the sixth column;
+ *                anything other than '+' or '-' is an error
+ *   sort_by_pos  non-zero: sort every list by right-hand coordinate
+ *   bed_lists    hash, keyed on reference name, that receives the entries;
+ *                keys are strdup()ed copies owned by the hash
+ *
+ * Blank lines, '#' comments and "track " / "browser " lines are skipped.
+ *
+ * Returns 0 when at least one reference was loaded, 1 on any error or when
+ * the hash ends up empty.
+ */
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+    hFILE *fp;
+    int line_count = 0, ret;
+    int64_t left, right;
+    kstring_t line = KS_INITIALIZE;
+    bed_entry_list_t *list;
+    khiter_t bed_itr;
+
+    if ((fp = hopen(infile, "r")) == NULL) {
+        print_error_errno("amplicon", "unable to open file %s.", infile);
+        return 1;
+    }
+
+    char ref[1024];
+
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        line_count++;
+        int hret;
+        char strand;
+
+        if (line.l == 0 || *line.s == '#') continue;
+        if (strncmp(line.s, "track ", 6) == 0) continue;
+        if (strncmp(line.s, "browser ", 8) == 0) continue;
+
+        if (get_strand) {
+            // Columns 4 and 5 are skipped; column 6 is the strand.
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
+                       ref, &left, &right, &strand) != 4) {
+                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
+                                         "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                        line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
+                       ref, &left, &right) != 3) {
+                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
+                                         "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                        line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        bed_itr = kh_get(bed_list_hash, bed_lists, ref);
+
+        if (bed_itr == kh_end(bed_lists)) { // new ref entry
+            char *ref_name = strdup(ref); // need a copy for the hash key
+
+            if (!ref_name) {
+                fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
+                ret = 1;
+                goto error;
+            }
+
+            bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
+
+            if (hret > 0) {
+                list = &kh_val(bed_lists, bed_itr);
+
+                // initialise the new hash entry
+                list->longest = 0;
+                list->size = 0;
+                list->length = 0;
+                list->bp = NULL;
+            } else {
+                // The hash did not take ownership of the key copy.
+                free(ref_name);
+                fprintf(samtools_stderr, "[amplicon] error: ref hashing failure.\n");
+                ret = 1;
+                goto error;
+            }
+        } else { // existing ref
+            list = &kh_val(bed_lists, bed_itr);
+        }
+
+        // Grow the entry array by roughly half its size when it is full.
+        if (list->length == list->size) {
+            bed_entry_t *tmp;
+
+            list->size += list->size / 2 + 256;
+
+            if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) {
+                fprintf(samtools_stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
+                ret = 1;
+                goto error;
+            }
+
+            list->bp = tmp;
+        }
+
+        list->bp[list->length].left = left;
+        list->bp[list->length].right = right;
+
+        if (get_strand) {
+            if (strand == '+') {
+                list->bp[list->length].rev = 0;
+            } else if (strand == '-') {
+                list->bp[list->length].rev = 1;
+            } else {
+                fprintf(samtools_stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
+                        line_count, strand);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            // Keep the field defined even when strand is not requested.
+            list->bp[list->length].rev = 0;
+        }
+
+        // Track the widest region per reference.
+        if (right - left > list->longest)
+            list->longest = right - left;
+
+        list->length++;
+    }
+
+    if (sort_by_pos) {
+        for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
+            if (kh_exist(bed_lists, bed_itr)) {
+                list = &kh_val(bed_lists, bed_itr);
+                qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
+            }
+        }
+    }
+
+    if (kh_size(bed_lists) > 0) { // any entries
+        ret = 0;
+    } else {
+        ret = 1;
+    }
+
+error:
+    ks_free(&line);
+
+    if (hclose(fp) != 0) {
+        fprintf(samtools_stderr, "[amplicon] warning: failed to close %s\n", infile);
+    }
+
+    return ret;
+}
+
+
+/*
+ * Free a hash filled by load_bed_file_multi_ref(): every entry array, every
+ * strdup()ed key and finally the hash itself.  A NULL hash is ignored.
+ */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
+    khiter_t itr;
+
+    // kh_end() dereferences the table, so bail out before touching it.
+    if (!hash) {
+        return;
+    }
+
+    for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) {
+        if (kh_exist(hash, itr)) {
+            free(kh_val(hash, itr).bp);
+            free((char *)kh_key(hash, itr));
+            kh_key(hash, itr) = NULL;
+        }
+    }
+
+    kh_destroy(bed_list_hash, hash);
+}
+
+
+/*
+ * Work out how many bases to clip from a read end at position `pos`.
+ *
+ *   sites       region list for the read's reference, sorted by right-hand
+ *               coordinate
+ *   pos         alignment start (is_rev == 0) or alignment end (is_rev != 0)
+ *   is_rev      non-zero when the right-hand end of the read is being tested
+ *   use_strand  non-zero: skip regions whose strand differs from is_rev
+ *   longest     length of the longest region in `sites`
+ *   param       supplies the matching tolerance (param->tol)
+ *
+ * Returns the largest clip size among the regions that contain `pos` (with
+ * the tolerance applied on the side facing away from the read), or 0 when
+ * no region matches.
+ */
+static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
+                              int is_rev, int use_strand, int64_t longest,
+                              cl_param_t *param) {
+    int i, size;  // may need this to be variable
+    int tol = param->tol;
+    int l = 0, mid = sites->length / 2, r = sites->length;
+    // Kept at full position width so that large coordinates are not
+    // truncated before the search.
+    hts_pos_t pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
+
+    // Binary search for a starting index: entries before `l` end at or
+    // before pos_tol.
+    while (r - l > 1) {
+        if (sites->bp[mid].right <= pos_tol) {
+            l = mid;
+        } else {
+            r = mid;
+        }
+        mid = (l + r) / 2;
+    }
+
+    size = 0;
+
+    for (i = l; i < sites->length; i++) {
+        hts_pos_t mod_left, mod_right;
+
+        if (use_strand && is_rev != sites->bp[i].rev)
+            continue;
+
+        // Widen the region by the tolerance on its outer edge only.
+        if (is_rev) {
+            mod_left = sites->bp[i].left;
+            mod_right = sites->bp[i].right + tol;
+        } else {
+            if (sites->bp[i].left > tol) {
+                mod_left = sites->bp[i].left - tol;
+            } else {
+                mod_left = 0;
+            }
+            mod_right = sites->bp[i].right;
+        }
+
+        // The list is sorted on `right`, so once a region ends further away
+        // than the longest region could reach, no later one can match.
+        if (pos + longest + tol < mod_right)
+            break;
+
+        if (pos >= mod_left && pos <= mod_right) {
+            if (is_rev) {
+                if (size < pos - sites->bp[i].left) {
+                    size = pos - sites->bp[i].left;
+                }
+            } else {
+                if (size < sites->bp[i].right - pos) {
+                    size = sites->bp[i].right - pos;
+                }
+            }
+        }
+    }
+
+    return size;
+}
+
+
+/*
+ * Clip `bases` reference positions from the start of `rec`.
+ *
+ * The clipped record is built in `rec_out`, whose data buffer is enlarged
+ * when it cannot hold the source record plus 8 spare bytes; `rec` itself is
+ * not modified.  With soft_clip the sequence and qualities are copied whole
+ * and a soft clip op is added to the CIGAR; with hard_clip the clipped
+ * bases and their qualities are dropped and a hard clip op is added.  The
+ * alignment position is moved past the clipped reference bases.
+ *
+ * Returns 0 on success, 1 if memory for rec_out cannot be allocated.
+ */
+static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                         clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    uint32_t i, j;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    uint32_t remaining;
+    hts_pos_t new_pos = rec->core.pos;
+    uint32_t cig_type, cig_op;
+
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    // Hard clipping the whole read leaves only the name and the aux data.
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+        return 0;
+    }
+
+    // Modify CIGAR
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Consume whole CIGAR ops from the left until `bases` reference
+    // positions are used up.  bam_cigar_type(): bit 1 = op consumes query,
+    // bit 2 = op consumes reference.
+    for (i = 0; i < rec->core.n_cigar; i++) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;  // this op is only partly clipped
+                }
+                new_pos += bam_cigar_oplen(orig_cigar[i]);
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    if (i < rec->core.n_cigar) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+
+        // account for the last operation
+        if (cig_type & 2) {
+            new_pos += ref_remove;
+        }
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+    } else {
+        // Every op was consumed: the whole query is clipped.
+        qry_removed = rec->core.l_qseq;
+    }
+
+    // Leading clip op(s): one merged H for hard clipping, or the original H
+    // followed by a new S for soft clipping.
+    j = 0;
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+        }
+        if (qry_removed > 0) {
+            new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+        }
+    }
+
+    if (i < rec->core.n_cigar
+        && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
+        new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+
+        // fill in the rest of the cigar
+        i++;
+
+        for (; i < rec->core.n_cigar; i++) {
+            new_cigar[j++] = orig_cigar[i];
+        }
+    }
+
+    rec_out->core.n_cigar = j;
+
+    if (clipping == soft_clip) {
+        qry_removed = 0; // Copy all the sequence and confidence values
+    }
+
+    // Bases (and quality values) that stay in the output record.
+    remaining = rec->core.l_qseq - qry_removed;
+
+    // SEQ is packed two bases per byte, so round the byte count up.
+    new_qual = bam_get_seq(rec_out) + (remaining + 1) / 2;
+
+    // Copy remaining SEQ
+    if ((qry_removed & 1) == 0) {
+        // Byte aligned: copy every remaining byte, including the final
+        // half-filled one when `remaining` is odd.
+        memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
+               (remaining + 1) / 2);
+    } else {
+        // Odd offset: every output byte straddles two input bytes.
+        uint8_t *in = orig_seq + qry_removed / 2;
+        uint8_t *out = bam_get_seq(rec_out);
+        uint32_t k;
+        for (k = qry_removed; k < rec->core.l_qseq - 1; k += 2) {
+            *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
+            in++;
+        }
+        if (k < rec->core.l_qseq) {
+            *out++ = (in[0] & 0x0f) << 4;
+        }
+        assert(out == new_qual);
+    }
+
+    // Copy remaining QUAL, skipping the values of the removed bases so that
+    // the qualities stay aligned with the sequence.
+    memmove(new_qual, orig_qual + qry_removed, remaining);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Move AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    // put in new pos
+    rec_out->core.pos = new_pos;
+
+    return 0;
+}
+
+// Clips `bases` reference positions from the end of rec's alignment, building the result in rec_out; returns 0 on success, 1 if rec_out's buffer cannot be enlarged.
+static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+ clipping_type clipping) {
+ uint32_t *orig_cigar = bam_get_cigar(rec);
+ uint8_t *orig_seq = bam_get_seq(rec);
+ uint8_t *orig_qual = bam_get_qual(rec);
+ uint8_t *orig_aux = bam_get_aux(rec);
+ uint32_t *new_cigar;
+ uint32_t new_n_cigar = 0; // number of ops written to new_cigar
+ uint8_t *new_qual;
+ size_t orig_l_aux = bam_get_l_aux(rec);
+ int32_t i; // index into orig_cigar, walked from the last op towards the first
+ int32_t j; // index into new_cigar, filled from the back
+ uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+ uint32_t cig_type, cig_op;
+ // Make sure rec_out can hold the source record plus 8 spare bytes.
+ if (rec->l_data + 8 > rec_out->m_data) {
+ uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+ if (!new_data) {
+ fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+ return 1;
+ }
+ rec_out->data = new_data;
+ rec_out->m_data = rec->l_data + 8;
+ }
+
+ // Copy the core fields and the query name
+ memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+ memcpy(rec_out->data, rec->data, rec->core.l_qname);
+ // Hard clipping at least the whole read: keep only the name and the aux data.
+ if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+ return 0;
+ }
+
+ // Build the new CIGAR directly in rec_out
+ new_cigar = bam_get_cigar(rec_out);
+ // Consume whole ops from the right until `bases` reference positions are used up.
+ for (i = rec->core.n_cigar - 1; i >= 0; --i) {
+ cig_op = bam_cigar_op(orig_cigar[i]);
+ cig_type = bam_cigar_type(cig_op); // bit 1: consumes query, bit 2: consumes reference
+
+ if (cig_op == BAM_CHARD_CLIP) {
+ hardclip += bam_cigar_oplen(orig_cigar[i]);
+ } else {
+ if (cig_type & 2) {
+ if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+ ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+ } else {
+ break; // op i is only partly clipped
+ }
+ }
+ if (cig_type & 1) {
+ qry_removed += bam_cigar_oplen(orig_cigar[i]);
+ }
+ }
+ }
+ // Work out j, the output index of the last clip op, from how many clip ops will be written.
+ if (i >= 0) {
+ cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+ if (cig_type & 1) {
+ qry_removed += ref_remove; // query bases lost from the partly clipped op
+ }
+ j = i;
+ if (qry_removed > 0) j++;
+ if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
+ } else {
+ qry_removed = rec->core.l_qseq; // every op was consumed: the whole query is clipped
+ j = 0;
+ if (hardclip > 0 && clipping == soft_clip) j++;
+ }
+ // Hard clip: one merged H op.  Soft clip: original H (if any) last, preceded by the new S op.
+ if (clipping == hard_clip && hardclip + qry_removed > 0) {
+ new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+ new_n_cigar++;
+ }
+ if (clipping == soft_clip) {
+ if (hardclip > 0) {
+ new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+ new_n_cigar++;
+ if (qry_removed > 0) --j; // step back so S lands just before H
+ }
+ if (qry_removed > 0) {
+ new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+ new_n_cigar++;
+ }
+ }
+ // Write what is left of the partly clipped op just before the clip op(s).
+ if (j > 0) {
+ new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+ new_n_cigar++;
+ }
+
+ // copy the untouched leading ops, working backwards
+ while (j > 0) {
+ new_cigar[--j] = orig_cigar[--i];
+ new_n_cigar++;
+ }
+
+ rec_out->core.n_cigar = new_n_cigar;
+
+ if (clipping == soft_clip)
+ qry_removed = 0; // soft clipping keeps all of the sequence and quality values
+
+ new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2; // SEQ packs two bases per byte, rounded up
+ // Copy the SEQ bytes that are kept (taken from the start of the read)
+ memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
+
+ // Copy the QUAL values that are kept
+ memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+
+ // Set new l_qseq
+ rec_out->core.l_qseq -= qry_removed;
+
+ // Copy AUX
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ // Set new l_data
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+}
+
+
+/*
+ * Sum the lengths of all query-consuming CIGAR operations of `b`, leaving
+ * soft clips out of the total.
+ */
+static hts_pos_t active_query_len(bam1_t *b) {
+    const uint32_t *op = bam_get_cigar(b);
+    const uint32_t *end = op + b->core.n_cigar;
+    hts_pos_t total = 0;
+
+    for (; op < end; op++) {
+        uint32_t code = bam_cigar_op(*op);
+
+        // Soft clipped bases are in the query but not "active".
+        if (code == BAM_CSOFT_CLIP) {
+            continue;
+        }
+
+        if (bam_cigar_type(code) & 1) {
+            total += bam_cigar_oplen(*op);
+        }
+    }
+
+    return total;
+}
+
+
+/* Exchange the two record pointers; the records themselves are untouched. */
+static inline void swap_bams(bam1_t **a, bam1_t **b) {
+    bam1_t *second = *b;
+
+    *b = *a;
+    *a = second;
+}
+
+
+// Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM;)+
+//
+// Builds the OA tag text for `orig` in `oa_tag`, which is cleared first.  An
+// existing OA value is kept and the new entry is appended to it.  The NM
+// field is left empty when the record has no NM tag.
+//
+// Returns 0 on success, non-zero if any append to the kstring failed.
+//
+// NOTE(review): the first field is written from bam_get_qname() (the query
+// name) although the format above names RNAME.  Using the reference name
+// would need the header, which is not passed in -- confirm against the SAM
+// tags specification before changing.
+static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
+    char strand;
+    uint8_t *nm_tag, *old_oa_tag;
+    uint32_t *cigar;
+    int64_t nm = 0;
+    int i, res = 0;
+
+    ks_clear(oa_tag);
+
+    // if there is an existing OA tag the new one gets appended to it
+    if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
+        const char *old_oa = bam_aux2Z(old_oa_tag);
+
+        // Guard the result: never hand NULL to %s.
+        if (old_oa) {
+            res |= ksprintf(oa_tag, "%s", old_oa) < 0;
+        }
+    }
+
+    if (orig->core.flag & BAM_FREVERSE)
+        strand = '-';
+    else
+        strand = '+';
+
+    if ((nm_tag = bam_aux_get(orig, "NM"))) {
+        nm = bam_aux2i(nm_tag);
+    }
+
+    // POS is written 1-based.
+    res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
+
+    for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
+        res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
+        res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
+    }
+
+    if (nm_tag) {
+        res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
+    } else {
+        // Same layout with an empty NM field; the leading comma separates
+        // CIGAR from MAPQ.
+        res |= ksprintf(oa_tag, ",%d,;", orig->core.qual) < 0;
+    }
+
+    return res;
+}
+
+
+/*
+ * Clip reads whose ends fall inside the regions listed in a BED file.
+ *
+ * Reads the header and every record from `in`.  Mapped, non-QCFAIL records on
+ * a reference that has BED entries are clipped at their 5' end (or at both
+ * ends when param->both is set) and written to `out`; records that end up
+ * filtered are written to `reject` when it is non-NULL.  Run statistics go to
+ * param->stats_file, or to samtools_stderr when no file was requested or it
+ * cannot be opened.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
+                    clipping_type clipping, cl_param_t *param) {
+    int ret = 1, r, file_open = 0;
+
+    bam_hdr_t *header = NULL;
+    bam1_t *b = NULL, *b_tmp = NULL;
+    long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
+    long filtered = 0, written = 0, failed = 0;
+    kstring_t str = KS_INITIALIZE;
+    kstring_t oat = KS_INITIALIZE;
+    bed_entry_list_t *sites = NULL;
+    FILE *stats_fp = samtools_stderr;
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+
+    // Nothing has been allocated yet, so a plain return is enough here.
+    if (!bed_hash) {
+        fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create bed hash.\n");
+        return 1;
+    }
+
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+        fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n");
+        goto fail;
+    }
+
+    if ((header = sam_hdr_read(in)) == NULL) {
+        fprintf(samtools_stderr, "[ampliconclip] error: could not read header\n");
+        goto fail;
+    }
+
+    // changing pos can ruin coordinate sort order
+    if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
+        const char *new_order = "unknown";
+
+        if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
+            fprintf(samtools_stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
+            goto fail;
+        }
+    }
+
+    ks_free(&str);
+
+    if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
+                        param->arg_list ? "CL" : NULL,
+                        param->arg_list ? param->arg_list : NULL,
+                        NULL) != 0) {
+        fprintf(samtools_stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
+    }
+    if (sam_hdr_write(out, header) < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] error: could not write header.\n");
+        goto fail;
+    }
+
+    if (reject) {
+        if (sam_hdr_write(reject, header) < 0) {
+            fprintf(samtools_stderr, "[ampliconclip] error: could not write header to rejects file.\n");
+            goto fail;
+        }
+    }
+
+    // b holds the current record, b_tmp receives the clipped copy; the two
+    // are swapped after every successful trim.
+    b = bam_init1();
+    b_tmp = bam_init1();
+    if (!b || !b_tmp) {
+        fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
+        goto fail;
+    }
+
+    int32_t last_tid = -1;
+    int ref_found = 0;
+
+    while ((r = sam_read1(in, header, b)) >= 0) {
+        hts_pos_t pos;
+        int is_rev;
+        int p_size;
+        int been_clipped = 0, filter = 0;
+        int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
+        khiter_t itr;
+
+        l_count++;
+
+        // Only look the reference up again when it differs from the
+        // previous record's.
+        if (b->core.tid != last_tid) {
+            const char *ref_name;
+
+            ref_found = 0;
+            last_tid = b->core.tid;
+
+            if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
+                itr = kh_get(bed_list_hash, bed_hash, ref_name);
+
+                if (itr != kh_end(bed_hash)) {
+                    sites = &kh_val(bed_hash, itr);
+                    ref_found = 1;
+                }
+            }
+        }
+
+        if (!(b->core.flag & exclude) && ref_found) {
+            // Capture the pre-clip alignment before anything is modified.
+            if (param->oa_tag)
+                if (tag_original_data(b, &oat))
+                    goto fail;
+
+            if (!param->both) {
+                // Clip a single end: the alignment end for reverse reads,
+                // the alignment start otherwise.
+                if (bam_is_rev(b)) {
+                    pos = bam_endpos(b);
+                    is_rev = 1;
+                } else {
+                    pos = b->core.pos;
+                    is_rev = 0;
+                }
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (is_rev) {
+                        if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        r_count++;
+                    } else {
+                        if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        f_count++;
+                    }
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        uint8_t *tag;
+
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+
+                    been_clipped = 1;
+                } else {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            } else {
+                int left = 0, right = 0;
+
+                // left first
+                pos = b->core.pos;
+                is_rev = 0;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    f_count++;
+                    left = 1;
+                    been_clipped = 1;
+                }
+
+                // the right
+                pos = bam_endpos(b);
+                is_rev = 1;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    r_count++;
+                    right = 1;
+                    been_clipped = 1;
+                }
+
+                if (left || right) {
+                    uint8_t *tag;
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+                }
+
+                if (left && right) {
+                    b_count++;
+                } else if (!left && !right) {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            }
+
+            // Length based QC-fail / filtering; a negative threshold
+            // switches the corresponding check off.
+            if (param->fail_len >= 0 || param->filter_len >= 0) {
+                hts_pos_t aql = active_query_len(b);
+
+                if (param->fail_len >= 0 && aql <= param->fail_len) {
+                    b->core.flag |= BAM_FQCFAIL;
+                }
+
+                if (param->filter_len >= 0 && aql <= param->filter_len) {
+                    filter = 1;
+                }
+            }
+
+            if (b->core.flag & BAM_FQCFAIL) {
+                failed++;
+            }
+
+            if (param->write_clipped && !been_clipped) {
+                filter = 1;
+            }
+
+        } else {
+            l_exclude++;
+
+            if (param->unmapped) {
+                filter = 1;
+            }
+        }
+
+        if (!filter) {
+            if (sam_write1(out, header, b) < 0) {
+                fprintf(samtools_stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
+                goto fail;
+            }
+
+            written++;
+        } else {
+            if (reject) {
+                if (sam_write1(reject, header, b) < 0) {
+                    fprintf(samtools_stderr, "[ampliconclip] error: could not write to reject file %s\n",
+                            param->rejects_file);
+                    goto fail;
+                }
+            }
+
+            filtered++;
+        }
+    }
+
+    // sam_read1() stopped the loop: -1 is treated as end of input, anything
+    // lower as a read error.
+    if (r < -1) {
+        fprintf(samtools_stderr, "[ampliconclip] error: failed to read input.\n");
+        goto fail;
+    }
+
+    if (param->stats_file) {
+        // Open into a temporary so that a failed fopen() leaves stats_fp
+        // pointing at samtools_stderr instead of NULL.
+        FILE *fp = fopen(param->stats_file, "w");
+
+        if (fp == NULL) {
+            fprintf(samtools_stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
+        } else {
+            stats_fp = fp;
+            file_open = 1;
+        }
+    }
+
+    // arg_list may be NULL; never hand NULL to %s.
+    fprintf(stats_fp, "COMMAND: %s\n"
+                      "TOTAL READS: %ld\n"
+                      "TOTAL CLIPPED: %ld\n"
+                      "FORWARD CLIPPED: %ld\n"
+                      "REVERSE CLIPPED: %ld\n"
+                      "BOTH CLIPPED: %ld\n"
+                      "NOT CLIPPED: %ld\n"
+                      "EXCLUDED: %ld\n"
+                      "FILTERED: %ld\n"
+                      "FAILED: %ld\n"
+                      "WRITTEN: %ld\n", param->arg_list ? param->arg_list : "", l_count, f_count + r_count,
+                      f_count, r_count, b_count, n_count, l_exclude,
+                      filtered, failed, written);
+
+    if (file_open) {
+        fclose(stats_fp);
+    }
+
+    ret = 0;
+
+fail:
+    destroy_bed_hash(bed_hash);
+    ks_free(&str);   // still owned here if the sort order update failed
+    ks_free(&oat);
+    sam_hdr_destroy(header);
+    bam_destroy1(b);
+    bam_destroy1(b_tmp);
+    return ret;
+}
+
+
+/* Print the ampliconclip synopsis and option summary to samtools_stderr. */
+static void usage(void) {
+    /* Text emitted ahead of the shared global-option help. */
+    static const char synopsis_and_options[] =
+        "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n"
+        "Option: \n"
+        " -b FILE BED file of regions (eg amplicon primers) to be removed.\n"
+        " -o FILE output file name (default samtools_stdout).\n"
+        " -f FILE write stats to file name (default samtools_stderr)\n"
+        " -u Output uncompressed data\n"
+        " --soft-clip soft clip amplicon primers from reads (default)\n"
+        " --hard-clip hard clip amplicon primers from reads.\n"
+        " --both-ends clip on both 5' and 3' ends.\n"
+        " --strand use strand data from BED file to match read direction.\n"
+        " --clipped only output clipped reads.\n"
+        " --fail mark unclipped, mapped reads as QCFAIL.\n"
+        " --filter-len INT do not output reads INT size or shorter.\n"
+        " --fail-len INT mark as QCFAIL reads INT size or shorter.\n"
+        " --no-excluded do not write excluded reads (unmapped or QCFAIL).\n"
+        " --rejects-file FILE file to write filtered reads.\n"
+        " --original for clipped entries add an OA tag with original data.\n"
+        " --keep-tag for clipped entries keep the old NM and MD tags.\n"
+        " --tolerance match region within this number of bases, default 5.\n"
+        " --no-PG do not add an @PG line.\n";
+
+    /* Closing description printed after the global-option help. */
+    static const char about[] =
+        "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
+        "Default clipping is only on the 5' end.\n\n";
+
+    fputs(synopsis_and_options, samtools_stderr);
+    sam_global_opt_help(samtools_stderr, "-.O..@-.");
+    fputs(about, samtools_stderr);
+}
+
+
+/*
+ * Entry point for the "ampliconclip" sub-command.
+ *
+ * Parses the command line, opens the input, the output and the optional
+ * rejects file, attaches one shared thread pool to all of them when threads
+ * were requested through the global options, and then calls bam_clip().
+ *
+ * argc, argv: the sub-command's argument vector; the first non-option
+ *             argument is the input file name.
+ *
+ * Returns bam_clip()'s result, or 1 when set-up fails or when closing the
+ * output or rejects file reports an error.  Every exit taken after option
+ * parsing goes through the single clean-up path at the end of the function,
+ * so open files, the thread pool and the global option state are released
+ * on the error paths as well.
+ */
+int amplicon_clip_main(int argc, char **argv) {
+    int c, ret = 1;
+    char wmode[4] = {'w', 'b', 0, 0};
+    char *bedfile = NULL, *fnout = "-";
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool p = {NULL, 0};
+    samFile *in = NULL, *out = NULL, *reject = NULL;
+    clipping_type clipping = soft_clip;
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+        {"no-PG", no_argument, NULL, 1002},
+        {"soft-clip", no_argument, NULL, 1003},
+        {"hard-clip", no_argument, NULL, 1004},
+        {"strand", no_argument, NULL, 1005},
+        {"clipped", no_argument, NULL, 1006},
+        {"fail", no_argument, NULL, 1007},
+        {"both-ends", no_argument, NULL, 1008},
+        {"filter-len", required_argument, NULL, 1009},
+        {"fail-len", required_argument, NULL, 1010},
+        {"no-excluded", no_argument, NULL, 1011},
+        {"rejects-file", required_argument, NULL, 1012},
+        {"original", no_argument, NULL, 1013},
+        {"keep-tag", no_argument, NULL, 1014},
+        {"tolerance", required_argument, NULL, 1015},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
+        switch (c) {
+            case 'b': bedfile = optarg; break;
+            case 'o': fnout = optarg; break;
+            case 'f': param.stats_file = optarg; break;
+            case 'u': wmode[2] = '0'; break;
+            case 1002: param.add_pg = 0; break;
+            case 1003: clipping = soft_clip; break;
+            case 1004: clipping = hard_clip; break;
+            case 1005: param.use_strand = 1; break;
+            case 1006: param.write_clipped = 1; break;
+            case 1007: param.mark_fail = 1; break;
+            case 1008: param.both = 1; break;
+            case 1009: param.filter_len = atoi(optarg); break;
+            case 1010: param.fail_len = atoi(optarg); break;
+            case 1011: param.unmapped = 1; break;
+            case 1012: param.rejects_file = optarg; break;
+            case 1013: param.oa_tag = 1; break;
+            case 1014: param.del_tag = 0; break;
+            case 1015: param.tol = atoi(optarg); break;
+            default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                     /* else fall-through */
+            case '?': usage(); samtools_exit(1);
+        }
+    }
+
+    /* Both a BED file and an input file name are mandatory. */
+    if (!bedfile || optind + 1 > argc) {
+        usage();
+        goto cleanup;
+    }
+
+    if (param.tol < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
+                " reseting tolerance to default of 5.\n", param.tol);
+        param.tol = 5;
+    }
+
+    if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open input file");
+        goto cleanup;
+    }
+
+    sam_open_mode(wmode+1, fnout, NULL);
+
+    if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open output file");
+        goto cleanup;
+    }
+
+    if (param.rejects_file) {
+        sam_open_mode(wmode+1, param.rejects_file, NULL);
+
+        if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
+            print_error_errno("ampliconclip", "cannot open rejects file");
+            goto cleanup;
+        }
+    }
+
+    if (ga.nthreads > 0) {
+        if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+            fprintf(samtools_stderr, "[ampliconclip] error: cannot create thread pool.\n");
+            goto cleanup;
+        }
+        hts_set_opt(in,  HTS_OPT_THREAD_POOL, &p);
+        hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+
+        if (reject) {
+            hts_set_opt(reject,  HTS_OPT_THREAD_POOL, &p);
+        }
+    }
+
+    /* bam_clip() prints this string with %s, so it must not be NULL. */
+    if ((param.arg_list = stringify_argv(argc + 1, argv - 1)) == NULL) {
+        fprintf(samtools_stderr, "[ampliconclip] error: cannot build command line string.\n");
+        goto cleanup;
+    }
+
+    ret = bam_clip(in, out, reject, bedfile, clipping, &param);
+
+cleanup:
+    /* Anything not yet opened or allocated is still NULL at this point. */
+    if (in) sam_close(in);
+
+    /* Report the name the output was opened with, i.e. fnout. */
+    if (out && sam_close(out) < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] error: error while closing output file %s.\n", fnout);
+        ret = 1;
+    }
+
+    if (reject && sam_close(reject) < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
+        ret = 1;
+    }
+
+    if (p.pool) hts_tpool_destroy(p.pool);
+
+    sam_global_args_free(&ga);
+    free(param.arg_list);
+
+    return ret;
+}
+
--- /dev/null
+/* bam_ampliconclip.h -- shared functions between amplicon clip/stats
+
+ Copyright (C) 2020-2021 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef BAM_AMPLICONCLIP_H
+#define BAM_AMPLICONCLIP_H
+
+#include "htslib/khash.h"
+
+/* A single entry read from the BED file.
+   NOTE(review): the field meanings below are inferred from the names only;
+   confirm against load_bed_file_multi_ref(). */
+typedef struct {
+ int64_t left;   /* presumably the left-hand coordinate of the region */
+ int64_t right;  /* presumably the right-hand coordinate of the region */
+ int rev;        /* presumably non-zero for a reverse-strand entry */
+} bed_entry_t;
+
+/* A list of bed_entry_t records; this is the value type stored in the
+   bed_list_hash map declared below.
+   NOTE(review): the field meanings are inferred from the names only;
+   confirm against the code that fills the list. */
+typedef struct {
+ bed_entry_t *bp;   /* the entries themselves */
+ int64_t longest;   /* presumably the length of the longest entry in bp */
+ int length;        /* presumably the number of entries in use */
+ int size;          /* presumably the number of entries allocated */
+} bed_entry_list_t;
+
+KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t);
+
+/* Brace initialiser for a BED list.
+   NOTE(review): this supplies five initialisers (the last one a nested
+   brace), whereas bed_entry_list_t above declares four scalar/pointer
+   members; confirm which type this macro is meant to initialise before
+   using it. */
+#define BED_LIST_INIT {NULL, 0, 0, 0, {0}}
+
+
+/* Functions shared between the amplicon clip and amplicon stats commands
+   (see the description at the top of this header). */
+
+/* Load the BED file named by infile into bed_lists.
+   NOTE(review): get_strand and sort_by_pos look like boolean switches and
+   the return value like a status code; confirm against the definition. */
+int load_bed_file_multi_ref(char *infile, int get_strand,
+ int sort_by_pos, khash_t(bed_list_hash) *bed_lists);
+
+/* Release a bed_list_hash; ampliconclip calls this during its clean-up. */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash);
+
+
+#endif /* BAM_AMPLICONCLIP_H */
{
if (s) {
uint8_t *p, *aux;
- aux = bam1_aux(b);
+ aux = bam_get_aux(b);
p = s - 2;
__skip_tag(s);
memmove(aux, p, s - p);
- b->data_len -= bam_get_l_aux(b) - (s - p);
+ b->l_data -= bam_get_l_aux(b) - (s - p);
} else {
- b->data_len -= bam_get_l_aux(b);
+ b->l_data -= bam_get_l_aux(b);
}
return 0;
}
{
if (s) {
uint8_t *p, *aux;
- aux = bam1_aux(b);
+ aux = bam_get_aux(b);
p = s - 2;
__skip_tag(s);
memmove(aux, p, s - p);
- b->data_len -= bam_get_l_aux(b) - (s - p);
+ b->l_data -= bam_get_l_aux(b) - (s - p);
} else {
- b->data_len -= bam_get_l_aux(b);
+ b->l_data -= bam_get_l_aux(b);
}
return 0;
}
/* bam_cat.c -- efficiently concatenates bam files.
- Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
Modified SAMtools work copyright (C) 2010 Illumina, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
// Copy contains and blocks within them
while ((c = cram_read_container(in_c))) {
- cram_block *blk;
-
- if (cram_container_is_empty(in_c)) {
- if (cram_write_container(out_c, c) != 0)
- return -1;
-
+ if (cram_container_is_empty(in_c)) {
+ cram_block *blk;
// Container compression header
if (!(blk = cram_read_block(in_c)))
return -1;
- if (cram_write_block(out_c, blk) != 0) {
- cram_free_block(blk);
- return -1;
- }
cram_free_block(blk);
cram_free_container(c);
-
continue;
}
cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
} else {
int32_t num_slices;
+ cram_block *blk;
// Not switching rg so do the usual read/write loop
if (cram_write_container(out_c, c) != 0)
char *outfn = 0;
char **infns = NULL; // files to concatenate
int infns_size = 0;
- int c, ret = 0, no_pg = 0;
+ int c, ret = 0, no_pg = 0, usage = 0;
samFile *in;
sam_global_args ga;
sam_global_args_init(&ga);
- while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
}
}
// Require at least one input file
- if (infns_size + nargv_fns == 0) {
+ if (infns_size + nargv_fns == 0 || usage) {
fprintf(stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
fprintf(stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
/* bam_cat.c -- efficiently concatenates bam files.
- Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
Modified SAMtools work copyright (C) 2010 Illumina, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
// Copy contains and blocks within them
while ((c = cram_read_container(in_c))) {
- cram_block *blk;
-
- if (cram_container_is_empty(in_c)) {
- if (cram_write_container(out_c, c) != 0)
- return -1;
-
+ if (cram_container_is_empty(in_c)) {
+ cram_block *blk;
// Container compression header
if (!(blk = cram_read_block(in_c)))
return -1;
- if (cram_write_block(out_c, blk) != 0) {
- cram_free_block(blk);
- return -1;
- }
cram_free_block(blk);
cram_free_container(c);
-
continue;
}
cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
} else {
int32_t num_slices;
+ cram_block *blk;
// Not switching rg so do the usual read/write loop
if (cram_write_container(out_c, c) != 0)
char *outfn = 0;
char **infns = NULL; // files to concatenate
int infns_size = 0;
- int c, ret = 0, no_pg = 0;
+ int c, ret = 0, no_pg = 0, usage = 0;
samFile *in;
sam_global_args ga;
sam_global_args_init(&ga);
- while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage=1; break;
}
}
}
// Require at least one input file
- if (infns_size + nargv_fns == 0) {
+ if (infns_size + nargv_fns == 0 || usage) {
fprintf(samtools_stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
fprintf(samtools_stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
fprintf(samtools_stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
#include <config.h>
#include <ctype.h>
-#include "bam.h"
+#include <string.h>
+
+#include "htslib/sam.h"
/*!
@abstract Get the color encoding the previous and current base
cs = bam_aux2Z(c);
// adjust for strandedness and leading adaptor
- if(bam1_strand(b)) {
+ if(bam_is_rev(b)) {
i = strlen(cs) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
i -= cigar >> BAM_CIGAR_SHIFT;
}
cq = bam_aux2Z(c);
// adjust for strandedness
- if(bam1_strand(b)) {
+ if(bam_is_rev(b)) {
i = strlen(cq) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
i -= (cigar >> BAM_CIGAR_SHIFT);
}
cs = bam_aux2Z(c);
// adjust for strandedness and leading adaptor
- if(bam1_strand(b)) { //reverse strand
+ if(bam_is_rev(b)) { //reverse strand
cs_i = strlen(cs) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
cs_i -= cigar >> BAM_CIGAR_SHIFT;
}
// get current color
cur_color = cs[cs_i];
// get previous base. Note: must rc adaptor
- prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+ prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)];
// get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+ cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
}
else {
cs_i=i+1;
// get current color
cur_color = cs[cs_i];
// get previous base
- prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+ prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)];
// get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+ cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
}
// corrected color
#include <config.h>
#include <ctype.h>
-#include "bam.h"
+#include <string.h>
+
+#include "htslib/sam.h"
/*!
@abstract Get the color encoding the previous and current base
cs = bam_aux2Z(c);
// adjust for strandedness and leading adaptor
- if(bam1_strand(b)) {
+ if(bam_is_rev(b)) {
i = strlen(cs) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
i -= cigar >> BAM_CIGAR_SHIFT;
}
cq = bam_aux2Z(c);
// adjust for strandedness
- if(bam1_strand(b)) {
+ if(bam_is_rev(b)) {
i = strlen(cq) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
i -= (cigar >> BAM_CIGAR_SHIFT);
}
cs = bam_aux2Z(c);
// adjust for strandedness and leading adaptor
- if(bam1_strand(b)) { //reverse strand
+ if(bam_is_rev(b)) { //reverse strand
cs_i = strlen(cs) - 1 - i;
// adjust for leading hard clip
- uint32_t cigar = bam1_cigar(b)[0];
+ uint32_t cigar = bam_get_cigar(b)[0];
if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
cs_i -= cigar >> BAM_CIGAR_SHIFT;
}
// get current color
cur_color = cs[cs_i];
// get previous base. Note: must rc adaptor
- prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+ prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)];
// get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+ cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
}
else {
cs_i=i+1;
// get current color
cur_color = cs[cs_i];
// get previous base
- prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+ prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)];
// get current base
- cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+ cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
}
// corrected color
/* bam_fastq.c -- FASTA and FASTQ file generation
- Copyright (C) 2009-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "samtools.h"
#include "sam_opts.h"
-#define taglist_free(p)
-KLIST_INIT(ktaglist, char*, taglist_free)
-
#define DEFAULT_BARCODE_TAG "BC"
#define DEFAULT_QUALITY_TAG "QT"
#define INDEX_SEPARATOR "+"
int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-
static void bam2fq_usage(FILE *to, const char *command)
{
int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
fprintf(to,
"\n"
"Description:\n"
-"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n"
+"Converts a SAM, BAM or CRAM to %s format.\n"
"\n"
"Options:\n"
-" -0 FILE write reads designated READ_OTHER to FILE\n"
-" -1 FILE write reads designated READ1 to FILE\n"
-" -2 FILE write reads designated READ2 to FILE\n"
-" -o FILE write reads designated READ1 or READ2 to FILE\n"
-" note: if a singleton file is specified with -s, only\n"
-" paired reads will be written to the -1 and -2 files.\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
-" -n don't append /1 and /2 to the read name\n"
-" -N always append /1 and /2 to the read name\n");
+" -0 FILE write reads designated READ_OTHER to FILE\n"
+" -1 FILE write reads designated READ1 to FILE\n"
+" -2 FILE write reads designated READ2 to FILE\n"
+" -o FILE write reads designated READ1 or READ2 to FILE\n"
+" note: if a singleton file is specified with -s, only\n"
+" paired reads will be written to the -1 and -2 files.\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n",
+ fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads designated READ1 or READ2 to FILE\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads designated READ1 or READ2 to FILE\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
fprintf(to,
-" -T TAGLIST copy arbitrary tags to the %s header line\n",
+" -T TAGLIST copy arbitrary tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n"
-" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
-" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n"
-" --i1 FILE write first index reads to FILE\n"
-" --i2 FILE write second index reads to FILE\n"
-" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
-" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
-" --index-format STR How to parse barcode and quality tags\n\n");
+" -v INT default quality score if not given in file [1]\n"
+" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+" -c INT compression level [0..9] to use when writing bgzf files [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG\n"
+" Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG\n"
+" Quality tag [" DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR\n"
+" How to parse barcode and quality tags\n\n");
sam_global_opt_help(to, "-.--.@-.");
fprintf(to,
"\n"
-"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n"
-"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n"
+"The files will be automatically compressed if the file names have a .gz\n"
+"or .bgzf extension. The input to this program must be collated by name.\n"
+"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
"\n"
"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n"
-"or both unset.\n"
+"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
"Run 'samtools flags' for more information on flag codes and meanings.\n");
fprintf(to,
"\n"
-"The index-format string describes how to parse the barcode and quality tags, for example:\n"
-" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
-" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
-"'read until the separator or end of tag', for example:\n"
-" n*i* ignore the left part of the tag until the separator, then use the second part\n"
-" of the tag as index 1\n");
+"The index-format string describes how to parse the barcode and quality tags.\n"
+"It is made up of 'i' or 'n' followed by a length or '*'. For example:\n"
+" i14i8 The first 14 characters are index 1, the next 8 are index 2\n"
+" n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n"
+"If the tag contains a separator, then the numeric part can be replaced with\n"
+"'*' to mean 'read until the separator or end of tag', for example:\n"
+" i*i* Break the tag at the separator into index 1 and index 2\n"
+" n*i* Ignore the left part of the tag until the separator,\n"
+" then use the second part of the tag as index 1\n");
fprintf(to,
"\n"
"Examples:\n"
-" To get just the paired reads in separate files, use:\n"
-" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n"
-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n"
+"To get just the paired reads in separate files, use:\n"
+" samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
+"\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
+"the output:\n"
" samtools %s in.bam > all_reads.%s\n",
command, fq ? "fq" : "fa", fq ? "fq" : "fa",
command, fq ? "fq" : "fa");
typedef struct bam2fq_state {
samFile *fp;
- BGZF *fpse;
- BGZF *fpr[3];
- BGZF *fpi[2];
- BGZF *hstdout;
+ samFile *fpse;
+ samFile *fpr[3];
+ samFile *fpi[3];
+ samFile *hstdout;
sam_hdr_t *h;
bool has12, use_oq, copy_tags, illumina_tag;
int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
- klist_t(ktaglist) *taglist;
char *index_sequence;
char compression_level;
htsThreadPool p;
} bam2fq_state_t;
-/*
- * Get and decode the read from a BAM record.
- *
- * TODO: htslib really needs an interface for this. Consider this or perhaps
- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
- * functions as string formatted equivalents to bam_get_{seq,qual}?
- */
-
-/*
- * Reverse a string in place.
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
- */
-static char *reverse(char *str)
-{
- int i = strlen(str)-1,j=0;
- char ch;
- while (i>j) {
- ch = str[i];
- str[i]= str[j];
- str[j] = ch;
- i--;
- j++;
- }
- return str;
-}
-
-/* return the read, reverse complemented if necessary */
-static char *get_read(const bam1_t *rec)
-{
- int len = rec->core.l_qseq + 1;
- char *read = calloc(1, len);
- char *seq = (char *)bam_get_seq(rec);
- int n;
-
- if (!read) return NULL;
-
- for (n=0; n < rec->core.l_qseq; n++) {
- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
- else read[n] = seq_nt16_str[bam_seqi(seq,n)];
- }
- if (rec->core.flag & BAM_FREVERSE) reverse(read);
- return read;
-}
-
-/*
- * get and decode the quality from a BAM record
- */
-static int get_quality(const bam1_t *rec, char **qual_out)
-{
- char *quality = calloc(1, rec->core.l_qseq + 1);
- char *q = (char *)bam_get_qual(rec);
- int n;
-
- if (!quality) return -1;
-
- if (*q == '\xff') {
- free(quality);
- *qual_out = NULL;
- return 0;
- }
-
- for (n=0; n < rec->core.l_qseq; n++) {
- quality[n] = q[n]+33;
- }
- if (rec->core.flag & BAM_FREVERSE) reverse(quality);
- *qual_out = quality;
- return 0;
-}
-
-//
-// End of htslib complaints
-//
-
-
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
}
}
-/*
- * parse the length part from the index-format string
- */
-static int getLength(char **s)
-{
- int n = 0;
- while (**s) {
- if (**s == '*') { n=-1; (*s)++; break; }
- if ( !isdigit(**s)) break;
- n = n*10 + ((**s)-'0');
- (*s)++;
- }
- return n;
-}
-
-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
-{
- uint8_t *s = bam_aux_get(rec, tag);
- if (s) {
- char aux_type = *s;
- switch (aux_type) {
- case 'C':
- case 'S': aux_type = 'I'; break;
- case 'c':
- case 's': aux_type = 'i'; break;
- case 'd': aux_type = 'f'; break;
- }
-
- // Ensure space. Need 6 chars + length of tag. Max length of
- // i is 16, A is 21, B currently 26, Z is unknown, so
- // have to check that one later.
- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
-
- kputc('\t', linebuf);
- kputsn(tag, 2, linebuf);
- kputc(':', linebuf);
- kputc(aux_type=='I'? 'i': aux_type, linebuf);
- kputc(':', linebuf);
- switch (aux_type) {
- case 'H':
- case 'Z':
- if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
- break;
- case 'i': kputw(bam_aux2i(s), linebuf); break;
- case 'I': kputuw(bam_aux2i(s), linebuf); break;
- case 'A': kputc(bam_aux2A(s), linebuf); break;
- case 'f': kputd(bam_aux2f(s), linebuf); break;
- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
- default: kputs("*** Unknown aux type ***", linebuf); return false;
- }
- }
- return true;
-}
-
-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
-{
- if (!index_sequence) return 0;
-
- kstring_t new = {0,0,NULL};
- if (linebuf->s) {
- char *s = strchr(linebuf->s, '\n');
- if (s) {
- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
- return -1;
- *s = 0;
- kputs(linebuf->s, &new);
- kputc(' ', &new);
- readpart readpart = which_readpart(rec);
- if (readpart == READ_1) kputc('1', &new);
- else if (readpart == READ_2) kputc('2', &new);
- else kputc('0', &new);
-
- kputc(':', &new);
- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
- else kputc('N', &new);
-
- kputs(":0:", &new);
- kputs(index_sequence, &new);
- kputc('\n', &new);
- kputs(s+1, &new);
- free(ks_release(linebuf));
- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
- }
- }
- return 0;
-}
-
-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
-{
- int i;
-
- linebuf->l = 0;
- // Write read name
- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
- if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
- // Add the /1 /2 if requested
- if (state->has12) {
- readpart readpart = which_readpart(rec);
- if (readpart == READ_1) {
- if (kputs("/1", linebuf) < 0) return false;
- } else if (readpart == READ_2) {
- if (kputs("/2", linebuf) < 0) return false;
- }
- }
- if (state->copy_tags) {
- for (i = 0; copied_tags[i]; ++i) {
- if (!copy_tag(copied_tags[i], rec, linebuf)) {
- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
- return false;
- }
- }
- }
-
- if (state->taglist->size) {
- kliter_t(ktaglist) *p;
- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
- if (!copy_tag(kl_val(p), rec, linebuf)) {
- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
- return false;
- }
- }
- }
-
- if (kputc('\n', linebuf) < 0) return false;
- if (kputs(seq, linebuf) < 0) return false;
- if (kputc('\n', linebuf) < 0) return false;
-
- if (state->filetype == FASTQ) {
- // Write quality
- if (kputs("+\n", linebuf) < 0) return false;
- if (qual && *qual) {
- if (kputs(qual, linebuf) < 0) return false;
- } else {
- int len = strlen(seq);
- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
- for (i = 0; i < len; ++i) {
- kputc(33 + state->def_qual, linebuf);
- }
- }
- if (kputc('\n', linebuf) < 0) return false;
- }
- return true;
-}
-
-/*
- * Create FASTQ lines from the barcode tag using the index-format
- */
-static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
-{
- uint8_t *p;
- char *ifmt = opts->index_format;
- char *tag = NULL;
- char *qual = NULL;
- char *sub_tag = NULL;
- char *sub_qual = NULL;
- size_t tag_len;
- int file_number = 0;
- kstring_t linebuf = { 0, 0, NULL }; // Buffer
-
- if (!ifmt) return true;
-
- // read barcode tag
- p = bam_aux_get(rec,opts->barcode_tag);
- if (p) tag = bam_aux2Z(p);
-
- if (!tag) return true; // there is no tag
-
- tag_len = strlen(tag);
- sub_tag = calloc(1, tag_len + 1);
- if (!sub_tag) goto fail;
- sub_qual = calloc(1, tag_len + 1);
- if (!sub_qual) goto fail;
-
- // read quality tag
- p = bam_aux_get(rec, opts->quality_tag);
- if (p) qual = bam_aux2Z(p);
-
- // Parse the index-format string
- while (*ifmt) {
- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
- char action = *ifmt; // should be 'i' or 'n'
- ifmt++; // skip over action
- int index_len = getLength(&ifmt);
- int n = 0;
-
- if (index_len < 0) {
- // read until separator
- while (isalpha(*tag)) {
- sub_tag[n] = *tag++;
- if (qual) sub_qual[n] = *qual++;
- n++;
- }
- if (*tag) { // skip separator
- tag++;
- if (qual) qual++;
- }
- } else {
- // read index_len characters
- while (index_len-- && *tag) {
- sub_tag[n] = *tag++;
- if (qual) sub_qual[n] = *qual++;
- n++;
- }
- }
- sub_tag[n] = '\0';
- sub_qual[n] = '\0';
-
- if (action=='i' && *sub_tag) {
- if (state->index_sequence) {
- char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2);
- if (!new_index_sequence) goto fail;
- state->index_sequence = new_index_sequence;
- strcat(state->index_sequence, INDEX_SEPARATOR);
- strcat(state->index_sequence, sub_tag);
- } else {
- state->index_sequence = strdup(sub_tag); // we're going to need this later...
- }
- if (!state->index_sequence) goto fail;
- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
- if (state->illumina_tag) {
- if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) {
- goto fail;
- }
- }
- if (state->fpi[file_number]) {
- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
- goto fail;
- }
- }
-
- }
-
- free(sub_qual); free(sub_tag);
- free(linebuf.s);
- return true;
-
- fail:
- perror(__func__);
- free(sub_qual); free(sub_tag);
- free(linebuf.s);
- return false;
-}
-
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
-{
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- const uint8_t *oq = NULL;
- char *qual = NULL;
-
- char *seq = get_read(b);
- if (!seq) return false;
-
- if (state->use_oq) oq = bam_aux_get(b, "OQ");
- if (oq && *oq=='Z') {
- qual = strdup(bam_aux2Z(oq));
- if (!qual) goto fail;
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- reverse(qual);
- }
- } else {
- if (get_quality(b, &qual) < 0) goto fail;
- }
-
- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
-
- free(qual);
- free(seq);
- return true;
-
- fail:
- free(seq);
- free(qual);
- return false;
-}
-
static void free_opts(bam2fq_opts_t *opts)
{
- free(opts->barcode_tag);
- free(opts->quality_tag);
- free(opts->index_format);
- free(opts->extra_tags);
free(opts);
}
{"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+ lopts, NULL)) > 0) {
switch (c) {
- case 'b': opts->barcode_tag = strdup(optarg); break;
- case 'q': opts->quality_tag = strdup(optarg); break;
+ case 'b': opts->barcode_tag = optarg; break;
+ case 'q': opts->quality_tag = optarg; break;
case 1 : opts->index_file[0] = optarg; break;
case 2 : opts->index_file[1] = optarg; break;
- case 3 : opts->index_format = strdup(optarg); break;
+ case 3 : opts->index_format = optarg; break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
flag_off_set = 1;
opts->flag_off = 0;
}
- opts->flag_off |= strtol(optarg, 0, 0); break;
+ opts->flag_off |= strtol(optarg, 0, 0);
+ break;
case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
case 'N': opts->has12always = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'i': opts->illumina_tag = true; break;
- case 'c': opts->compression_level = atoi(optarg); break;
- case 'T': opts->extra_tags = strdup(optarg); break;
+ case 'c':
+ opts->compression_level = atoi(optarg);
+ if (opts->compression_level < 0)
+ opts->compression_level = 0;
+ if (opts->compression_level > 9)
+ opts->compression_level = 9;
+ break;
+ case 'T': opts->extra_tags = optarg; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
+
+ case '?':
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
}
break;
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
if (opts->has12always) opts->has12 = true;
- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+ if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
+ if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
int nIndex = 0;
if (opts->index_format) {
}
const char* type_str = argv[0];
- if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+ if (strcasecmp("fastq", type_str) == 0 ||
+ strcasecmp("bam2fq", type_str) == 0) {
opts->filetype = FASTQ;
} else if (strcasecmp("fasta", type_str) == 0) {
opts->filetype = FASTA;
return true;
}
-static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp)
-{
- char mode[4] = "w";
- size_t len = strlen(filename);
-
- mode[2] = 0; mode[3] = 0;
- if (len > 3 && strstr(filename + (len - 3),".gz")) {
- mode[1] = 'g'; mode[2] = c+'0';
- } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
- mode[1] = c+'0';
- } else {
- mode[1] = 'u';
+void set_sam_opts(samFile *fp, bam2fq_state_t *state,
+ const bam2fq_opts_t *opts) {
+ if (state->has12)
+ hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
+
+ if (state->illumina_tag)
+ hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
+
+ hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
+
+ kstring_t tag_list = {0,0};
+ if (state->copy_tags)
+ kputs("RG,BC,QT", &tag_list);
+ if (opts->extra_tags) {
+ if (tag_list.l)
+ kputc(',', &tag_list);
+ kputs(opts->extra_tags, &tag_list);
}
+ if (tag_list.l)
+ hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+ ks_free(&tag_list);
+}
- BGZF *fp = bgzf_open(filename,mode);
+// Open a file as normal or gzipped based on filename.
+// Note we always use bgzf and don't bother to attempt non-blocked
+// gzip streams. This is a departure from the old fastq code.
+static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
+ char modez[6];
+ strcpy(modez, mode);
+
+ size_t l = strlen(fn);
+ if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
+ (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
+ (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
+ char m[3] = {'z', state->compression_level+'0', '\0'};
+ strcat(modez, m);
+ }
+
+ samFile *fp = sam_open(fn, modez);
if (!fp)
- return fp;
- if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) {
- bgzf_close(fp);
return NULL;
- }
+
+ if (state->p.pool)
+ hts_set_thread_pool(fp, &state->p);
+
return fp;
}
static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
{
+ char *mode = opts->filetype == FASTA ? "wF" : "wf";
+
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+ if (!state)
+ return false;
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
state->flag_alloff = opts->flag_alloff;
state->hstdout = NULL;
state->compression_level = opts->compression_level;
- state->taglist = kl_init(ktaglist);
- if (opts->extra_tags) {
- char *save_p;
- char *s = strtok_r(opts->extra_tags, ",", &save_p);
- while (s) {
- if (strlen(s) != 2) {
- fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s);
- free(state);
- return false;
- }
- char **et = kl_pushp(ktaglist, state->taglist);
- *et = s;
- s = strtok_r(NULL, ",", &save_p);
- }
- }
-
state->fp = sam_open(opts->fn_input, "r");
if (state->fp == NULL) {
print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
return false;
}
if (opts->fnse) {
- state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p);
- if (state->fpse == NULL) {
- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+ if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
+ print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
free(state);
return false;
}
+ set_sam_opts(state->fpse, state, opts);
}
if (opts->ga.reference) {
}
}
+ // single, read1, read2
int i, j;
for (i = 0; i < 3; ++i) {
if (opts->fnr[i]) {
if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
break;
if (j == i) {
- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p);
- if (state->fpr[i] == NULL) {
- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"",
+ if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
+ print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
i, opts->fnr[i]);
free(state);
return false;
}
+ set_sam_opts(state->fpr[i], state, opts);
} else {
state->fpr[i] = state->fpr[j];
}
} else {
if (!state->hstdout) {
- state->hstdout = bgzf_dopen(fileno(stdout), "wu");
- if (!state->hstdout) {
+ if (!(state->hstdout = sam_open_z("-", mode, state))) {
print_error_errno("bam2fq", "Cannot open STDOUT");
free(state);
return false;
}
+ set_sam_opts(state->hstdout, state, opts);
}
state->fpr[i] = state->hstdout;
}
}
+
+ // index 1, index 2
for (i = 0; i < 2; i++) {
state->fpi[i] = NULL;
if (opts->index_file[i]) {
if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
break;
if (i == j) {
- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p);
- if (state->fpi[i] == NULL) {
- print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"",
+ if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
+ state))) {
+ print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
i+1, opts->index_file[i]);
free(state);
return false;
}
+ set_sam_opts(state->fpi[i], state, opts);
} else if (j < 0) {
state->fpi[i] = state->fpr[j+3];
} else {
bool valid = true;
sam_hdr_destroy(state->h);
check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+ if (state->fpse && sam_close(state->fpse) < 0) {
+ print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
+ valid = false;
+ }
+
int i, j;
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != state->hstdout) {
for (j = 0; j < i; j++)
if (state->fpr[i] == state->fpr[j])
break;
- if (j == i && bgzf_close(state->fpr[i])) {
+ if (j == i && sam_close(state->fpr[i])) {
print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
valid = false;
}
}
}
if (state->hstdout) {
- if (bgzf_close(state->hstdout)) {
+ if (sam_close(state->hstdout) < 0) {
print_error_errno("bam2fq", "Error closing STDOUT");
valid = false;
}
for (j -= 3; j >= 0 && j < i; j++)
if (state->fpi[i] == state->fpi[j])
break;
- if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) {
+ if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
valid = false;
}
}
- kl_destroy(ktaglist,state->taglist);
free(state->index_sequence);
if (state->p.pool)
hts_tpool_destroy(state->p.pool);
}
+// Write one index (barcode) section as its own record to an index file.
+//
+// A temporary record is built that carries the query name, flags and aux
+// data of "b", but takes its sequence and quality from "seq" / "qual"
+// (a section of the barcode and quality tags). The temporary record is
+// flagged unmapped and forward-strand, and is written to "fp".
+//
+// "qual" may be NULL, in which case every quality value is set to
+// opts->def_qual. When non-NULL its characters are converted by
+// subtracting '!'. "qual_len" is accepted but not used by this function;
+// "seq_len" values are read from "qual".
+//
+// Returns 0 on success, and also when there is nothing to do (no file,
+// no record, or an empty section); returns -1 on failure.
+int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
+ bam2fq_opts_t* opts, char *seq, int seq_len,
+ char *qual, int qual_len) {
+ // Not an error: this index file was not requested, or no data for it.
+ if (!fp || !b || !seq_len)
+ return 0;
+
+ int ret = -1;
+ bam1_t *b2 = bam_init1(); // FIXME: reuse
+ if (!b2)
+ return -1;
+
+ // Number of bytes from the start of b's aux data to the end of b->data.
+ size_t aux_len = b->data + b->l_data - bam_get_aux(b);
+ // aux_len is passed as the final argument so the aux bytes can be
+ // copied in below; presumably bam_set1 reserves that space without
+ // counting it in l_data -- confirm against the htslib documentation.
+ if (bam_set1(b2, b->core.l_qname, bam_get_qname(b),
+ (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
+ -1, -1, 0, // refid, pos, mapq
+ 0, NULL, // cigar
+ -1, -1, 0, // rnext, pnext, tlen
+ seq_len, seq, qual,
+ aux_len) < 0)
+ goto err;
+
+ uint8_t *q = bam_get_qual(b2);
+ if (qual) {
+ // Convert the copied quality characters in place by removing the
+ // '!' offset.
+ int i;
+ for (i = 0; i < seq_len; i++)
+ q[i] -= '!';
+ } else {
+ // No quality tag available: use the default quality throughout.
+ memset(q, opts->def_qual, seq_len);
+ }
+
+ // Carry over all aux tags from the source record, then account for
+ // them in the record length.
+ memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
+ b2->l_data += aux_len;
+ if (sam_write1(fp, state->h, b2) < 0)
+ goto err;
+
+ ret = 0;
+ err:
+ // Shared exit for success and failure: always release the temporary.
+ if (b2)
+ bam_destroy1(b2);
+ return ret;
+}
+
+// Split the barcode tag (and the quality tag, when usable) into sections
+// as described by the index-format string, and write each 'i' section to
+// the next index file via write_index_rec().
+//
+// b1, b2  Records to take the tags from; either may be NULL. Each tag is
+//         looked up in b1 first and then in b2. Index N is written using
+//         record b[N] for its name, flags and aux data.
+// state   Supplies the index file handles (state->fpi[]).
+// opts    Supplies barcode_tag, quality_tag and index_format; a NULL
+//         index_format is treated as "i*i*".
+//
+// The index-format is a series of action letters ('i' = emit as the next
+// index, 'n' = skip), each followed by a length or by '*' meaning "up to
+// the separator or the end of the tag". At most two indexes are emitted.
+//
+// Returns 0 on success (including when there is no usable barcode tag)
+// and -1 on a write failure or an unknown index-format action.
+int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
+                 bam2fq_opts_t* opts) {
+    bam1_t *b[2] = {b1, b2};
+
+    char *ifmt = opts->index_format;
+    if (!ifmt)
+        ifmt = "i*i*";
+
+    // Get seq / qual elements
+    char *bc = NULL, *qt = NULL;
+    if (b1)
+        bc = (char *)bam_aux_get(b1, opts->barcode_tag);
+    if (b2 && !bc)
+        bc = (char *)bam_aux_get(b2, opts->barcode_tag);
+    // Only string-typed tags can be scanned as text; anything else is
+    // treated as "no barcode" rather than being read as a C string.
+    if (!bc || (*bc != 'Z' && *bc != 'H'))
+        return 0;
+    bc++; // skip the aux type byte
+
+    if (b1)
+        qt = (char *)bam_aux_get(b1, opts->quality_tag);
+    if (b2 && !qt)
+        qt = (char *)bam_aux_get(b2, opts->quality_tag);
+    if (qt && *qt != 'Z' && *qt != 'H')
+        qt = NULL;
+    // qt still points at its type byte here, hence the -1.  A quality
+    // string of a different length to the barcode cannot be split in
+    // step with it, so it is ignored.
+    if (qt && strlen(bc) != strlen(qt)-1)
+        qt = NULL;
+    else if (qt)
+        qt++;
+
+    int inum = 0;
+    while (inum < 2) {
+        char fc = *ifmt++;
+        if (!fc)
+            break; // ran out of index-format
+
+        // len == 0 selects '*' mode (scan to the separator); otherwise
+        // rem counts down the fixed number of characters to take.
+        long len, rem = 0;
+        if (isdigit((unsigned char)*ifmt)) {
+            rem = len = strtol(ifmt, &ifmt, 10);
+        } else {
+            // Step over the '*', but never past the terminating NUL of
+            // a format that ends straight after an action letter.
+            if (*ifmt)
+                ifmt++;
+            len = 0;
+        }
+
+        char *bc_end = bc, *qt_end = qt;
+        while (len ? *bc_end && rem-- : isalpha((unsigned char)*bc_end))
+            bc_end++, qt_end += qt != NULL;
+
+        // In '*' mode the section is followed by a separator which must
+        // be stepped over, but only if one is really there: at the end
+        // of the tag bc_end is the NUL and must not be passed.
+        int skip = (len == 0 && *bc_end != '\0');
+
+        switch (fc) {
+        case 'n':
+            // skip
+            bc = bc_end + skip;
+            if (qt)
+                qt = qt_end + skip;
+            break;
+
+        case 'i':
+            if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+                                bc, bc_end-bc, qt, qt_end-qt) < 0)
+                return -1;
+            bc = bc_end + skip;
+            if (qt)
+                qt = qt_end + skip;
+            inum++;
+            break;
+
+        default:
+            fprintf(stderr, "Unknown index-format code\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Write out the best record kept for each read part of the template that
+// has just finished, together with any index records.
+//
+// b[]     Record buffers; best[rp] is the index into b[] of the record
+//         chosen for read part rp.
+// score[] Per read part score; this function treats a positive value
+//         for parts 1 and 2, and a non-zero value for part 0, as "a
+//         record is present".
+// n_singletons  Incremented when a lone READ1 or READ2 record is written
+//         to the singleton file.
+//
+// Returns 0 on success, -1 if any write or tag update fails.
+static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
+ bam1_t *b[4], int score[3], int best[3],
+ int64_t *n_singletons) {
+ // Paired data, with 1 or 2 ends present.
+ if (score[1] > 0 && score[2] > 0) {
+ // If CASAVA tag is required and barcode is only on R1,
+ // copy it to R2
+ if (state->illumina_tag) {
+ char *tag;
+ // tag points at the aux type byte, so strlen(tag) is one more
+ // than the length of the string starting at tag+1 (assumes a
+ // NUL-terminated string value -- the type is not checked here).
+ if ((tag = (char *)bam_aux_get(b[best[1]],
+ opts->barcode_tag)))
+ if (bam_aux_update_str(b[best[2]],
+ opts->barcode_tag,
+ strlen(tag), tag+1) < 0)
+ goto err;
+ if ((tag = (char *)bam_aux_get(b[best[1]],
+ opts->quality_tag)))
+ if (bam_aux_update_str(b[best[2]],
+ opts->quality_tag,
+ strlen(tag), tag+1) < 0)
+ goto err;
+
+ }
+ // Both ends present: each goes to its own read file.
+ if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+ goto err;
+ if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+ goto err;
+
+ if (output_index(b[best[1]], b[best[2]], state, opts) < 0)
+ goto err;
+ } else if (score[1] > 0 || score[2] > 0) {
+ // Only one end of the pair is present.
+ if (state->fpse) {
+ // print whichever one exists to fpse
+ if (score[1] > 0) {
+ if (sam_write1(state->fpse, state->h, b[best[1]]) < 0)
+ goto err;
+ } else {
+ if (sam_write1(state->fpse, state->h, b[best[2]]) < 0)
+ goto err;
+ }
+ ++(*n_singletons);
+ } else {
+ // No singleton file: the lone end goes to its usual read file.
+ if (score[1] > 0) {
+ if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+ goto err;
+ } else {
+ if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+ goto err;
+ }
+ }
+
+ // Pass NULL for the missing end so only the present record is used.
+ if (output_index(score[1] > 0 ? b[best[1]] : NULL,
+ score[2] > 0 ? b[best[2]] : NULL,
+ state, opts) < 0)
+ goto err;
+ }
+
+ // Checked independently of the paired branches above, so a template
+ // may produce READ_OTHER output as well as READ1/READ2 output.
+ if (score[0]) { // single ended data (neither READ1 nor READ2)
+ if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0)
+ goto err;
+
+ if (output_index(b[best[0]], NULL, state, opts) < 0)
+ goto err;
+ }
+
+ return 0;
+
+ err:
+ return -1;
+}
+
static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
int n;
- bam1_t *records[3] = {NULL, NULL, NULL};
char *current_qname = NULL;
int64_t n_reads = 0, n_singletons = 0; // Statistics
- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
int score[3];
int at_eof;
- bool valid = true;
- bam1_t* b = NULL;
+ bool valid = false;
+ int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
+ // indexed by [readpart]
+ bam1_t *b[4]; // 3 readparts, plus current record
- while (true) {
- if (!b)
- b = bam_init1();
- if (b == NULL) {
+ for (n = 0; n < 4; n++) {
+ if (!(b[n] = bam_init1())) {
perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
- valid = false;
- break;
+ return false;
}
- int res = sam_read1(state->fp, state->h, b);
+ }
+
+ n = 0;
+ while (true) {
+ int res = sam_read1(state->fp, state->h, b[n]);
if (res < -1) {
fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
- valid = false;
- break;
+ goto err;
}
at_eof = res < 0;
- if (!at_eof && filter_it_out(b, state))
+ if (!at_eof && filter_it_out(b[n], state))
continue;
- if (!at_eof) ++n_reads;
-
- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
- if (current_qname) {
- if (state->illumina_tag) {
- for (n=0; valid && n<3; n++) {
- if (!records[n]) continue;
- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
- }
- if (!valid) break;
- }
- free(state->index_sequence); state->index_sequence = NULL;
- if (score[1] > 0 && score[2] > 0) {
- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- } else if (score[1] > 0 || score[2] > 0) {
- if (state->fpse) {
- // print whichever one exists to fpse
- if (score[1] > 0) {
- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- } else {
- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- }
- ++n_singletons;
- } else {
- if (score[1] > 0) {
- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- } else {
- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- }
- }
- }
- if (score[0]) { // TODO: check this
- // print linebuf[0] to fpr[0]
- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
- }
+ if (!at_eof) {
+ ++n_reads;
+
+ // Handle -O option: use OQ for qual
+ uint8_t *oq;
+ if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
+ int i, l = strlen((char *)++oq);
+ uint8_t *qual = bam_get_qual(b[n]);
+ for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
+ qual[i] = oq[i] - '!';
}
+ }
+ if (at_eof
+ || !current_qname
+ || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
+ // New name, so flush best examples of previous name.
+ if (current_qname)
+ if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
+ goto err;
- free(current_qname); current_qname = NULL;
+ current_qname = bam_get_qname(b[n]);
score[0] = score[1] = score[2] = 0;
- for (n=0; n < 3; n++) {
- bam_destroy1(records[n]); records[n]=NULL;
- }
if (at_eof) { break; }
-
- current_qname = strdup(bam_get_qname(b));
- if (!current_qname) { valid = false; break; }
}
// Prefer a copy of the read that has base qualities
- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
- readpart rp = which_readpart(b);
- if (b_score > score[rp]) {
- if (!tags2fq(b, state, opts)) { valid = false; break; }
- if (records[rp]) bam_destroy1(records[rp]);
- records[rp] = b;
+ int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
+ readpart rp = which_readpart(b[n]);
+ if (score[rp] < b_score) {
score[rp] = b_score;
- b = NULL;
- if(!bam1_to_fq(records[rp], &linebuf[rp], state)) {
- fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
- valid = false; break;
- }
+ // Record b[n] slot for best copy of readpair and find a new
+ // slot for next bam read
+ best[rp] = n;
+ int used_slot[4] = {0}, i;
+ for (i = 0; i < 3; i++)
+ if (best[i] >= 0)
+ used_slot[best[i]] = 1;
+ for (i = 0; i < 4 && used_slot[i]; i++)
+ ;
+ n = i;
}
}
+
+ valid = true;
+ err:
if (!valid)
- {
- perror("[bam2fq_mainloop] Error writing to FASTx files.");
- }
- bam_destroy1(b);
- for (n=0; n < 3; n++) {
- bam_destroy1(records[n]);
- }
- free(current_qname);
- free(linebuf[0].s);
- free(linebuf[1].s);
- free(linebuf[2].s);
- fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
- fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ print_error_errno("bam2fq", "Error writing to FASTx files.");
+
+ for (n = 0; n < 4; n++)
+ bam_destroy1(b[n]);
+
+ fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n",
+ __func__, n_singletons);
+ fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n",
+ __func__, n_reads);
return valid;
}
int main_bam2fq(int argc, char *argv[])
{
- int status = EXIT_SUCCESS;
+ int status = EXIT_FAILURE;
bam2fq_opts_t* opts = NULL;
bam2fq_state_t* state = NULL;
bool valid = parse_opts(argc, argv, &opts);
if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
- if (!init_state(opts, &state)) return EXIT_FAILURE;
+ if (!init_state(opts, &state)) goto err;
+
+ if (!bam2fq_mainloop(state,opts)) goto err;
- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
+ if (!destroy_state(opts, state, &status)) goto err;
- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+ status = EXIT_SUCCESS;
+ err:
sam_global_args_free(&opts->ga);
free_opts(opts);
/* bam_fastq.c -- FASTA and FASTQ file generation
- Copyright (C) 2009-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "samtools.h"
#include "sam_opts.h"
-#define taglist_free(p)
-KLIST_INIT(ktaglist, char*, taglist_free)
-
#define DEFAULT_BARCODE_TAG "BC"
#define DEFAULT_QUALITY_TAG "QT"
#define INDEX_SEPARATOR "+"
int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-
static void bam2fq_usage(FILE *to, const char *command)
{
int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
fprintf(to,
"\n"
"Description:\n"
-"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n"
+"Converts a SAM, BAM or CRAM to %s format.\n"
"\n"
"Options:\n"
-" -0 FILE write reads designated READ_OTHER to FILE\n"
-" -1 FILE write reads designated READ1 to FILE\n"
-" -2 FILE write reads designated READ2 to FILE\n"
-" -o FILE write reads designated READ1 or READ2 to FILE\n"
-" note: if a singleton file is specified with -s, only\n"
-" paired reads will be written to the -1 and -2 files.\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
-" -n don't append /1 and /2 to the read name\n"
-" -N always append /1 and /2 to the read name\n");
+" -0 FILE write reads designated READ_OTHER to FILE\n"
+" -1 FILE write reads designated READ1 to FILE\n"
+" -2 FILE write reads designated READ2 to FILE\n"
+" -o FILE write reads designated READ1 or READ2 to FILE\n"
+" note: if a singleton file is specified with -s, only\n"
+" paired reads will be written to the -1 and -2 files.\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n",
+ fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads designated READ1 or READ2 to FILE\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads designated READ1 or READ2 to FILE\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
fprintf(to,
-" -T TAGLIST copy arbitrary tags to the %s header line\n",
+" -T TAGLIST copy arbitrary tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n"
-" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
-" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n"
-" --i1 FILE write first index reads to FILE\n"
-" --i2 FILE write second index reads to FILE\n"
-" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
-" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
-" --index-format STR How to parse barcode and quality tags\n\n");
+" -v INT default quality score if not given in file [1]\n"
+" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+" -c INT compression level [0..9] to use when writing bgzf files [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG\n"
+" Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG\n"
+" Quality tag [" DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR\n"
+" How to parse barcode and quality tags\n\n");
sam_global_opt_help(to, "-.--.@-.");
fprintf(to,
"\n"
-"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n"
-"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n"
+"The files will be automatically compressed if the file names have a .gz\n"
+"or .bgzf extension. The input to this program must be collated by name.\n"
+"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
"\n"
"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n"
-"or both unset.\n"
+"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
"Run 'samtools flags' for more information on flag codes and meanings.\n");
fprintf(to,
"\n"
-"The index-format string describes how to parse the barcode and quality tags, for example:\n"
-" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
-" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
-"'read until the separator or end of tag', for example:\n"
-" n*i* ignore the left part of the tag until the separator, then use the second part\n"
-" of the tag as index 1\n");
+"The index-format string describes how to parse the barcode and quality tags.\n"
+"It is made up of 'i' or 'n' followed by a length or '*'. For example:\n"
+" i14i8 The first 14 characters are index 1, the next 8 are index 2\n"
+" n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n"
+"If the tag contains a separator, then the numeric part can be replaced with\n"
+"'*' to mean 'read until the separator or end of tag', for example:\n"
+" i*i* Break the tag at the separator into index 1 and index 2\n"
+" n*i* Ignore the left part of the tag until the separator,\n"
+" then use the second part of the tag as index 1\n");
fprintf(to,
"\n"
"Examples:\n"
-" To get just the paired reads in separate files, use:\n"
-" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n"
-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n"
+"To get just the paired reads in separate files, use:\n"
+" samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
+"\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
+"the output:\n"
" samtools %s in.bam > all_reads.%s\n",
command, fq ? "fq" : "fa", fq ? "fq" : "fa",
command, fq ? "fq" : "fa");
typedef struct bam2fq_state {
samFile *fp;
- BGZF *fpse;
- BGZF *fpr[3];
- BGZF *fpi[2];
- BGZF *hsamtools_stdout;
+ samFile *fpse;
+ samFile *fpr[3];
+ samFile *fpi[3];
+ samFile *hsamtools_stdout;
sam_hdr_t *h;
bool has12, use_oq, copy_tags, illumina_tag;
int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
- klist_t(ktaglist) *taglist;
char *index_sequence;
char compression_level;
htsThreadPool p;
} bam2fq_state_t;
-/*
- * Get and decode the read from a BAM record.
- *
- * TODO: htslib really needs an interface for this. Consider this or perhaps
- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
- * functions as string formatted equivalents to bam_get_{seq,qual}?
- */
-
-/*
- * Reverse a string in place.
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
- */
-static char *reverse(char *str)
-{
- int i = strlen(str)-1,j=0;
- char ch;
- while (i>j) {
- ch = str[i];
- str[i]= str[j];
- str[j] = ch;
- i--;
- j++;
- }
- return str;
-}
-
-/* return the read, reverse complemented if necessary */
-static char *get_read(const bam1_t *rec)
-{
- int len = rec->core.l_qseq + 1;
- char *read = calloc(1, len);
- char *seq = (char *)bam_get_seq(rec);
- int n;
-
- if (!read) return NULL;
-
- for (n=0; n < rec->core.l_qseq; n++) {
- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
- else read[n] = seq_nt16_str[bam_seqi(seq,n)];
- }
- if (rec->core.flag & BAM_FREVERSE) reverse(read);
- return read;
-}
-
-/*
- * get and decode the quality from a BAM record
- */
-static int get_quality(const bam1_t *rec, char **qual_out)
-{
- char *quality = calloc(1, rec->core.l_qseq + 1);
- char *q = (char *)bam_get_qual(rec);
- int n;
-
- if (!quality) return -1;
-
- if (*q == '\xff') {
- free(quality);
- *qual_out = NULL;
- return 0;
- }
-
- for (n=0; n < rec->core.l_qseq; n++) {
- quality[n] = q[n]+33;
- }
- if (rec->core.flag & BAM_FREVERSE) reverse(quality);
- *qual_out = quality;
- return 0;
-}
-
-//
-// End of htslib complaints
-//
-
-
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
}
}
-/*
- * parse the length part from the index-format string
- */
-static int getLength(char **s)
-{
- int n = 0;
- while (**s) {
- if (**s == '*') { n=-1; (*s)++; break; }
- if ( !isdigit(**s)) break;
- n = n*10 + ((**s)-'0');
- (*s)++;
- }
- return n;
-}
-
-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
-{
- uint8_t *s = bam_aux_get(rec, tag);
- if (s) {
- char aux_type = *s;
- switch (aux_type) {
- case 'C':
- case 'S': aux_type = 'I'; break;
- case 'c':
- case 's': aux_type = 'i'; break;
- case 'd': aux_type = 'f'; break;
- }
-
- // Ensure space. Need 6 chars + length of tag. Max length of
- // i is 16, A is 21, B currently 26, Z is unknown, so
- // have to check that one later.
- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
-
- kputc('\t', linebuf);
- kputsn(tag, 2, linebuf);
- kputc(':', linebuf);
- kputc(aux_type=='I'? 'i': aux_type, linebuf);
- kputc(':', linebuf);
- switch (aux_type) {
- case 'H':
- case 'Z':
- if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
- break;
- case 'i': kputw(bam_aux2i(s), linebuf); break;
- case 'I': kputuw(bam_aux2i(s), linebuf); break;
- case 'A': kputc(bam_aux2A(s), linebuf); break;
- case 'f': kputd(bam_aux2f(s), linebuf); break;
- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
- default: kputs("*** Unknown aux type ***", linebuf); return false;
- }
- }
- return true;
-}
-
-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
-{
- if (!index_sequence) return 0;
-
- kstring_t new = {0,0,NULL};
- if (linebuf->s) {
- char *s = strchr(linebuf->s, '\n');
- if (s) {
- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
- return -1;
- *s = 0;
- kputs(linebuf->s, &new);
- kputc(' ', &new);
- readpart readpart = which_readpart(rec);
- if (readpart == READ_1) kputc('1', &new);
- else if (readpart == READ_2) kputc('2', &new);
- else kputc('0', &new);
-
- kputc(':', &new);
- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
- else kputc('N', &new);
-
- kputs(":0:", &new);
- kputs(index_sequence, &new);
- kputc('\n', &new);
- kputs(s+1, &new);
- free(ks_release(linebuf));
- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
- }
- }
- return 0;
-}
-
-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
-{
- int i;
-
- linebuf->l = 0;
- // Write read name
- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
- if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
- // Add the /1 /2 if requested
- if (state->has12) {
- readpart readpart = which_readpart(rec);
- if (readpart == READ_1) {
- if (kputs("/1", linebuf) < 0) return false;
- } else if (readpart == READ_2) {
- if (kputs("/2", linebuf) < 0) return false;
- }
- }
- if (state->copy_tags) {
- for (i = 0; copied_tags[i]; ++i) {
- if (!copy_tag(copied_tags[i], rec, linebuf)) {
- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
- return false;
- }
- }
- }
-
- if (state->taglist->size) {
- kliter_t(ktaglist) *p;
- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
- if (!copy_tag(kl_val(p), rec, linebuf)) {
- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
- return false;
- }
- }
- }
-
- if (kputc('\n', linebuf) < 0) return false;
- if (kputs(seq, linebuf) < 0) return false;
- if (kputc('\n', linebuf) < 0) return false;
-
- if (state->filetype == FASTQ) {
- // Write quality
- if (kputs("+\n", linebuf) < 0) return false;
- if (qual && *qual) {
- if (kputs(qual, linebuf) < 0) return false;
- } else {
- int len = strlen(seq);
- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
- for (i = 0; i < len; ++i) {
- kputc(33 + state->def_qual, linebuf);
- }
- }
- if (kputc('\n', linebuf) < 0) return false;
- }
- return true;
-}
-
-/*
- * Create FASTQ lines from the barcode tag using the index-format
- */
-static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
-{
- uint8_t *p;
- char *ifmt = opts->index_format;
- char *tag = NULL;
- char *qual = NULL;
- char *sub_tag = NULL;
- char *sub_qual = NULL;
- size_t tag_len;
- int file_number = 0;
- kstring_t linebuf = { 0, 0, NULL }; // Buffer
-
- if (!ifmt) return true;
-
- // read barcode tag
- p = bam_aux_get(rec,opts->barcode_tag);
- if (p) tag = bam_aux2Z(p);
-
- if (!tag) return true; // there is no tag
-
- tag_len = strlen(tag);
- sub_tag = calloc(1, tag_len + 1);
- if (!sub_tag) goto fail;
- sub_qual = calloc(1, tag_len + 1);
- if (!sub_qual) goto fail;
-
- // read quality tag
- p = bam_aux_get(rec, opts->quality_tag);
- if (p) qual = bam_aux2Z(p);
-
- // Parse the index-format string
- while (*ifmt) {
- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
- char action = *ifmt; // should be 'i' or 'n'
- ifmt++; // skip over action
- int index_len = getLength(&ifmt);
- int n = 0;
-
- if (index_len < 0) {
- // read until separator
- while (isalpha(*tag)) {
- sub_tag[n] = *tag++;
- if (qual) sub_qual[n] = *qual++;
- n++;
- }
- if (*tag) { // skip separator
- tag++;
- if (qual) qual++;
- }
- } else {
- // read index_len characters
- while (index_len-- && *tag) {
- sub_tag[n] = *tag++;
- if (qual) sub_qual[n] = *qual++;
- n++;
- }
- }
- sub_tag[n] = '\0';
- sub_qual[n] = '\0';
-
- if (action=='i' && *sub_tag) {
- if (state->index_sequence) {
- char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2);
- if (!new_index_sequence) goto fail;
- state->index_sequence = new_index_sequence;
- strcat(state->index_sequence, INDEX_SEPARATOR);
- strcat(state->index_sequence, sub_tag);
- } else {
- state->index_sequence = strdup(sub_tag); // we're going to need this later...
- }
- if (!state->index_sequence) goto fail;
- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
- if (state->illumina_tag) {
- if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) {
- goto fail;
- }
- }
- if (state->fpi[file_number]) {
- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
- goto fail;
- }
- }
-
- }
-
- free(sub_qual); free(sub_tag);
- free(linebuf.s);
- return true;
-
- fail:
- perror(__func__);
- free(sub_qual); free(sub_tag);
- free(linebuf.s);
- return false;
-}
-
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
-{
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- const uint8_t *oq = NULL;
- char *qual = NULL;
-
- char *seq = get_read(b);
- if (!seq) return false;
-
- if (state->use_oq) oq = bam_aux_get(b, "OQ");
- if (oq && *oq=='Z') {
- qual = strdup(bam_aux2Z(oq));
- if (!qual) goto fail;
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- reverse(qual);
- }
- } else {
- if (get_quality(b, &qual) < 0) goto fail;
- }
-
- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
-
- free(qual);
- free(seq);
- return true;
-
- fail:
- free(seq);
- free(qual);
- return false;
-}
-
static void free_opts(bam2fq_opts_t *opts)
{
- free(opts->barcode_tag);
- free(opts->quality_tag);
- free(opts->index_format);
- free(opts->extra_tags);
free(opts);
}
{"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+ lopts, NULL)) > 0) {
switch (c) {
- case 'b': opts->barcode_tag = strdup(optarg); break;
- case 'q': opts->quality_tag = strdup(optarg); break;
+ case 'b': opts->barcode_tag = optarg; break;
+ case 'q': opts->quality_tag = optarg; break;
case 1 : opts->index_file[0] = optarg; break;
case 2 : opts->index_file[1] = optarg; break;
- case 3 : opts->index_format = strdup(optarg); break;
+ case 3 : opts->index_format = optarg; break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
flag_off_set = 1;
opts->flag_off = 0;
}
- opts->flag_off |= strtol(optarg, 0, 0); break;
+ opts->flag_off |= strtol(optarg, 0, 0);
+ break;
case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
case 'N': opts->has12always = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'i': opts->illumina_tag = true; break;
- case 'c': opts->compression_level = atoi(optarg); break;
- case 'T': opts->extra_tags = strdup(optarg); break;
+ case 'c':
+ opts->compression_level = atoi(optarg);
+ if (opts->compression_level < 0)
+ opts->compression_level = 0;
+ if (opts->compression_level > 9)
+ opts->compression_level = 9;
+ break;
+ case 'T': opts->extra_tags = optarg; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false;
+
+ case '?':
+ bam2fq_usage(samtools_stderr, argv[0]);
+ free_opts(opts);
+ return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false;
+ bam2fq_usage(samtools_stderr, argv[0]);
+ free_opts(opts);
+ return false;
}
break;
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
if (opts->has12always) opts->has12 = true;
- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+ if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
+ if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
int nIndex = 0;
if (opts->index_format) {
}
const char* type_str = argv[0];
- if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+ if (strcasecmp("fastq", type_str) == 0 ||
+ strcasecmp("bam2fq", type_str) == 0) {
opts->filetype = FASTQ;
} else if (strcasecmp("fasta", type_str) == 0) {
opts->filetype = FASTA;
return true;
}
-static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp)
-{
- char mode[4] = "w";
- size_t len = strlen(filename);
-
- mode[2] = 0; mode[3] = 0;
- if (len > 3 && strstr(filename + (len - 3),".gz")) {
- mode[1] = 'g'; mode[2] = c+'0';
- } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
- mode[1] = c+'0';
- } else {
- mode[1] = 'u';
+void set_sam_opts(samFile *fp, bam2fq_state_t *state,
+ const bam2fq_opts_t *opts) {
+ if (state->has12)
+ hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
+
+ if (state->illumina_tag)
+ hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
+
+ hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
+
+ kstring_t tag_list = {0,0};
+ if (state->copy_tags)
+ kputs("RG,BC,QT", &tag_list);
+ if (opts->extra_tags) {
+ if (tag_list.l)
+ kputc(',', &tag_list);
+ kputs(opts->extra_tags, &tag_list);
}
+ if (tag_list.l)
+ hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+ ks_free(&tag_list);
+}
- BGZF *fp = bgzf_open(filename,mode);
+// Open a file as normal or gzipped based on filename.
+// Note we always use bgzf and don't bother to attempt non-blocked
+// gzip streams. This is a departure from the old fastq code.
+static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
+ char modez[6];
+ strcpy(modez, mode);
+
+ size_t l = strlen(fn);
+ if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
+ (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
+ (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
+ char m[3] = {'z', state->compression_level+'0', '\0'};
+ strcat(modez, m);
+ }
+
+ samFile *fp = sam_open(fn, modez);
if (!fp)
- return fp;
- if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) {
- bgzf_close(fp);
return NULL;
- }
+
+ if (state->p.pool)
+ hts_set_thread_pool(fp, &state->p);
+
return fp;
}
static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
{
+ char *mode = opts->filetype == FASTA ? "wF" : "wf";
+
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+ if (!state)
+ return false;
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
state->flag_alloff = opts->flag_alloff;
state->hsamtools_stdout = NULL;
state->compression_level = opts->compression_level;
- state->taglist = kl_init(ktaglist);
- if (opts->extra_tags) {
- char *save_p;
- char *s = strtok_r(opts->extra_tags, ",", &save_p);
- while (s) {
- if (strlen(s) != 2) {
- fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s);
- free(state);
- return false;
- }
- char **et = kl_pushp(ktaglist, state->taglist);
- *et = s;
- s = strtok_r(NULL, ",", &save_p);
- }
- }
-
state->fp = sam_open(opts->fn_input, "r");
if (state->fp == NULL) {
print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
return false;
}
if (opts->fnse) {
- state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p);
- if (state->fpse == NULL) {
- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+ if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
+ print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
free(state);
return false;
}
+ set_sam_opts(state->fpse, state, opts);
}
if (opts->ga.reference) {
}
}
+ // single, read1, read2
int i, j;
for (i = 0; i < 3; ++i) {
if (opts->fnr[i]) {
if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
break;
if (j == i) {
- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p);
- if (state->fpr[i] == NULL) {
- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"",
+ if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
+ print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
i, opts->fnr[i]);
free(state);
return false;
}
+ set_sam_opts(state->fpr[i], state, opts);
} else {
state->fpr[i] = state->fpr[j];
}
} else {
if (!state->hsamtools_stdout) {
- state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu");
- if (!state->hsamtools_stdout) {
+ if (!(state->hsamtools_stdout = sam_open_z("-", mode, state))) {
print_error_errno("bam2fq", "Cannot open STDOUT");
free(state);
return false;
}
+ set_sam_opts(state->hsamtools_stdout, state, opts);
}
state->fpr[i] = state->hsamtools_stdout;
}
}
+
+ // index 1, index 2
for (i = 0; i < 2; i++) {
state->fpi[i] = NULL;
if (opts->index_file[i]) {
if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
break;
if (i == j) {
- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p);
- if (state->fpi[i] == NULL) {
- print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"",
+ if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
+ state))) {
+ print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
i+1, opts->index_file[i]);
free(state);
return false;
}
+ set_sam_opts(state->fpi[i], state, opts);
} else if (j < 0) {
state->fpi[i] = state->fpr[j+3];
} else {
bool valid = true;
sam_hdr_destroy(state->h);
check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+ if (state->fpse && sam_close(state->fpse) < 0) {
+ print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
+ valid = false;
+ }
+
int i, j;
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != state->hsamtools_stdout) {
for (j = 0; j < i; j++)
if (state->fpr[i] == state->fpr[j])
break;
- if (j == i && bgzf_close(state->fpr[i])) {
+ if (j == i && sam_close(state->fpr[i])) {
print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
valid = false;
}
}
}
if (state->hsamtools_stdout) {
- if (bgzf_close(state->hsamtools_stdout)) {
+ if (sam_close(state->hsamtools_stdout) < 0) {
print_error_errno("bam2fq", "Error closing STDOUT");
valid = false;
}
for (j -= 3; j >= 0 && j < i; j++)
if (state->fpi[i] == state->fpi[j])
break;
- if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) {
+ if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
valid = false;
}
}
- kl_destroy(ktaglist,state->taglist);
free(state->index_sequence);
if (state->p.pool)
hts_tpool_destroy(state->p.pool);
}
+// Write one index (barcode) read to an index output file.
+//
+// A temporary record is built that carries b's query name, b's flags (with
+// BAM_FUNMAP set and BAM_FREVERSE cleared) and a copy of b's aux tags, but
+// uses seq / qual as its sequence and qualities.  It is then written to fp
+// with sam_write1().
+//
+// fp       Output file; NULL means "no such index file", nothing is written.
+// b        Record supplying name, flags and aux tags; NULL writes nothing.
+// state    Supplies the header (state->h) used for writing.
+// opts     Supplies def_qual, used for every base when qual is NULL.
+// seq      Index bases; only the first seq_len characters are used.
+// seq_len  Number of bases; zero writes nothing.
+// qual     Qualities as ASCII with a '!' (33) offset, or NULL.
+// qual_len Unused; qualities are consumed in step with seq_len.
+//
+// Returns 0 on success (including the nothing-to-write cases), -1 on error.
+int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
+                    bam2fq_opts_t* opts, char *seq, int seq_len,
+                    char *qual, int qual_len) {
+    (void) qual_len;
+
+    if (!fp || !b || !seq_len)
+        return 0;
+
+    int ret = -1;
+    bam1_t *b2 = bam_init1();  // FIXME: reuse
+    if (!b2)
+        return -1;
+
+    // bam_set1() wants the name length without its terminator, whereas
+    // b->core.l_qname also counts the NUL and any alignment padding.
+    // Passing the latter embeds NULs in the copied name and makes
+    // bam_set1() reject names that are close to the maximum length.
+    const char *qname = bam_get_qname(b);
+    size_t qname_len = strlen(qname);
+
+    size_t aux_len = b->data + b->l_data - bam_get_aux(b);
+    if (bam_set1(b2, qname_len, qname,
+                 (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
+                 -1, -1, 0,  // refid, pos, mapq
+                 0, NULL,    // cigar
+                 -1, -1, 0,  // rnext, pnext, tlen
+                 seq_len, seq, qual,
+                 aux_len) < 0)
+        goto err;
+
+    // bam_set1() copied the qualities verbatim; convert them from ASCII to
+    // raw values, or fill in the default when there were none.
+    uint8_t *q = bam_get_qual(b2);
+    if (qual) {
+        int i;
+        for (i = 0; i < seq_len; i++)
+            q[i] -= '!';
+    } else {
+        memset(q, opts->def_qual, seq_len);
+    }
+
+    // Space for the aux data was reserved by bam_set1() but is not yet
+    // counted in l_data.
+    memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
+    b2->l_data += aux_len;
+    if (sam_write1(fp, state->h, b2) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    bam_destroy1(b2);
+    return ret;
+}
+
+// Split the barcode (and matching barcode-quality) aux tag of a template
+// into its component indices and write them to the index output files.
+//
+// b1, b2  The records of the template; either may be NULL.  The barcode and
+//         quality tags are taken from b1 when present there, else from b2.
+// state   Supplies the index output files state->fpi[0..1].
+// opts    Supplies barcode_tag, quality_tag and index_format.
+//
+// The index format (default "i*i*") is a series of code letters, each
+// followed by either a decimal length or a single other character:
+//   i  write this portion to the next index file (fpi[0], then fpi[1])
+//   n  skip this portion
+// With a non-zero length exactly that many characters are taken (fewer if
+// the barcode ends first).  Otherwise the portion is the run of alphabetic
+// characters at the current position, and the one character following it
+// (the separator) is skipped too.
+//
+// Returns 0 on success or if there is no barcode tag, -1 on failure.
+int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
+                 bam2fq_opts_t* opts) {
+    bam1_t *b[2] = {b1, b2};
+
+    char *ifmt = opts->index_format;
+    if (!ifmt)
+        ifmt = "i*i*";
+
+    // Get seq / qual elements
+    char *bc = NULL, *qt = NULL;
+    if (b1)
+        bc = (char *)bam_aux_get(b1, opts->barcode_tag);
+    if (b2 && !bc)
+        bc = (char *)bam_aux_get(b2, opts->barcode_tag);
+    if (!bc)
+        return 0;
+    // Only NUL-terminated string types can be scanned as text below.
+    if (*bc != 'Z' && *bc != 'H')
+        return 0;
+    bc++; // skip type byte
+
+    if (b1)
+        qt = (char *)bam_aux_get(b1, opts->quality_tag);
+    if (b2 && !qt)
+        qt = (char *)bam_aux_get(b2, opts->quality_tag);
+    if (qt && *qt != 'Z' && *qt != 'H')
+        qt = NULL;
+    if (qt) {
+        qt++; // skip type byte
+        // Qualities are only usable if they pair up with the bases.
+        if (strlen(bc) != strlen(qt))
+            qt = NULL;
+    }
+
+    int inum = 0;
+    while (inum < 2) {
+        char fc = *ifmt;
+        if (!fc)
+            break; // ran out of index-format
+        ifmt++;
+
+        long len = 0, rem = 0;
+        if (isdigit((unsigned char) *ifmt)) {
+            rem = len = strtol(ifmt, &ifmt, 10);
+        } else if (*ifmt) {
+            // Skip the length placeholder, but never step beyond the
+            // terminating NUL of a format that ends on a code letter.
+            ifmt++;
+        }
+
+        // Find the end of this portion of the barcode.
+        char *bc_end = bc, *qt_end = qt;
+        for (;;) {
+            if (len) {
+                if (!*bc_end || !rem--)
+                    break;
+            } else if (!isalpha((unsigned char) *bc_end)) {
+                break;
+            }
+            bc_end++;
+            if (qt)
+                qt_end++;
+        }
+
+        // In variable-length mode the separator is consumed as well, unless
+        // we are already at the end of the barcode string.
+        int skip = (len == 0 && *bc_end != '\0');
+
+        switch (fc) {
+        case 'n':
+            // skip
+            break;
+
+        case 'i':
+            if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+                                bc, bc_end-bc, qt, qt_end-qt) < 0)
+                return -1;
+            inum++;
+            break;
+
+        default:
+            fprintf(samtools_stderr, "Unknown index-format code\n");
+            return -1;
+        }
+
+        bc = bc_end + skip;
+        if (qt)
+            qt = qt_end + skip;
+    }
+
+    return 0;
+}
+
+// Emit the best records collected for one template (query name).
+//
+// state         Output files (fpr[], fpse), header and illumina_tag setting.
+// opts          Barcode / quality tag names; passed on to output_index().
+// b             Record slots; best[] selects the ones to write.
+// score         Per read-part score (index 0 = neither READ1 nor READ2,
+//               1 = READ1, 2 = READ2); a part with score 0 is absent.
+// best          Per read-part index into b[].
+// n_singletons  Incremented when a lone READ1/READ2 goes to the singleton
+//               file.
+//
+// Returns 0 on success, -1 if any write or tag update fails.
+static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
+                     bam1_t *b[4], int score[3], int best[3],
+                     int64_t *n_singletons) {
+    const int have_r1 = score[1] > 0;
+    const int have_r2 = score[2] > 0;
+
+    if (have_r1 && have_r2) {
+        // Paired data with both ends present.
+        bam1_t *r1 = b[best[1]];
+        bam1_t *r2 = b[best[2]];
+
+        if (state->illumina_tag) {
+            // The CASAVA identifier needs the barcode on both reads, so
+            // mirror READ1's barcode and quality tags onto READ2.
+            const char *names[2] = { opts->barcode_tag, opts->quality_tag };
+            int t;
+            for (t = 0; t < 2; t++) {
+                char *val = (char *)bam_aux_get(r1, names[t]);
+                if (!val)
+                    continue;
+                if (bam_aux_update_str(r2, names[t],
+                                       strlen(val), val+1) < 0)
+                    return -1;
+            }
+        }
+
+        if (sam_write1(state->fpr[1], state->h, r1) < 0)
+            return -1;
+        if (sam_write1(state->fpr[2], state->h, r2) < 0)
+            return -1;
+
+        if (output_index(r1, r2, state, opts) < 0)
+            return -1;
+    } else if (have_r1 || have_r2) {
+        // Only one end of the pair is present.
+        const int end = have_r1 ? 1 : 2;
+        bam1_t *rec = b[best[end]];
+
+        if (state->fpse) {
+            // A singleton file was requested, so the lone read goes there.
+            if (sam_write1(state->fpse, state->h, rec) < 0)
+                return -1;
+            ++(*n_singletons);
+        } else {
+            if (sam_write1(state->fpr[end], state->h, rec) < 0)
+                return -1;
+        }
+
+        if (output_index(have_r1 ? rec : NULL,
+                         have_r2 ? rec : NULL,
+                         state, opts) < 0)
+            return -1;
+    }
+
+    if (score[0]) {
+        // Single ended data (neither READ1 nor READ2).
+        bam1_t *r0 = b[best[0]];
+
+        if (sam_write1(state->fpr[0], state->h, r0) < 0)
+            return -1;
+
+        if (output_index(r0, NULL, state, opts) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
int n;
- bam1_t *records[3] = {NULL, NULL, NULL};
char *current_qname = NULL;
int64_t n_reads = 0, n_singletons = 0; // Statistics
- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
int score[3];
int at_eof;
- bool valid = true;
- bam1_t* b = NULL;
+ bool valid = false;
+ int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
+ // indexed by [readpart]
+ bam1_t *b[4]; // 3 readparts, plus current record
- while (true) {
- if (!b)
- b = bam_init1();
- if (b == NULL) {
+ for (n = 0; n < 4; n++) {
+ if (!(b[n] = bam_init1())) {
perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
- valid = false;
- break;
+ return false;
}
- int res = sam_read1(state->fp, state->h, b);
+ }
+
+ n = 0;
+ while (true) {
+ int res = sam_read1(state->fp, state->h, b[n]);
if (res < -1) {
fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
- valid = false;
- break;
+ goto err;
}
at_eof = res < 0;
- if (!at_eof && filter_it_out(b, state))
+ if (!at_eof && filter_it_out(b[n], state))
continue;
- if (!at_eof) ++n_reads;
-
- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
- if (current_qname) {
- if (state->illumina_tag) {
- for (n=0; valid && n<3; n++) {
- if (!records[n]) continue;
- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
- }
- if (!valid) break;
- }
- free(state->index_sequence); state->index_sequence = NULL;
- if (score[1] > 0 && score[2] > 0) {
- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- } else if (score[1] > 0 || score[2] > 0) {
- if (state->fpse) {
- // print whichever one exists to fpse
- if (score[1] > 0) {
- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- } else {
- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- }
- ++n_singletons;
- } else {
- if (score[1] > 0) {
- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
- } else {
- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
- }
- }
- }
- if (score[0]) { // TODO: check this
- // print linebuf[0] to fpr[0]
- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
- }
+ if (!at_eof) {
+ ++n_reads;
+
+ // Handle -O option: use OQ for qual
+ uint8_t *oq;
+ if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
+ int i, l = strlen((char *)++oq);
+ uint8_t *qual = bam_get_qual(b[n]);
+ for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
+ qual[i] = oq[i] - '!';
}
+ }
+ if (at_eof
+ || !current_qname
+ || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
+ // New name, so flush best examples of previous name.
+ if (current_qname)
+ if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
+ goto err;
- free(current_qname); current_qname = NULL;
+ current_qname = bam_get_qname(b[n]);
score[0] = score[1] = score[2] = 0;
- for (n=0; n < 3; n++) {
- bam_destroy1(records[n]); records[n]=NULL;
- }
if (at_eof) { break; }
-
- current_qname = strdup(bam_get_qname(b));
- if (!current_qname) { valid = false; break; }
}
// Prefer a copy of the read that has base qualities
- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
- readpart rp = which_readpart(b);
- if (b_score > score[rp]) {
- if (!tags2fq(b, state, opts)) { valid = false; break; }
- if (records[rp]) bam_destroy1(records[rp]);
- records[rp] = b;
+ int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
+ readpart rp = which_readpart(b[n]);
+ if (score[rp] < b_score) {
score[rp] = b_score;
- b = NULL;
- if(!bam1_to_fq(records[rp], &linebuf[rp], state)) {
- fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
- valid = false; break;
- }
+ // Record b[n] slot for best copy of readpair and find a new
+ // slot for next bam read
+ best[rp] = n;
+ int used_slot[4] = {0}, i;
+ for (i = 0; i < 3; i++)
+ if (best[i] >= 0)
+ used_slot[best[i]] = 1;
+ for (i = 0; i < 4 && used_slot[i]; i++)
+ ;
+ n = i;
}
}
+
+ valid = true;
+ err:
if (!valid)
- {
- perror("[bam2fq_mainloop] Error writing to FASTx files.");
- }
- bam_destroy1(b);
- for (n=0; n < 3; n++) {
- bam_destroy1(records[n]);
- }
- free(current_qname);
- free(linebuf[0].s);
- free(linebuf[1].s);
- free(linebuf[2].s);
- fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
- fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+ print_error_errno("bam2fq", "Error writing to FASTx files.");
+
+ for (n = 0; n < 4; n++)
+ bam_destroy1(b[n]);
+
+ fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n",
+ __func__, n_singletons);
+ fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n",
+ __func__, n_reads);
return valid;
}
int main_bam2fq(int argc, char *argv[])
{
- int status = EXIT_SUCCESS;
+ int status = EXIT_FAILURE;
bam2fq_opts_t* opts = NULL;
bam2fq_state_t* state = NULL;
bool valid = parse_opts(argc, argv, &opts);
if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
- if (!init_state(opts, &state)) return EXIT_FAILURE;
+ if (!init_state(opts, &state)) goto err;
+
+ if (!bam2fq_mainloop(state,opts)) goto err;
- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
+ if (!destroy_state(opts, state, &status)) goto err;
- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+ status = EXIT_SUCCESS;
+ err:
sam_global_args_free(&opts->ga);
free_opts(opts);
/* bam_flags.c -- flags subcommand.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <unistd.h>
#include <stdarg.h>
#include <htslib/sam.h>
+#include "samtools.h"
-static void usage(void)
+static void usage(FILE *fp)
{
- fprintf(stderr, "\n");
- fprintf(stderr, "About: Convert between textual and numeric flag representation\n");
- fprintf(stderr, "Usage: samtools flags INT|STR[,...]\n");
- fprintf(stderr, "\n");
- fprintf(stderr, "Flags:\n");
- fprintf(stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
- fprintf(stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
- fprintf(stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP);
- fprintf(stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP);
- fprintf(stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE);
- fprintf(stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
- fprintf(stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1);
- fprintf(stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2);
- fprintf(stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY);
- fprintf(stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL);
- fprintf(stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP);
- fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
- fprintf(stderr, "\n");
+ static const struct { int bit; const char *desc; } *fl, flags[] = {
+ { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" },
+ { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" },
+ { BAM_FUNMAP, "segment unmapped" },
+ { BAM_FMUNMAP, "next segment in the template unmapped" },
+ { BAM_FREVERSE, "SEQ is reverse complemented" },
+ { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" },
+ { BAM_FREAD1, "the first segment in the template" },
+ { BAM_FREAD2, "the last segment in the template" },
+ { BAM_FSECONDARY, "secondary alignment" },
+ { BAM_FQCFAIL, "not passing quality controls or other filters" },
+ { BAM_FDUP, "PCR or optical duplicate" },
+ { BAM_FSUPPLEMENTARY, "supplementary alignment" },
+ { 0, NULL }
+ };
+
+ fprintf(fp,
+"About: Convert between textual and numeric flag representation\n"
+"Usage: samtools flags FLAGS...\n"
+"\n"
+"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n"
+"a combination of the following numeric flag values, or a comma-separated string\n"
+"NAME,...,NAME representing a combination of the following flag names:\n"
+"\n");
+ for (fl = flags; fl->desc; fl++) {
+ char *name = bam_flag2str(fl->bit);
+ fprintf(fp, "%#6x %5d %-15s%s\n", fl->bit, fl->bit, name, fl->desc);
+ free(name);
+ }
}
int main_flags(int argc, char *argv[])
{
- if ( argc!=2 ) usage();
- else
+ if ( argc < 2 ) { usage(stdout); return 0; }
+
+ int i;
+ for (i = 1; i < argc; i++)
{
- int mask = bam_str2flag(argv[1]);
- if ( mask<0 ) { fprintf(stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
- printf("0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
+ int mask = bam_str2flag(argv[i]);
+ if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(stderr); return 1; }
+ char *str = bam_flag2str(mask);
+ printf("0x%x\t%d\t%s\n", mask, mask, str);
+ free(str);
}
return 0;
}
/* bam_flags.c -- flags subcommand.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <unistd.h>
#include <stdarg.h>
#include <htslib/sam.h>
+#include "samtools.h"
-static void usage(void)
+static void usage(FILE *fp)
{
- fprintf(samtools_stderr, "\n");
- fprintf(samtools_stderr, "About: Convert between textual and numeric flag representation\n");
- fprintf(samtools_stderr, "Usage: samtools flags INT|STR[,...]\n");
- fprintf(samtools_stderr, "\n");
- fprintf(samtools_stderr, "Flags:\n");
- fprintf(samtools_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
- fprintf(samtools_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
- fprintf(samtools_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP);
- fprintf(samtools_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP);
- fprintf(samtools_stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE);
- fprintf(samtools_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
- fprintf(samtools_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1);
- fprintf(samtools_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2);
- fprintf(samtools_stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY);
- fprintf(samtools_stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL);
- fprintf(samtools_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP);
- fprintf(samtools_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
- fprintf(samtools_stderr, "\n");
+ static const struct { int bit; const char *desc; } *fl, flags[] = {
+ { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" },
+ { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" },
+ { BAM_FUNMAP, "segment unmapped" },
+ { BAM_FMUNMAP, "next segment in the template unmapped" },
+ { BAM_FREVERSE, "SEQ is reverse complemented" },
+ { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" },
+ { BAM_FREAD1, "the first segment in the template" },
+ { BAM_FREAD2, "the last segment in the template" },
+ { BAM_FSECONDARY, "secondary alignment" },
+ { BAM_FQCFAIL, "not passing quality controls or other filters" },
+ { BAM_FDUP, "PCR or optical duplicate" },
+ { BAM_FSUPPLEMENTARY, "supplementary alignment" },
+ { 0, NULL }
+ };
+
+ fprintf(fp,
+"About: Convert between textual and numeric flag representation\n"
+"Usage: samtools flags FLAGS...\n"
+"\n"
+"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n"
+"a combination of the following numeric flag values, or a comma-separated string\n"
+"NAME,...,NAME representing a combination of the following flag names:\n"
+"\n");
+ for (fl = flags; fl->desc; fl++) {
+ char *name = bam_flag2str(fl->bit);
+ fprintf(fp, "%#6x %5d %-15s%s\n", fl->bit, fl->bit, name, fl->desc);
+ free(name);
+ }
}
int main_flags(int argc, char *argv[])
{
- if ( argc!=2 ) usage();
- else
+ if ( argc < 2 ) { usage(samtools_stdout); return 0; }
+
+ int i;
+ for (i = 1; i < argc; i++)
{
- int mask = bam_str2flag(argv[1]);
- if ( mask<0 ) { fprintf(samtools_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
- fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
+ int mask = bam_str2flag(argv[i]);
+ if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(samtools_stderr); return 1; }
+ char *str = bam_flag2str(mask);
+ fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, str);
+ free(str);
}
return 0;
}
--- /dev/null
+/* bam_import -- Import of FASTQ files.
+ *
+ * samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq
+ * samtools import a_1.fq a_2.fq
+ * samtools import a_interleaved.fq
+ *
+ * Copyright (C) 2020 Genome Research Ltd.
+ *
+ * Author: James Bonfield <jkb@sanger.ac.uk>
+ */
+
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notices and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+// TODO: Store other non-aux comments; in new sam tag?
+
+#include <config.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
+
+#include "samtools.h"
+#include "sam_opts.h"
+
+// Print the "samtools import" help text to fp and hand back exit_status so
+// that callers can write "return usage(fp, status);".
+static int usage(FILE *fp, int exit_status) {
+    // Text printed ahead of the shared global-option help.
+    static const char command_help[] =
+        "Usage: samtools import [options] [file.fastq ...]\n"
+        "\n"
+        "Options:\n"
+        " -s FILE Read paired-ended data from single FILE\n"
+        " -0 FILE Read single-ended data from FILE\n"
+        " -1 FILE Read-1 from FILE\n"
+        " -2 FILE Read-2 from FILE\n"
+        " --i1 FILE Index-1 from FILE\n"
+        " --i2 FILE Index-2 from FILE\n"
+        " -i Parse CASAVA identifier\n"
+        " --barcode-tag TAG\n"
+        " Tag to use with barcode sequences [BC]\n"
+        " --quality-tag TAG\n"
+        " Tag to use with barcode qualities [QT]\n"
+        " -r STRING Build up a complete @RG line\n"
+        " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"
+        " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"
+        " -o FILE Output to FILE instead of stdout\n"
+        " -u Uncompressed output\n"
+        " --order TAG Store Nth record count in TAG\n"
+        "\n";
+
+    // Text printed after the shared global-option help.
+    static const char positional_help[] =
+        "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n"
+        "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n";
+
+    fputs(command_help, fp);
+    sam_global_opt_help(fp, "-.O.-@--");
+    fputs(positional_help, fp);
+
+    return exit_status;
+}
+
+// Identifiers for the input files, in the order they are opened and read.
+// The order is significant: index files come first so that their contents
+// have been read before the main sequences, letting each sequence record be
+// emitted fully annotated.
+enum fileno {
+    FQ_I1,      // index sequences for read 1
+    FQ_I2,      // index sequences for read 2
+    FQ_R0,      // single file of unpaired data (single-ended tech).
+    FQ_R1,      // separate read1 file
+    FQ_R2,      // separate read2 file
+    FQ_SINGLE,  // single file, but with read1 and/or read2 present.
+    FQ_END      // number of file slots; not a file itself
+};
+
+// Settings for one "samtools import" run.
+typedef struct {
+    sam_global_args ga;     // shared global options (input/output formats)
+    int no_pg;              // NOTE(review): presumably suppresses @PG; confirm
+    char *fn[FQ_END];       // input file names, indexed by enum fileno
+    char *fn_out;           // output file name
+    int idx_both;           // add index to READ2 too, not just READ1
+    int casava;             // non-zero: set FASTQ_OPT_CASAVA on the inputs
+    char *barcode_seq;      // passed to the inputs as FASTQ_OPT_BARCODE
+    char *barcode_qual;     // NOTE(review): presumably the barcode quality
+                            // tag name; confirm against its users
+    char *aux;              // FASTQ_OPT_AUX tag list; "*" or "" gives NULL
+    char *rg;               // read group ID, expanded to "@RG\tID:<rg>"
+    char *rg_line;          // complete @RG line; "@RG\t" is added if missing
+    char *order;            // NOTE(review): presumably the --order tag name;
+                            // confirm against its users
+    int compress_level;     // output compression level; -1 leaves it unset
+    htsThreadPool p;        // thread pool shared by input and output files
+} opts_t;
+
+// Add the bases and qualities of record b to a pair of strings holding
+// BC:Z / QT:Z style aux values.
+//
+// Unlike normal kstring usage, s->l and q->l count the terminating NUL here,
+// so a non-zero length means a previous index is already stored.  In that
+// case the old terminator is overwritten by a separator ('-' for bases,
+// ' ' for qualities) before the new data is added.
+//
+// Returns 0 on success, -1 if either string could not be enlarged.
+static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) {
+    const int nbases = b->core.l_qseq;
+
+    // Room for a separator, the new data and the terminator.
+    if (ks_resize(s, s->l + nbases+1 +1) < 0 ||
+        ks_resize(q, q->l + nbases+1 +1) < 0)
+        return -1;
+
+    // Start writing on top of the previous terminator, if there is one.
+    size_t s_at = s->l ? s->l - 1 : 0;
+    size_t q_at = q->l ? q->l - 1 : 0;
+
+    if (s->l)
+        s->s[s_at++] = '-';
+
+    if (q->l)
+        q->s[q_at++] = ' ';
+
+    const uint8_t *bases = bam_get_seq(b);
+    const uint8_t *quals = bam_get_qual(b);
+    int k;
+    for (k = 0; k < nbases; k++) {
+        s->s[s_at++] = seq_nt16_str[bam_seqi(bases, k)];
+        q->s[q_at++] = quals[k] + '!';
+    }
+
+    // Terminate, and include the terminator in the stored lengths.
+    s->s[s_at++] = 0;
+    q->s[q_at++] = 0;
+
+    s->l = s_at;
+    q->l = q_at;
+
+    return 0;
+}
+
+// Convert FASTQ input file(s) into one unaligned SAM/BAM/CRAM output.
+//
+// argc/argv are the positional arguments left over after option parsing;
+// they supplement the file names already stored in opts->fn[].  One record
+// is read from every open input per round.  Index reads are accumulated
+// into barcode strings that are attached to the sequence reads of the same
+// round, and every sequence read is written to opts->fn_out.
+//
+// Returns 0 on success (or after printing usage when no input was given)
+// and non-zero on any failure.
+static int import_fastq(int argc, char **argv, opts_t *opts) {
+    int i, n, ret = 0;
+    samFile *fp_in[FQ_END] = {NULL};
+    bam1_t *b = bam_init1();
+    int ids[FQ_END];
+    samFile *fp_out = NULL;
+    sam_hdr_t *hdr_out = NULL;
+    kstring_t index_str = {0,0};
+    kstring_t read_str = {0,0};
+    char *rg = opts->rg;
+    kstring_t rg_line = {0,0};
+    uint64_t read_num = 0;
+    kstring_t idx_seq = {0};
+    kstring_t idx_qual = {0};
+
+    if (!b) {
+        print_error("import", "failed to allocate BAM record");
+        ret = -1;
+        goto err;
+    }
+
+    // Any additional arguments are assumed to be r1 r2, as a
+    // short cut.  We support reading index tags out of those too (eg
+    // Illumina CASAVA format), but if we do that we lack the barcode
+    // quality string.
+    //
+    // A lone positional file is stored as FQ_SINGLE: it may hold
+    // interleaved records for both ends (names ending in /1 or /2).
+    if (argc == 1) {
+        opts->fn[FQ_SINGLE] = argv[0];
+    } else {
+        // Positional files fill the slots from FQ_R1 upwards.  Stop at
+        // FQ_END so that surplus arguments cannot write past fn[].
+        for (i = 0; i < argc && FQ_R1 + i < FQ_END; i++)
+            opts->fn[FQ_R1+i] = argv[i];
+    }
+
+    // Open all files
+    for (i = n = 0; i < FQ_END; i++) {
+        if (!opts->fn[i])
+            continue;
+        fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in);
+        if (!fp_in[i]) {
+            perror(opts->fn[i]);
+            ret = -1;
+            goto err;
+        }
+        if (opts->p.pool)
+            hts_set_thread_pool(fp_in[i], &opts->p);
+        ids[n++] = i;
+
+        if (opts->casava)
+            hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
+        if (opts->barcode_seq) // for auto-CASAVA parsing
+            hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq);
+        if (opts->aux)
+            hts_set_opt(fp_in[i], FASTQ_OPT_AUX,
+                        *opts->aux == '*' || *opts->aux == '\0'
+                        ? NULL : opts->aux);
+
+        // Build the "samtools fastq" arguments that would reverse this
+        // import; they are recorded in an @CO header line below.
+        switch (i) {
+        case FQ_I1:
+            kputs("--i1 I1.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+        case FQ_I2:
+            kputs("--i2 I2.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+
+        case FQ_R0:
+            kputs("-0 unpaired.fastq ", &read_str);
+            break;
+
+        case FQ_R1:
+            kputs("-1 R1.fastq ", &read_str);
+            break;
+
+        case FQ_R2:
+            kputs("-2 R2.fastq ", &read_str);
+            break;
+
+        case FQ_SINGLE:
+            kputs("-N -o paired.fastq ", &read_str);
+            break;
+
+        default:
+            ks_clear(&read_str); // not reversible
+            kputs("", &read_str);
+        }
+    }
+    if (n == 0) {
+        bam_destroy1(b);
+        return usage(stdout, EXIT_SUCCESS);
+    }
+
+    // Output mode: 'w', an optional compression digit, then whatever
+    // sam_open_mode() derives from the output file name.
+    char out_mode[10] = {'w', 0, 0};
+    if (opts->compress_level != -1)
+        out_mode[1] = '0' + opts->compress_level;
+    sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL);
+    fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out);
+    if (!fp_out) {
+        perror(opts->fn_out);
+        ret = -1;
+        goto err;
+    }
+    if (opts->p.pool)
+        hts_set_thread_pool(fp_out, &opts->p);
+
+    // Create header
+    if (ks_len(&read_str)) {
+        char CO[2100];
+        if (ks_len(&index_str))
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s "
+                     "--index-format=\"%s\"\n",
+                     ks_str(&read_str), ks_str(&index_str));
+        else
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s\n",
+                     ks_str(&read_str));
+
+        hdr_out = sam_hdr_parse(strlen(CO), CO);
+    } else {
+        hdr_out = sam_hdr_init();
+    }
+    if (!hdr_out) {
+        print_error("import", "failed to create output header");
+        ret = -1;
+        goto err;
+    }
+
+    // Read group
+    if (opts->rg_line) {
+        if (*opts->rg_line != '@')
+            ksprintf(&rg_line, "@RG\t%s", opts->rg_line);
+        else
+            kputs(opts->rg_line, &rg_line);
+    } else if (opts->rg) {
+        ksprintf(&rg_line, "@RG\tID:%s", opts->rg);
+    }
+
+    if (ks_len(&rg_line)) {
+        if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0) {
+            ret = -1;
+            goto err;
+        }
+        rg = strstr(ks_str(&rg_line), "\tID:");
+        if (!rg) {
+            fprintf(stderr, "\"-r RG-LINE\" option contained no ID field\n");
+            ret = -1;
+            goto err;
+        }
+        rg += 4;
+
+        // Terminate the ID value in place so rg can be used as a C string.
+        i = 0;
+        while (rg[i] != '\t' && rg[i] != '\0')
+            i++;
+        rg[i] = 0;
+    }
+
+    if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0)
+        goto err;
+
+
+    // Interleave / combine from n files (ids[0..n-1]).
+    int res;
+    int eof = 0;
+    do {
+        idx_seq.l = idx_qual.l = 0;
+        for (i = 0; i < n; i++) {
+            if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) {
+                if (res == -1) {
+                    // End of this file; keep going so every input is
+                    // counted and unequal lengths can be detected below.
+                    eof++;
+                    continue;
+                } else
+                    break;
+            }
+
+            // index
+            if (ids[i] == FQ_I1 || ids[i] == FQ_I2) {
+                if (append_index(&idx_seq, &idx_qual, b) < 0) {
+                    print_error("import", "failed to store index read");
+                    ret = -1;
+                    goto err;
+                }
+                continue;
+            }
+
+            // full read
+            if (idx_seq.l) {
+                if (opts->idx_both || ids[i] == FQ_SINGLE ||
+                    ids[i] == FQ_R0 || ids[i] == FQ_R1) {
+                    if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l,
+                                       (uint8_t *)idx_seq.s) ||
+                        bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l,
+                                       (uint8_t *)idx_qual.s)) {
+                        print_error("import", "failed to add barcode tags");
+                        ret = -1;
+                        goto err;
+                    }
+                }
+            }
+
+            switch(ids[i]) {
+            case FQ_R0:
+                // unpaired; no flags to declare
+                break;
+            case FQ_SINGLE:
+                // paired (but don't know if R1 or R2) or unpaired.
+                // We rely on the /1 and /2 read suffix parsing in htslib
+                // to distinguish the two cases, or CASAVA tags if
+                // explicitly enabled.
+                break;
+            case FQ_R1:
+                if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0)
+                    b->core.flag |= BAM_FREAD1;
+                b->core.flag |= BAM_FPAIRED;
+                if (i+1 < n && ids[i+1] == FQ_R2)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            case FQ_R2:
+                b->core.flag |= BAM_FPAIRED | BAM_FREAD2;
+                if (i > 0 && ids[i-1] == FQ_R1)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            }
+
+            if (rg) {
+                if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1,
+                                   (uint8_t *)rg) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            if (opts->order) {
+                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            // Check the write immediately: res is overwritten by the next
+            // read, so a deferred check would lose this failure.
+            res = sam_write1(fp_out, hdr_out, b);
+            if (res < 0) {
+                print_error("import", "failed to write to output file");
+                ret = -1;
+                goto err;
+            }
+        }
+    } while (res >= 0);
+
+    // Only read results can leave the loop: -1 is EOF, anything lower is
+    // a read error.
+    if (res != -1) {
+        print_error("import", "truncated file. Aborting");
+        ret = res;
+        goto err;
+    }
+
+    if (eof != n) {
+        print_error("import", "input files with differing number of records");
+        ret = -1;
+        goto err;
+    }
+
+    // Close and return
+    ret = 0;
+err:
+    bam_destroy1(b);
+    sam_hdr_destroy(hdr_out);
+    ks_free(&rg_line);
+    ks_free(&index_str);
+    ks_free(&read_str);
+    if (fp_out) {
+        if (sam_close(fp_out) < 0) {
+            perror(opts->fn_out);
+            ret |= -1;
+        }
+    }
+    for (i = 0; i < FQ_END; i++) {
+        if (fp_in[i] && sam_close(fp_in[i]) < 0) {
+            perror(opts->fn[i]);
+            ret |= -1;
+        }
+    }
+    ks_free(&idx_seq);
+    ks_free(&idx_qual);
+
+    return ret;
+}
+
+// Entry point for "samtools import": parse the command line into an opts_t,
+// create a thread pool when threads were requested, then run import_fastq()
+// on the remaining positional arguments.
+//
+// Returns 0 on success, 1 if the import failed, the usage() status after
+// -h or a bad option, or -1 if the thread pool could not be created.
+int main_import(int argc, char *argv[]) {
+    int c;
+    opts_t opts = {
+        .no_pg = 0,
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .fn = {NULL},
+        .fn_out = "-",
+        .casava = 0,
+        .barcode_seq = "BC",
+        .barcode_qual = "QT",
+        .aux = NULL,
+        .rg = NULL,
+        .rg_line = NULL,
+        .order = NULL,
+        .compress_level = -1,
+    };
+    // Accumulates the @RG line built from one or more -r options; it owns
+    // the memory opts.rg_line points at and must be freed on every exit.
+    kstring_t rg = {0};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'),
+        {"no-PG", no_argument, NULL, 9},
+        {"i1", required_argument, NULL, 1},
+        {"i2", required_argument, NULL, 2},
+        {"r1", required_argument, NULL, '1'},
+        {"r2", required_argument, NULL, '2'},
+        {"rg", required_argument, NULL, 'R'},
+        {"rg-line", required_argument, NULL, 'r'},
+        {"order", required_argument, NULL, 3},
+        {"barcode-tag", required_argument, NULL, 4},
+        {"quality-tag", required_argument, NULL, 5},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'b': opts.idx_both = 1; break;
+        case '0': opts.fn[FQ_R0] = optarg; break;
+        case '1': opts.fn[FQ_R1] = optarg; break;
+        case '2': opts.fn[FQ_R2] = optarg; break;
+        case 1:   opts.fn[FQ_I1] = optarg; break;
+        case 2:   opts.fn[FQ_I2] = optarg; break;
+        case 's': opts.fn[FQ_SINGLE] = optarg; break;
+        case 'o': opts.fn_out = optarg; break;
+        case 'i': opts.casava = 1; break;
+        case 4:   opts.barcode_seq = optarg; break;
+        case 5:   opts.barcode_qual = optarg; break;
+        case 'T': opts.aux = optarg; break;
+        case 'u': opts.compress_level = 0; break;
+        case 'R': opts.rg = optarg; break;
+        case 'r':
+            // The first fragment gets an "@RG" prefix unless it already
+            // starts with '@'; later fragments are joined with tabs.
+            if (*optarg != '@' && ks_len(&rg) == 0)
+                kputs("@RG", &rg);
+            if (ks_len(&rg))
+                kputc_('\t', &rg);
+            kputs(optarg, &rg);
+            opts.rg_line = rg.s;
+            break;
+
+        case 9: opts.no_pg = 1; break;
+        case 3: opts.order = optarg; break;
+
+        case 'h':
+            ks_free(&rg);
+            return usage(stdout, EXIT_SUCCESS);
+        case '?':
+            ks_free(&rg);
+            return usage(stderr, EXIT_FAILURE);
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) {
+                ks_free(&rg);
+                return usage(stderr, EXIT_FAILURE);
+            }
+            break;
+        }
+    }
+
+    if (opts.ga.nthreads > 0) {
+        if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) {
+            fprintf(stderr, "Failed to create thread pool\n");
+            ks_free(&rg);
+            return -1;
+        }
+    }
+
+    int ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0;
+
+    ks_free(&rg);
+
+    if (opts.p.pool)
+        hts_tpool_destroy(opts.p.pool);
+
+    return ret;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* bam_import -- Import of FASTQ files.
+ *
+ * samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq
+ * samtools import a_1.fq a_2.fq
+ * samtools import a_interleaved.fq
+ *
+ * Copyright (C) 2020 Genome Research Ltd.
+ *
+ * Author: James Bonfield <jkb@sanger.ac.uk>
+ */
+
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notices and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+// TODO: Store other non-aux comments; in new sam tag?
+
+#include <config.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
+
+#include "samtools.h"
+#include "sam_opts.h"
+
+// Write the "samtools import" help text to fp and hand back exit_status
+// unchanged, so callers can simply "return usage(...)".
+static int usage(FILE *fp, int exit_status) {
+    static const char option_help[] =
+        "Usage: samtools import [options] [file.fastq ...]\n"
+        "\n"
+        "Options:\n"
+        " -s FILE Read paired-ended data from single FILE\n"
+        " -0 FILE Read single-ended data from FILE\n"
+        " -1 FILE Read-1 from FILE\n"
+        " -2 FILE Read-2 from FILE\n"
+        " --i1 FILE Index-1 from FILE\n"
+        " --i2 FILE Index-2 from FILE\n"
+        " -i Parse CASAVA identifier\n"
+        " --barcode-tag TAG\n"
+        " Tag to use with barcode sequences [BC]\n"
+        " --quality-tag TAG\n"
+        " Tag to use with barcode qualities [QT]\n"
+        " -r STRING Build up a complete @RG line\n"
+        " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"
+        " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"
+        " -o FILE Output to FILE instead of samtools_stdout\n"
+        " -u Uncompressed output\n"
+        " --order TAG Store Nth record count in TAG\n"
+        "\n";
+    static const char positional_help[] =
+        "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n"
+        "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n";
+
+    fputs(option_help, fp);
+    // Shared samtools options are listed between the two text sections.
+    sam_global_opt_help(fp, "-.O.-@--");
+    fputs(positional_help, fp);
+
+    return exit_status;
+}
+
+// Slots for the input files; values index opts_t.fn[] and the open handles.
+// Order matters here as we want to read index elements before main
+// sequences so on reading the seqs we can emit a fully annotated record.
+enum fileno {
+ FQ_I1, FQ_I2, // index read files (--i1 / --i2)
+ FQ_R0, // single file and unpaired data (single-ended tech), -0.
+ FQ_R1, FQ_R2, // separate read1 and read2 files (-1 / -2)
+ FQ_SINGLE, // single file, but with read1 and/or read2 present (-s).
+ FQ_END // number of slots; used as the array size
+};
+
+typedef struct { // command-line settings for "samtools import"
+ sam_global_args ga; // shared samtools options (input/output format, thread count)
+ int no_pg; // set by --no-PG
+ char *fn[FQ_END], *fn_out; // input names indexed by enum fileno; output name ("-" by default)
+ int idx_both; // add index to READ2 too, not just READ1
+ int casava; // -i: enable FASTQ_OPT_CASAVA on every input
+ char *barcode_seq; // aux tag receiving barcode sequences (default "BC")
+ char *barcode_qual; // aux tag receiving barcode qualities (default "QT")
+ char *aux; // -T tag list passed as FASTQ_OPT_AUX; "*" or "" is passed as NULL
+ char *rg; // -R: read-group ID
+ char *rg_line; // -r: accumulated @RG header line
+ char *order; // --order: aux tag that stores the running record number
+ int compress_level; // -1 leaves the output mode alone; -u sets 0
+ htsThreadPool p; // thread pool attached to every file when pool is non-NULL
+} opts_t;
+
+// Add the bases and qualities of index read b to the accumulated barcode
+// strings s (sequence) and q (quality), as used for BC:Z / QT:Z style tags.
+// Both kstrings count their terminating NUL in ->l.  When they already hold
+// an earlier index, that NUL is replaced by a separator ('-' for sequence,
+// ' ' for quality) before the new data is written.
+// Returns 0 on success, -1 if either buffer could not be grown.
+static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) {
+    const int len = b->core.l_qseq;
+    int k;
+
+    // Room for: optional separator + len characters + terminating NUL.
+    if (ks_resize(s, s->l + len + 2) < 0 ||
+        ks_resize(q, q->l + len + 2) < 0)
+        return -1;
+
+    // New data starts where the old string (including its NUL) ended.
+    const size_t s_start = s->l;
+    const size_t q_start = q->l;
+
+    if (s_start)
+        s->s[s_start - 1] = '-';
+    if (q_start)
+        q->s[q_start - 1] = ' ';
+
+    const uint8_t *seq = bam_get_seq(b);
+    const uint8_t *qual = bam_get_qual(b);
+    for (k = 0; k < len; k++) {
+        s->s[s_start + k] = seq_nt16_str[bam_seqi(seq, k)];
+        q->s[q_start + k] = qual[k] + '!';
+    }
+    s->s[s_start + len] = 0;
+    q->s[q_start + len] = 0;
+
+    // Lengths deliberately include the NUL.
+    s->l = s_start + len + 1;
+    q->l = q_start + len + 1;
+
+    return 0;
+}
+
+// Convert FASTQ input file(s) into one unaligned SAM/BAM/CRAM output.
+//
+// argc/argv are the positional arguments left over after option parsing;
+// they supplement the file names already stored in opts->fn[].  One record
+// is read from every open input per round.  Index reads are accumulated
+// into barcode strings that are attached to the sequence reads of the same
+// round, and every sequence read is written to opts->fn_out.
+//
+// Returns 0 on success (or after printing usage when no input was given)
+// and non-zero on any failure.
+static int import_fastq(int argc, char **argv, opts_t *opts) {
+    int i, n, ret = 0;
+    samFile *fp_in[FQ_END] = {NULL};
+    bam1_t *b = bam_init1();
+    int ids[FQ_END];
+    samFile *fp_out = NULL;
+    sam_hdr_t *hdr_out = NULL;
+    kstring_t index_str = {0,0};
+    kstring_t read_str = {0,0};
+    char *rg = opts->rg;
+    kstring_t rg_line = {0,0};
+    uint64_t read_num = 0;
+    kstring_t idx_seq = {0};
+    kstring_t idx_qual = {0};
+
+    if (!b) {
+        print_error("import", "failed to allocate BAM record");
+        ret = -1;
+        goto err;
+    }
+
+    // Any additional arguments are assumed to be r1 r2, as a
+    // short cut.  We support reading index tags out of those too (eg
+    // Illumina CASAVA format), but if we do that we lack the barcode
+    // quality string.
+    //
+    // A lone positional file is stored as FQ_SINGLE: it may hold
+    // interleaved records for both ends (names ending in /1 or /2).
+    if (argc == 1) {
+        opts->fn[FQ_SINGLE] = argv[0];
+    } else {
+        // Positional files fill the slots from FQ_R1 upwards.  Stop at
+        // FQ_END so that surplus arguments cannot write past fn[].
+        for (i = 0; i < argc && FQ_R1 + i < FQ_END; i++)
+            opts->fn[FQ_R1+i] = argv[i];
+    }
+
+    // Open all files
+    for (i = n = 0; i < FQ_END; i++) {
+        if (!opts->fn[i])
+            continue;
+        fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in);
+        if (!fp_in[i]) {
+            perror(opts->fn[i]);
+            ret = -1;
+            goto err;
+        }
+        if (opts->p.pool)
+            hts_set_thread_pool(fp_in[i], &opts->p);
+        ids[n++] = i;
+
+        if (opts->casava)
+            hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
+        if (opts->barcode_seq) // for auto-CASAVA parsing
+            hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq);
+        if (opts->aux)
+            hts_set_opt(fp_in[i], FASTQ_OPT_AUX,
+                        *opts->aux == '*' || *opts->aux == '\0'
+                        ? NULL : opts->aux);
+
+        // Build the "samtools fastq" arguments that would reverse this
+        // import; they are recorded in an @CO header line below.
+        switch (i) {
+        case FQ_I1:
+            kputs("--i1 I1.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+        case FQ_I2:
+            kputs("--i2 I2.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+
+        case FQ_R0:
+            kputs("-0 unpaired.fastq ", &read_str);
+            break;
+
+        case FQ_R1:
+            kputs("-1 R1.fastq ", &read_str);
+            break;
+
+        case FQ_R2:
+            kputs("-2 R2.fastq ", &read_str);
+            break;
+
+        case FQ_SINGLE:
+            kputs("-N -o paired.fastq ", &read_str);
+            break;
+
+        default:
+            ks_clear(&read_str); // not reversible
+            kputs("", &read_str);
+        }
+    }
+    if (n == 0) {
+        bam_destroy1(b);
+        return usage(samtools_stdout, EXIT_SUCCESS);
+    }
+
+    // Output mode: 'w', an optional compression digit, then whatever
+    // sam_open_mode() derives from the output file name.
+    char out_mode[10] = {'w', 0, 0};
+    if (opts->compress_level != -1)
+        out_mode[1] = '0' + opts->compress_level;
+    sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL);
+    fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out);
+    if (!fp_out) {
+        perror(opts->fn_out);
+        ret = -1;
+        goto err;
+    }
+    if (opts->p.pool)
+        hts_set_thread_pool(fp_out, &opts->p);
+
+    // Create header
+    if (ks_len(&read_str)) {
+        char CO[2100];
+        if (ks_len(&index_str))
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s "
+                     "--index-format=\"%s\"\n",
+                     ks_str(&read_str), ks_str(&index_str));
+        else
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s\n",
+                     ks_str(&read_str));
+
+        hdr_out = sam_hdr_parse(strlen(CO), CO);
+    } else {
+        hdr_out = sam_hdr_init();
+    }
+    if (!hdr_out) {
+        print_error("import", "failed to create output header");
+        ret = -1;
+        goto err;
+    }
+
+    // Read group
+    if (opts->rg_line) {
+        if (*opts->rg_line != '@')
+            ksprintf(&rg_line, "@RG\t%s", opts->rg_line);
+        else
+            kputs(opts->rg_line, &rg_line);
+    } else if (opts->rg) {
+        ksprintf(&rg_line, "@RG\tID:%s", opts->rg);
+    }
+
+    if (ks_len(&rg_line)) {
+        if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0) {
+            ret = -1;
+            goto err;
+        }
+        rg = strstr(ks_str(&rg_line), "\tID:");
+        if (!rg) {
+            fprintf(samtools_stderr, "\"-r RG-LINE\" option contained no ID field\n");
+            ret = -1;
+            goto err;
+        }
+        rg += 4;
+
+        // Terminate the ID value in place so rg can be used as a C string.
+        i = 0;
+        while (rg[i] != '\t' && rg[i] != '\0')
+            i++;
+        rg[i] = 0;
+    }
+
+    if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0)
+        goto err;
+
+
+    // Interleave / combine from n files (ids[0..n-1]).
+    int res;
+    int eof = 0;
+    do {
+        idx_seq.l = idx_qual.l = 0;
+        for (i = 0; i < n; i++) {
+            if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) {
+                if (res == -1) {
+                    // End of this file; keep going so every input is
+                    // counted and unequal lengths can be detected below.
+                    eof++;
+                    continue;
+                } else
+                    break;
+            }
+
+            // index
+            if (ids[i] == FQ_I1 || ids[i] == FQ_I2) {
+                if (append_index(&idx_seq, &idx_qual, b) < 0) {
+                    print_error("import", "failed to store index read");
+                    ret = -1;
+                    goto err;
+                }
+                continue;
+            }
+
+            // full read
+            if (idx_seq.l) {
+                if (opts->idx_both || ids[i] == FQ_SINGLE ||
+                    ids[i] == FQ_R0 || ids[i] == FQ_R1) {
+                    if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l,
+                                       (uint8_t *)idx_seq.s) ||
+                        bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l,
+                                       (uint8_t *)idx_qual.s)) {
+                        print_error("import", "failed to add barcode tags");
+                        ret = -1;
+                        goto err;
+                    }
+                }
+            }
+
+            switch(ids[i]) {
+            case FQ_R0:
+                // unpaired; no flags to declare
+                break;
+            case FQ_SINGLE:
+                // paired (but don't know if R1 or R2) or unpaired.
+                // We rely on the /1 and /2 read suffix parsing in htslib
+                // to distinguish the two cases, or CASAVA tags if
+                // explicitly enabled.
+                break;
+            case FQ_R1:
+                if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0)
+                    b->core.flag |= BAM_FREAD1;
+                b->core.flag |= BAM_FPAIRED;
+                if (i+1 < n && ids[i+1] == FQ_R2)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            case FQ_R2:
+                b->core.flag |= BAM_FPAIRED | BAM_FREAD2;
+                if (i > 0 && ids[i-1] == FQ_R1)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            }
+
+            if (rg) {
+                if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1,
+                                   (uint8_t *)rg) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            if (opts->order) {
+                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            // Check the write immediately: res is overwritten by the next
+            // read, so a deferred check would lose this failure.
+            res = sam_write1(fp_out, hdr_out, b);
+            if (res < 0) {
+                print_error("import", "failed to write to output file");
+                ret = -1;
+                goto err;
+            }
+        }
+    } while (res >= 0);
+
+    // Only read results can leave the loop: -1 is EOF, anything lower is
+    // a read error.
+    if (res != -1) {
+        print_error("import", "truncated file. Aborting");
+        ret = res;
+        goto err;
+    }
+
+    if (eof != n) {
+        print_error("import", "input files with differing number of records");
+        ret = -1;
+        goto err;
+    }
+
+    // Close and return
+    ret = 0;
+err:
+    bam_destroy1(b);
+    sam_hdr_destroy(hdr_out);
+    ks_free(&rg_line);
+    ks_free(&index_str);
+    ks_free(&read_str);
+    if (fp_out) {
+        if (sam_close(fp_out) < 0) {
+            perror(opts->fn_out);
+            ret |= -1;
+        }
+    }
+    for (i = 0; i < FQ_END; i++) {
+        if (fp_in[i] && sam_close(fp_in[i]) < 0) {
+            perror(opts->fn[i]);
+            ret |= -1;
+        }
+    }
+    ks_free(&idx_seq);
+    ks_free(&idx_qual);
+
+    return ret;
+}
+
+// Entry point for "samtools import": parse the command line into an opts_t,
+// create a thread pool when threads were requested, then run import_fastq()
+// on the remaining positional arguments.
+//
+// Returns 0 on success, 1 if the import failed, the usage() status after
+// -h or a bad option, or -1 if the thread pool could not be created.
+int main_import(int argc, char *argv[]) {
+    int c;
+    opts_t opts = {
+        .no_pg = 0,
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .fn = {NULL},
+        .fn_out = "-",
+        .casava = 0,
+        .barcode_seq = "BC",
+        .barcode_qual = "QT",
+        .aux = NULL,
+        .rg = NULL,
+        .rg_line = NULL,
+        .order = NULL,
+        .compress_level = -1,
+    };
+    // Accumulates the @RG line built from one or more -r options; it owns
+    // the memory opts.rg_line points at and must be freed on every exit.
+    kstring_t rg = {0};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'),
+        {"no-PG", no_argument, NULL, 9},
+        {"i1", required_argument, NULL, 1},
+        {"i2", required_argument, NULL, 2},
+        {"r1", required_argument, NULL, '1'},
+        {"r2", required_argument, NULL, '2'},
+        {"rg", required_argument, NULL, 'R'},
+        {"rg-line", required_argument, NULL, 'r'},
+        {"order", required_argument, NULL, 3},
+        {"barcode-tag", required_argument, NULL, 4},
+        {"quality-tag", required_argument, NULL, 5},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'b': opts.idx_both = 1; break;
+        case '0': opts.fn[FQ_R0] = optarg; break;
+        case '1': opts.fn[FQ_R1] = optarg; break;
+        case '2': opts.fn[FQ_R2] = optarg; break;
+        case 1:   opts.fn[FQ_I1] = optarg; break;
+        case 2:   opts.fn[FQ_I2] = optarg; break;
+        case 's': opts.fn[FQ_SINGLE] = optarg; break;
+        case 'o': opts.fn_out = optarg; break;
+        case 'i': opts.casava = 1; break;
+        case 4:   opts.barcode_seq = optarg; break;
+        case 5:   opts.barcode_qual = optarg; break;
+        case 'T': opts.aux = optarg; break;
+        case 'u': opts.compress_level = 0; break;
+        case 'R': opts.rg = optarg; break;
+        case 'r':
+            // The first fragment gets an "@RG" prefix unless it already
+            // starts with '@'; later fragments are joined with tabs.
+            if (*optarg != '@' && ks_len(&rg) == 0)
+                kputs("@RG", &rg);
+            if (ks_len(&rg))
+                kputc_('\t', &rg);
+            kputs(optarg, &rg);
+            opts.rg_line = rg.s;
+            break;
+
+        case 9: opts.no_pg = 1; break;
+        case 3: opts.order = optarg; break;
+
+        case 'h':
+            ks_free(&rg);
+            return usage(samtools_stdout, EXIT_SUCCESS);
+        case '?':
+            ks_free(&rg);
+            return usage(samtools_stderr, EXIT_FAILURE);
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) {
+                ks_free(&rg);
+                return usage(samtools_stderr, EXIT_FAILURE);
+            }
+            break;
+        }
+    }
+
+    if (opts.ga.nthreads > 0) {
+        if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) {
+            fprintf(samtools_stderr, "Failed to create thread pool\n");
+            ks_free(&rg);
+            return -1;
+        }
+    }
+
+    int ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0;
+
+    ks_free(&rg);
+
+    if (opts.p.pool)
+        hts_tpool_destroy(opts.p.pool);
+
+    return ret;
+}
{
fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n");
sam_global_opt_help(fp, "-.---@-.");
- exit(exit_status);
+ samtools_exit(exit_status);
}
int bam_idxstats(int argc, char *argv[])
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2019 Genome Research Ltd.
+ Copyright (C) 2017-2020 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
int mode;
int write_index;
int include_fails;
+ int check_chain;
char *stats_file;
char *arg_list;
char *out_fn;
bam1_t *b;
struct read_queue_s *duplicate;
hts_pos_t pos;
+ int dup_checked;
} read_queue_t;
typedef struct {
char type;
} dup_map_t;
+typedef struct { // per-duplicate values cached for the optical duplicate checks
+ bam1_t *b; // the duplicate read
+ int64_t score; // calc_score() of b (0 until computed)
+ int64_t mate_score; // get_mate_score() of b; only filled in for paired reads
+ long x; // x coordinate parsed from the read name (-1 until parsed)
+ long y; // y coordinate parsed from the read name (-1 until parsed)
+ int opt; // non-zero once the read is known to be an optical duplicate
+ int xpos; // offset of the x coordinate within the read name
+} check_t;
+typedef struct { // growable array of check_t entries
+ check_t *c; // the entries
+ size_t size; // allocated capacity of c, in entries
+ size_t length; // number of entries currently in use
+} check_list_t;
+
static khint32_t do_hash(unsigned char *key, khint32_t len);
static khint_t hash_key(key_data_t key) {
}
+/* Get the position of the coordinates from the read name. */
static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
int sep = 0;
int pos = 0;
return sep;
}
+
+/* Parse the x and y coordinates out of an Illumina style read name.
+
+   On success returns 0 and stores the coordinates in *x_coord / *y_coord and
+   the offset of the x field within name in *xpos_out.  On failure returns 1,
+   leaves the outputs untouched, bumps *warnings and prints a warning for the
+   first BMD_WARNING_MAX failures. */
+static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
+    int x_off = 0, y_off = 0;
+    char *stop;
+    const int colons = get_coordinate_positions(name, &x_off, &y_off);
+
+    /* The most current Illumina read format at time of writing is:
+       @machine:run:flowcell:lane:tile:x:y:UMI or
+       @machine:run:flowcell:lane:tile:x:y
+
+       Counting the separating colons gives us a quick format check.
+       Older name formats have fewer elements.
+    */
+    switch (colons) {
+    case 3:
+    case 4:
+    case 6:
+    case 7:
+        break;
+
+    default:
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
+        }
+        return 1;
+    }
+
+    /* strtol leaves "stop" at its start position when no digits were read. */
+    const long x_val = strtol(name + x_off, &stop, 10);
+
+    if (stop == name + x_off) {
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+        }
+        return 1;
+    }
+
+    const long y_val = strtol(name + y_off, &stop, 10);
+
+    if (stop == name + y_off) {
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
+        }
+        return 1;
+    }
+
+    *x_coord = x_val;
+    *y_coord = y_val;
+    *xpos_out = x_off;
+
+    return 0;
+}
+
+
/* Using the coordinates from the Illumina read name, see whether the duplicated read is
close enough (set by max_dist) to the original to be counted as optical.*/
}
+/* Decide whether dup is an optical duplicate of an original read whose name
+   has already been parsed: name is the original's read name, oxpos the
+   offset of its x field, and ox / oy its coordinates.
+
+   The reads are optical duplicates when the name prefixes before the x
+   field match and both coordinate differences are within max_dist.
+
+   The duplicate's own coordinates and x offset are stored in c, and c->opt
+   is set when the result is positive.  Returns 1 for an optical duplicate,
+   0 otherwise (including when dup's name cannot be parsed, in which case c
+   is left untouched). */
+
+static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
+    char *dup_name = bam_get_qname(dup);
+    int dup_xpos = 0;
+    long dup_x, dup_y;
+
+    if (get_coordinates(dup_name, &dup_xpos, &dup_x, &dup_y, warnings) != 0)
+        return 0;
+
+    // Evaluated left to right: the distances are only computed once the
+    // leading part of the names is known to match.
+    const int is_optical =
+        strncmp(name, dup_name, oxpos - 1) == 0 &&
+        (ox > dup_x ? ox - dup_x : dup_x - ox) <= max_dist &&
+        (oy > dup_y ? oy - dup_y : dup_y - oy) <= max_dist;
+
+    c->x = dup_x;
+    c->y = dup_y;
+    c->xpos = dup_xpos;
+
+    // Never clear an existing optical mark, only set it.
+    if (is_optical)
+        c->opt = 1;
+
+    return is_optical;
+}
+
+
+/* Mark the read as a duplicate and update the duplicate hash (if needed) */
static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
long *optical, long *warn) {
char dup_type = 0;
dup->core.flag |= BAM_FDUP;
if (param->tag) {
- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) {
+ if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
return -1;
}
if (param->opt_dist) { // mark optical duplicates
if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
- bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ");
+ bam_aux_update_str(dup, "dt", 3, "SQ");
dup_type = 'O';
(*optical)++;
} else {
// not an optical duplicate
- bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB");
+ bam_aux_update_str(dup, "dt", 3, "LB");
}
}
}
+/* If the duplicate type has changed to optical then retag and duplicate hash. */
static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
int ret = 0;
- uint8_t *data;
- // remove any existing dt tag
- if ((data = bam_aux_get(b, "dt")) != NULL) {
- bam_aux_del(b, data);
- }
-
- if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) {
- fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n");
+ if (bam_aux_update_str(b, "dt", 3, "SQ")) {
+ fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n");
ret = -1;
}
}
+/* Check all duplicates of the highest quality read (the "original") for consistency. Also
+ pre-calculate any values for use in check_duplicate_chain later.
+ Returns 0 on success, >0 on coordinate reading error (program can continue) or
+ <0 on an error (program should not continue). */
+static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
+ check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
-/*
- Where there is more than one duplicate go down the list and check for optical duplicates and change
- do tags (where used) to point to original (non-duplicate) read.
-*/
-static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
- long *warn, long *optical_single, long *optical_pair) {
int ret = 0;
- read_queue_t *current = ori->duplicate;
char *ori_name = bam_get_qname(ori->b);
- int have_original = !(ori->b->core.flag & BAM_FDUP);
- int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP);
+ read_queue_t *current = ori->duplicate;
+ int xpos;
+ long x, y;
+
+ if (param->opt_dist) {
+ if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
+ return ret;
+ }
+ }
+
+ list->length = 0;
while (current) {
- int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+ check_t *c;
+
+ if (list->length >= list->size) {
+ check_t *tmp;
+
+ list->size *= 2;
+
+ if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
+ fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n");
+ return -1;
+ }
+
+ list->c = tmp;
+ }
+
+ c = &list->c[list->length];
- if (param->tag && have_original) {
+ c->b = current->b;
+ c->x = -1;
+ c->y = -1;
+ c->opt = 0;
+ c->score = 0;
+ c->mate_score = 0;
+ current->dup_checked = 1;
+
+ if (param->tag) {
uint8_t *data;
// at this stage all duplicates should have a do tag
if (old_name) {
if (strcmp(old_name, ori_name) != 0) {
- bam_aux_del(current->b, data);
-
- if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) {
- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
+ if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
+ fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n");
ret = -1;
break;
}
}
if (param->opt_dist) {
- int is_cur_opt = 0, is_ori_opt = 0;
uint8_t *data;
char *dup_type;
+ int is_opt = 0;
+ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
- if ((data = bam_aux_get(ori->b, "dt"))) {
+ if ((data = bam_aux_get(current->b, "dt"))) {
if ((dup_type = bam_aux2Z(data))) {
if (strcmp(dup_type, "SQ") == 0) {
- is_ori_opt = 1;
+ c->opt = 1;
}
}
}
- if ((data = bam_aux_get(current->b, "dt"))) {
- if ((dup_type = bam_aux2Z(data))) {
- if (strcmp(dup_type, "SQ") == 0) {
- is_cur_opt = 1;
- }
+ // need to run this to get the duplicates x and y scores
+ is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
+
+ if (!c->opt && is_opt) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ ret = -1;
+ break;
}
+
+ c->opt = 1;
}
- if (!(is_ori_opt && is_cur_opt)) {
- // if both are already optical duplicates there is no need to check again, otherwise...
+ c->score = calc_score(current->b);
- if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) {
- // find out which one is the duplicate
- int is_cur_dup = 0;
+ if (current_paired) {
+ if ((c->mate_score = get_mate_score(current->b)) == -1) {
+ fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ ret = -1;
+ break;
+ }
+ }
+ }
- if (have_original) {
- // compared against an original, this is a dup.
- is_cur_dup = 1;
- } else if (ori_paired != current_paired) {
- if (!current_paired) {
- // current is single vs pair, this is a dup.
- is_cur_dup = 1;
- }
- } else {
- // do it by scores
- int64_t ori_score, curr_score;
+ current = current->duplicate;
+ list->length++;
+ }
- if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) {
- if (ori->b->core.flag & BAM_FQCFAIL) {
- ori_score = 0;
- curr_score = 1;
- } else {
- ori_score = 1;
- curr_score = 0;
- }
- } else {
- ori_score = calc_score(ori->b);
- curr_score = calc_score(current->b);
-
- if (current_paired) {
- // they are pairs so add mate scores.
- int64_t mate_tmp;
-
- if ((mate_tmp = get_mate_score(ori->b)) == -1) {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
- ret = -1;
- break;
- } else {
- ori_score += mate_tmp;
- }
+ return ret;
+}
- if ((mate_tmp = get_mate_score(current->b)) == -1) {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
- ret = -1;
- break;
- } else {
- curr_score += mate_tmp;
- }
- }
- }
- if (ori_score == curr_score) {
- if (strcmp(bam_get_qname(current->b), ori_name) < 0) {
- curr_score++;
- } else {
- curr_score--;
- }
- }
+/* qsort() comparator ordering check_t entries by ascending x coordinate. */
+static int xcoord_sort(const void *a, const void *b) {
+    const check_t *ac = (const check_t *) a;
+    const check_t *bc = (const check_t *) b;
- if (ori_score > curr_score) {
- is_cur_dup = 1;
- }
+    // Three-way compare: returning the raw long difference would be
+    // truncated to int and could report the wrong sign for distant values.
+    return (ac->x > bc->x) - (ac->x < bc->x);
+}
+
+
+/* Check all the duplicates against each other to see if they are optical duplicates.
+   The list is sorted by x coordinate so each entry is only compared with later entries
+   whose x value lies within param->opt_dist.  Of an optical pair, the entry judged the
+   lesser (single vs pair, QC-fail, then score, then name) is retagged via optical_retag().
+   Returns 0 on success or -1 if retagging fails.
+   NOTE(review): list->length is size_t, so "length - 1" wraps if the list is empty;
+   the visible caller only passes chains with at least one duplicate -- confirm. */
+static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
+ long *warn, long *optical_single, long *optical_pair) {
+ int ret = 0;
+ size_t curr = 0;
+
+ qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+
+ while (curr < list->length - 1) { // the last entry has nothing after it to compare with
+ check_t *current = &list->c[curr];
+ size_t count = curr;
+ char *cur_name = bam_get_qname(current->b);
+ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+
+ while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
+ // while close enough along the x coordinate
+ check_t *chk = &list->c[count];
+
+ if (current->opt && chk->opt)
+ continue;
+
+ // if both are already optical duplicates there is no need to check again, otherwise...
+
+ long ydiff;
+
+ if (current->y > chk->y) {
+ ydiff = current->y - chk->y;
+ } else {
+ ydiff = chk->y - current->y;
+ }
+
+ if (ydiff > param->opt_dist)
+ continue;
+
+ // the number are right, check the names
+ if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) // compares the name up to just before the x field
+ continue;
+
+ // optical duplicates
+ int chk_dup = 0; // 1: chk gets retagged, 0: current gets retagged
+ int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+
+ if (current_paired != chk_paired) {
+ if (!chk_paired) {
+ // chk is single vs pair, this is a dup.
+ chk_dup = 1;
+ }
+ } else {
+ // do it by scores
+ int64_t cur_score, chk_score;
+
+ if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+ if (current->b->core.flag & BAM_FQCFAIL) { // the QC-failed read always loses
+ cur_score = 0;
+ chk_score = 1;
+ } else {
+ cur_score = 1;
+ chk_score = 0;
}
+ } else {
+ cur_score = current->score;
+ chk_score = chk->score;
- if (is_cur_dup) {
- // the current is the optical duplicate
- if (!is_cur_opt) { // only change if not already an optical duplicate
- if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
- ret = -1;
- break;
- }
- }
+ if (current_paired) {
+ // they are pairs so add mate scores.
+ chk_score += chk->mate_score;
+ cur_score += current->mate_score;
+ }
+ }
+
+ if (cur_score == chk_score) { // tie: break it on read name order
+ if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
+ chk_score++;
} else {
- if (!is_ori_opt) {
- if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) {
- ret = -1;
- break;
- }
- }
+ chk_score--;
}
}
+
+ if (cur_score > chk_score) {
+ chk_dup = 1;
+ }
+ }
+
+ if (chk_dup) {
+ // the duplicate is the optical duplicate
+ if (!chk->opt) { // only change if not already an optical duplicate
+ if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+ ret = -1;
+ goto fail;
+ }
+
+ chk->opt = 1;
+ }
+ } else {
+ if (!current->opt) { // current is the lesser read; retag it unless already optical
+ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ ret = -1;
+ goto fail;
+ }
+
+ current->opt = 1;
+ }
}
}
- current = current->duplicate;
+ curr++;
+ }
+
+ fail:
+ return ret;
+}
+
+
+/* Where there is more than one duplicate go down the list and check for optical duplicates and change
+ do tags (where used) to point to original (non-duplicate) read.
+ Walks read_buffer from its start.  With check_range set the walk stops at the first read
+ that is still inside the window defined by prev_coord, prev_tid and param->max_length;
+ otherwise it stops at the first entry without a read name.
+ Returns 0 on success or -1 on a real error; a read name whose coordinates cannot be decoded
+ only causes that chain to be dropped. */
+static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+ const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
+ long *optical_pair, const int check_range) {
+ int ret = 0;
+ kliter_t(read_queue) *rq;
+
+ rq = kl_begin(read_buffer);
+
+ while (rq != kl_end(read_buffer)) {
+ read_queue_t *in_read = &kl_val(rq);
+
+ if (check_range) {
+ /* Just check against the moving window of reads based on coordinates and max read length. */
+ if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
+ break;
+ }
+ } else {
+ // this is the last set of results and the end entry will be blank
+ if (!bam_get_qname(in_read->b)) {
+ break;
+ }
+ }
+
+ if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
+
+ // check against the original for tagging and optical duplication
+ if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
+ if (ret < 0) { // real error
+ ret = -1;
+ break;
+ } else { // coordinate decoding error
+ ret = 0;
+ in_read->duplicate = NULL; // chain dropped, so the retry below falls through to kl_next()
+ continue;
+ }
+ }
+
+ // check the rest of the duplicates against each other for optical duplication
+ if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
+ ret = -1;
+ break;
+ }
+
+ in_read->duplicate = NULL; // chain fully processed; do not revisit it
+ }
+
+ rq = kl_next(rq);
}
return ret;
}
+
/*
Function to use when estimating library size.
/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
-static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) {
+static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
unsigned long estimated_size = 0;
+ unsigned long non_optical_pairs = (paired_reads - optical) / 2;
+ unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
+ unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
- read_pairs /= 2;
- duplicate_pairs /= 2;
-
- if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) {
- unsigned long unique_pairs = read_pairs - duplicate_pairs;
+ if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
double m = 1;
double M = 100;
int i;
- if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) {
+ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n");
return estimated_size;
}
- while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) {
+ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
M *= 10;
}
for (i = 0; i < 40; i++) {
double r = (m + M) / 2;
- double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs);
+ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
if (u > 0) {
m = r;
fprintf(stderr, "[markdup] warning: unable to calculate estimated library size."
" Read pairs %ld should be greater than duplicate pairs %ld,"
" which should both be non zero.\n",
- read_pairs, duplicate_pairs);
+ non_optical_pairs, duplicate_pairs);
}
return estimated_size;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
+ check_list_t dup_list = {NULL, 0, 0};
if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
fprintf(stderr, "[markdup] out of memory\n");
goto fail;
}
+ if (param->check_chain && !(param->tag || param->opt_dist))
+ param->check_chain = 0;
+
+ if (param->check_chain) {
+ dup_list.size = 128;
+ dup_list.c = NULL;
+
+ if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
+ fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+ goto fail;
+ }
+ }
+
reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
np_duplicate = np_opt_duplicate = 0;
while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
+ int dup_checked = 0;
// do some basic coordinate order checks
if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
prev_tid = in_read->b->core.tid;
in_read->pair_key.single = 1;
in_read->single_key.single = 0;
+ in_read->duplicate = NULL;
+ in_read->dup_checked = 0;
reading++;
// read must not be secondary, supplementary, unmapped or (possibly) failed QC
if (!(in_read->b->core.flag & exclude)) {
examined++;
- in_read->duplicate = NULL;
+
// look at the pairs first
if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
// scores more than one read of the pair
bam1_t *dup = bp->p->b;
- in_read->duplicate = bp->p;
+ if (param->check_chain)
+ in_read->duplicate = bp->p;
+
bp->p = in_read;
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
goto fail;
single_dup++;
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
}
} else {
fprintf(stderr, "[markdup] error: single hashing failure.\n");
in_read->pair_key = pair_key;
} else if (ret == 0) {
int64_t old_score, new_score, tie_add = 0;
- bam1_t *dup;
- int check_chain = 0;
+ bam1_t *dup = NULL;
bp = &kh_val(pair_hash, k);
if (new_score + tie_add > old_score) { // swap reads
dup = bp->p->b;
- in_read->duplicate = bp->p;
+
+ if (param->check_chain) {
+
+ if (in_read->duplicate) {
+ read_queue_t *current = in_read->duplicate;
+
+ while (current->duplicate) {
+ current = current->duplicate;
+ }
+
+ current->duplicate = bp->p;
+ } else {
+ in_read->duplicate = bp->p;
+ }
+ }
+
bp->p = in_read;
} else {
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ if (in_read->duplicate) {
+ read_queue_t *current = bp->p->duplicate;
+
+ while (current->duplicate) {
+ current = current->duplicate;
+ }
+
+ current->duplicate = in_read->duplicate;
+ }
+
+ in_read->duplicate = bp->p->duplicate;
+ }
+
+ bp->p->duplicate = in_read;
}
- bp->p->duplicate = in_read;
dup = in_read->b;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
goto fail;
- if (check_chain) {
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
- }
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
duplicate++;
} else {
fprintf(stderr, "[markdup] error: pair hashing failure.\n");
int ret;
key_data_t single_key;
in_hash_t *bp;
- int check_chain = 0;
make_single_key(&single_key, in_read->b);
if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
// if matched against one of a pair just mark as duplicate
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
- }
-
- bp->p->duplicate = in_read;
-
- if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
- goto fail;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ in_read->duplicate = bp->p->duplicate;
+ }
- if (check_chain) {
- // check the new duplicate entry in the chain
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
+ bp->p->duplicate = in_read;
}
- // check against the new original
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
goto fail;
} else {
int64_t old_score, new_score;
- bam1_t *dup;
+ bam1_t *dup = NULL;
old_score = calc_score(bp->p->b);
new_score = calc_score(in_read->b);
// to the single hash and mark the other as duplicate
if (new_score > old_score) { // swap reads
dup = bp->p->b;
- in_read->duplicate = bp->p;
+
+ if (param->check_chain)
+ in_read->duplicate = bp->p;
+
bp->p = in_read;
} else {
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ in_read->duplicate = bp->p->duplicate;
+ }
+
+ bp->p->duplicate = in_read;
}
- bp->p->duplicate = in_read;
dup = in_read->b;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
goto fail;
-
-
- if (check_chain) {
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
- }
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
-
- }
+ }
single_dup++;
} else {
break;
}
+ if (!dup_checked && param->check_chain) {
+ // check for multiple optical duplicates of the same original read
+
+ if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
+ fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
+ goto fail;
+ }
+
+ dup_checked = 1;
+ }
+
+
+ if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
+ break;
+ }
+
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
goto fail;
}
+ // one last check
+ if (param->tag || param->opt_dist) {
+ if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
+ fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
+ goto fail;
+ }
+ }
+
// write out the end of the list
rq = kl_begin(read_buffer);
while (rq != kl_end(read_buffer)) {
np_duplicate++;
if (param->tag && kh_val(dup_hash, k).name) {
- if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) {
+ if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
goto fail;
}
if (param->opt_dist) {
if (kh_val(dup_hash, k).type) {
- bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ");
+ bam_aux_update_str(b, "dt", 3, "SQ");
np_opt_duplicate++;
} else {
- bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB");
+ bam_aux_update_str(b, "dt", 3, "LB");
}
}
}
fp = stderr;
}
- els = estimate_library_size(pair, duplicate - optical);
+ els = estimate_library_size(pair, duplicate, optical);
fprintf(fp,
"COMMAND: %s\n"
}
}
+ if (param->check_chain && (param->tag || param->opt_dist))
+ free(dup_list.c);
+
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
kl_destroy(read_queue, read_buffer);
}
kh_destroy(duplicates, dup_hash);
+ if (param->check_chain && (param->tag || param->opt_dist))
+ free(dup_list.c);
+
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
sam_hdr_destroy(header);
fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n"
" TYPE = t measure positions based on template start/end (default).\n"
" s measure positions based on sequence start.\n");
+ fprintf(stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n");
+ fprintf(stderr, " -u Output uncompressed data\n");
fprintf(stderr, " --include-fails Include quality check failed reads.\n");
fprintf(stderr, " --no-PG Do not add a PG line\n");
+ fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n");
fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
int c, ret;
- char wmode[3] = {'w', 'b', 0};
+ char wmode[4] = {'w', 'b', 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
struct stat st;
unsigned int t;
- md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL};
+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"include-fails", no_argument, NULL, 1001},
{"no-PG", no_argument, NULL, 1002},
{"mode", required_argument, NULL, 'm'},
+ {"no-multi-dup", no_argument, NULL, 1003},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
switch (c) {
case 'r': param.remove_dups = 1; break;
case 'l': param.max_length = atoi(optarg); break;
}
break;
+ case 'u': wmode[2] = '0'; break;
case 1001: param.include_fails = 1; break;
case 1002: param.no_pg = 1; break;
+ case 1003: param.check_chain = 0; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2019 Genome Research Ltd.
+ Copyright (C) 2017-2020 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
int mode;
int write_index;
int include_fails;
+ int check_chain;
char *stats_file;
char *arg_list;
char *out_fn;
bam1_t *b;
struct read_queue_s *duplicate;
hts_pos_t pos;
+ int dup_checked;
} read_queue_t;
typedef struct {
char type;
} dup_map_t;
+/* One duplicate read in a chain, with the values cached for optical duplicate checking. */
+typedef struct {
+    bam1_t *b;            // the duplicate read itself
+    int64_t score;        // result of calc_score() for the read
+    int64_t mate_score;   // result of get_mate_score(), filled in for paired reads
+    long x;               // x coordinate decoded from the read name (-1 until decoded)
+    long y;               // y coordinate decoded from the read name (-1 until decoded)
+    int opt;              // non-zero once the read is marked as an optical duplicate
+    int xpos;             // offset of the x coordinate within the read name
+} check_t;
+/* Growable array of check_t entries, reused for each duplicate chain. */
+typedef struct {
+    check_t *c;       // the entries
+    size_t size;      // number of entries allocated
+    size_t length;    // number of entries in use
+} check_list_t;
+
static khint32_t do_hash(unsigned char *key, khint32_t len);
static khint_t hash_key(key_data_t key) {
}
+/* Get the position of the coordinates from the read name. */
static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
int sep = 0;
int pos = 0;
return sep;
}
+
+/* Decode the x and y coordinates held in a read name.
+   On success returns 0 and fills in xpos_out (offset of the x coordinate in the
+   name), x_coord and y_coord.  On failure returns 1, bumps the warning count and
+   leaves the outputs untouched. */
+static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
+    int xpos = 0, ypos = 0;
+    long x_val, y_val;
+    char *end;
+    const char *field;
+    int seps = get_coordinate_positions(name, &xpos, &ypos);
+
+    /* The most current Illumina read format at time of writing is:
+       @machine:run:flowcell:lane:tile:x:y:UMI or
+       @machine:run:flowcell:lane:tile:x:y
+
+       Counting the separating colons gives us a quick format check.
+       Older name formats have fewer elements.
+    */
+    switch (seps) {
+        case 3:
+        case 4:
+        case 6:
+        case 7:
+            break;
+
+        default:
+            if (++(*warnings) <= BMD_WARNING_MAX) {
+                fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
+            }
+
+            return 1;
+    }
+
+    field = name + xpos;
+    x_val = strtol(field, &end, 10);
+
+    if (end == field) { // no digits consumed
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+        }
+
+        return 1;
+    }
+
+    field = name + ypos;
+    y_val = strtol(field, &end, 10);
+
+    if (end == field) { // no digits consumed
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(samtools_stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
+        }
+
+        return 1;
+    }
+
+    *x_coord = x_val;
+    *y_coord = y_val;
+    *xpos_out = xpos;
+
+    return 0;
+}
+
+
/* Using the coordinates from the Illumina read name, see whether the duplicated read is
close enough (set by max_dist) to the original to be counted as optical.*/
}
+/* Using the coordinates from the Illumina read name, see whether the duplicated read is
+   close enough (set by max_dist) to the original to be counted as optical.
+
+   This function needs the values from the first read to be already calculated:
+   name, oxpos, ox and oy describe the original; dup is the duplicate being tested.
+
+   The duplicate's decoded coordinates are stored in c (x, y and xpos) and c->opt is
+   set when the read is found to be optical.  If the duplicate's name cannot be
+   decoded, c->x and c->y are left as the caller set them and c->xpos is set to 0 so
+   that it never holds an indeterminate value.
+
+   Returns 1 if dup is an optical duplicate of the original, 0 otherwise. */
+
+static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
+    int ret = 0;
+    char *duplicate;
+    int dxpos = 0;
+    long dx, dy;
+
+    duplicate = bam_get_qname(dup);
+
+    if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) {
+        // undecodable name: xpos is read later when comparing duplicates against
+        // each other, so it must not be left unset
+        c->xpos = 0;
+        return ret;
+    }
+
+    if (strncmp(name, duplicate, oxpos - 1) == 0) {
+        // the initial parts match, look at the numbers
+        long xdiff, ydiff;
+
+        if (ox > dx) {
+            xdiff = ox - dx;
+        } else {
+            xdiff = dx - ox;
+        }
+
+        if (xdiff <= max_dist) {
+            // still might be optical
+
+            if (oy > dy) {
+                ydiff = oy - dy;
+            } else {
+                ydiff = dy - oy;
+            }
+
+            if (ydiff <= max_dist) ret = 1;
+        }
+    }
+
+    // keep the decoded values for the later duplicate against duplicate checks
+    c->x = dx;
+    c->y = dy;
+    c->xpos = dxpos;
+
+    // only ever set the optical flag here, never clear one set by the caller
+    if (ret) {
+        c->opt = ret;
+    }
+
+    return ret;
+}
+
+
+/* Mark the read as a duplicate and update the duplicate hash (if needed) */
static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
long *optical, long *warn) {
char dup_type = 0;
dup->core.flag |= BAM_FDUP;
if (param->tag) {
- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) {
+ if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n");
return -1;
}
if (param->opt_dist) { // mark optical duplicates
if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
- bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ");
+ bam_aux_update_str(dup, "dt", 3, "SQ");
dup_type = 'O';
(*optical)++;
} else {
// not an optical duplicate
- bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB");
+ bam_aux_update_str(dup, "dt", 3, "LB");
}
}
}
+/* If the duplicate type has changed to optical then retag and duplicate hash. */
static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
int ret = 0;
- uint8_t *data;
- // remove any existing dt tag
- if ((data = bam_aux_get(b, "dt")) != NULL) {
- bam_aux_del(b, data);
- }
-
- if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) {
- fprintf(samtools_stderr, "[markdup] error: unable to append 'dt' tag.\n");
+ if (bam_aux_update_str(b, "dt", 3, "SQ")) {
+ fprintf(samtools_stderr, "[markdup] error: unable to update 'dt' tag.\n");
ret = -1;
}
}
+/* Check all duplicates of the highest quality read (the "original") for consistency. Also
+   pre-calculate any values for use in check_duplicate_chain later.
+   Returns 0 on success, >0 on coordinate reading error (program can continue) or
+   <0 on an error (program should not continue). */
+static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
+ check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
-/*
- Where there is more than one duplicate go down the list and check for optical duplicates and change
- do tags (where used) to point to original (non-duplicate) read.
-*/
-static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
- long *warn, long *optical_single, long *optical_pair) {
int ret = 0;
- read_queue_t *current = ori->duplicate;
char *ori_name = bam_get_qname(ori->b);
- int have_original = !(ori->b->core.flag & BAM_FDUP);
- int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP);
+ read_queue_t *current = ori->duplicate;
+ int xpos;
+ long x, y;
+
+ if (param->opt_dist) {
+ if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
+ return ret;
+ }
+ }
+
+ list->length = 0;
while (current) {
- int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+ check_t *c;
+
+ if (list->length >= list->size) {
+ check_t *tmp;
+
+ list->size *= 2;
+
+ if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
+ fprintf(samtools_stderr, "[markdup] error: Unable to expand opt check list.\n");
+ return -1;
+ }
+
+ list->c = tmp;
+ }
+
+ c = &list->c[list->length];
- if (param->tag && have_original) {
+ c->b = current->b;
+ c->x = -1;
+ c->y = -1;
+ c->opt = 0;
+ c->score = 0;
+ c->mate_score = 0;
+ current->dup_checked = 1;
+
+ if (param->tag) {
uint8_t *data;
// at this stage all duplicates should have a do tag
if (old_name) {
if (strcmp(old_name, ori_name) != 0) {
- bam_aux_del(current->b, data);
-
- if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) {
- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n");
+ if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
+ fprintf(samtools_stderr, "[markdup] error: unable to update 'do' tag.\n");
ret = -1;
break;
}
}
if (param->opt_dist) {
- int is_cur_opt = 0, is_ori_opt = 0;
uint8_t *data;
char *dup_type;
+ int is_opt = 0;
+ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
- if ((data = bam_aux_get(ori->b, "dt"))) {
+ if ((data = bam_aux_get(current->b, "dt"))) {
if ((dup_type = bam_aux2Z(data))) {
if (strcmp(dup_type, "SQ") == 0) {
- is_ori_opt = 1;
+ c->opt = 1;
}
}
}
- if ((data = bam_aux_get(current->b, "dt"))) {
- if ((dup_type = bam_aux2Z(data))) {
- if (strcmp(dup_type, "SQ") == 0) {
- is_cur_opt = 1;
- }
+ // need to run this to get the duplicates x and y scores
+ is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
+
+ if (!c->opt && is_opt) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ ret = -1;
+ break;
}
+
+ c->opt = 1;
}
- if (!(is_ori_opt && is_cur_opt)) {
- // if both are already optical duplicates there is no need to check again, otherwise...
+ c->score = calc_score(current->b);
- if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) {
- // find out which one is the duplicate
- int is_cur_dup = 0;
+ if (current_paired) {
+ if ((c->mate_score = get_mate_score(current->b)) == -1) {
+ fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ ret = -1;
+ break;
+ }
+ }
+ }
- if (have_original) {
- // compared against an original, this is a dup.
- is_cur_dup = 1;
- } else if (ori_paired != current_paired) {
- if (!current_paired) {
- // current is single vs pair, this is a dup.
- is_cur_dup = 1;
- }
- } else {
- // do it by scores
- int64_t ori_score, curr_score;
+ current = current->duplicate;
+ list->length++;
+ }
- if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) {
- if (ori->b->core.flag & BAM_FQCFAIL) {
- ori_score = 0;
- curr_score = 1;
- } else {
- ori_score = 1;
- curr_score = 0;
- }
- } else {
- ori_score = calc_score(ori->b);
- curr_score = calc_score(current->b);
-
- if (current_paired) {
- // they are pairs so add mate scores.
- int64_t mate_tmp;
-
- if ((mate_tmp = get_mate_score(ori->b)) == -1) {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
- ret = -1;
- break;
- } else {
- ori_score += mate_tmp;
- }
+ return ret;
+}
- if ((mate_tmp = get_mate_score(current->b)) == -1) {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
- ret = -1;
- break;
- } else {
- curr_score += mate_tmp;
- }
- }
- }
- if (ori_score == curr_score) {
- if (strcmp(bam_get_qname(current->b), ori_name) < 0) {
- curr_score++;
- } else {
- curr_score--;
- }
- }
+/* qsort() comparison function ordering check_t entries by ascending x coordinate.
+   The values are compared rather than subtracted because the difference of two
+   longs need not fit in the int return value. */
+static int xcoord_sort(const void *a, const void *b) {
+    const check_t *ac = (const check_t *) a;
+    const check_t *bc = (const check_t *) b;
-                        if (ori_score > curr_score) {
-                            is_cur_dup = 1;
-                        }
+    return (ac->x > bc->x) - (ac->x < bc->x);
+}
+
+
+/* Check all the duplicates against each other to see if they are optical duplicates.
+   The list is sorted by x coordinate so each entry is only compared with the
+   entries that follow it and lie within param->opt_dist along x.  Of a pair found
+   to be optical duplicates of each other, the one judged to be the duplicate is
+   retagged with optical_retag() unless it is already marked optical.
+   Returns 0 on success or -1 if retagging fails. */
+static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
+                                   long *warn, long *optical_single, long *optical_pair) {
+    int ret = 0;
+    size_t curr = 0;
+
+    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+
+    // NOTE(review): list->length is a size_t, so length - 1 wraps round for an empty
+    // list; this appears to rely on the list never being empty here - confirm.
+    while (curr < list->length - 1) {
+        check_t *current = &list->c[curr];
+        size_t count = curr;
+        char *cur_name = bam_get_qname(current->b);
+        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+
+        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
+            // while close enough along the x coordinate
+            check_t *chk = &list->c[count];
+
+            if (current->opt && chk->opt)
+                continue;
+
+            // if both are already optical duplicates there is no need to check again, otherwise...
+
+            long ydiff;
+
+            if (current->y > chk->y) {
+                ydiff = current->y - chk->y;
+            } else {
+                ydiff = chk->y - current->y;
+            }
+
+            if (ydiff > param->opt_dist)
+                continue;
+
+            // the number are right, check the names
+            if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0)
+                continue;
+
+            // optical duplicates
+            int chk_dup = 0;
+            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+
+            if (current_paired != chk_paired) {
+                if (!chk_paired) {
+                    // chk is single vs pair, this is a dup.
+                    chk_dup = 1;
+                }
+            } else {
+                // do it by scores
+                int64_t cur_score, chk_score;
+
+                // a read that failed QC always loses to one that did not
+                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                    if (current->b->core.flag & BAM_FQCFAIL) {
+                        cur_score = 0;
+                        chk_score = 1;
+                    } else {
+                        cur_score = 1;
+                        chk_score = 0;
                     }
+                } else {
+                    cur_score = current->score;
+                    chk_score = chk->score;
-                    if (is_cur_dup) {
-                        // the current is the optical duplicate
-                        if (!is_cur_opt) { // only change if not already an optical duplicate
-                            if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                    if (current_paired) {
+                        // they are pairs so add mate scores.
+                        chk_score += chk->mate_score;
+                        cur_score += current->mate_score;
+                    }
+                }
+
+                // break ties on score using the read names
+                if (cur_score == chk_score) {
+                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
+                        chk_score++;
                     } else {
-                        if (!is_ori_opt) {
-                            if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                        chk_score--;
                     }
                 }
+
+                if (cur_score > chk_score) {
+                    chk_dup = 1;
+                }
+            }
+
+            if (chk_dup) {
+                // the duplicate is the optical duplicate
+                if (!chk->opt) { // only change if not already an optical duplicate
+                    if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    chk->opt = 1;
+                }
+            } else {
+                // otherwise current is the optical duplicate
+                if (!current->opt) {
+                    if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    current->opt = 1;
+                }
             }
         }
-        current = current->duplicate;
+        curr++;
+    }
+
+ fail:
+    return ret;
+}
+
+
+/* Where there is more than one duplicate go down the list and check for optical duplicates and change
+ do tags (where used) to point to original (non-duplicate) read. */
+static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+                       const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
+                       long *optical_pair, const int check_range) {
+    /* Walks the read buffer from its start, processing every read that heads a
+       duplicate chain.  Returns 0 on success or -1 if a chain check fails. */
+    kliter_t(read_queue) *iter = kl_begin(read_buffer);
+    int ret = 0;
+
+    while (iter != kl_end(read_buffer)) {
+        read_queue_t *entry = &kl_val(iter);
+        int at_limit;
+
+        if (check_range) {
+            /* Stop at the first read that is still inside the moving window
+               (coordinate plus max read length) of the current position. */
+            at_limit = entry->pos + param->max_length > prev_coord && entry->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1);
+        } else {
+            // final pass: the entry at the end of the buffer is blank (no read name)
+            at_limit = !bam_get_qname(entry->b);
+        }
+
+        if (at_limit)
+            break;
+
+        if (!(entry->b->core.flag & BAM_FDUP) && entry->duplicate) { // head of a duplicate chain
+
+            // tagging and optical checks of each duplicate against the original
+            ret = check_chain_against_original(param, dup_hash, entry, dup_list, warn, optical_single, optical_pair);
+
+            if (ret < 0) { // real error
+                ret = -1;
+                break;
+            }
+
+            if (ret > 0) { // coordinate decoding error, drop the chain and carry on
+                ret = 0;
+                entry->duplicate = NULL;
+                continue;
+            }
+
+            // optical checks of the duplicates against each other
+            if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
+                ret = -1;
+                break;
+            }
+
+            entry->duplicate = NULL;
+        }
+
+        iter = kl_next(iter);
     }
     return ret;
 }
+
/*
Function to use when estimating library size.
/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
-static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) {
+static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
unsigned long estimated_size = 0;
+ unsigned long non_optical_pairs = (paired_reads - optical) / 2;
+ unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
+ unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
- read_pairs /= 2;
- duplicate_pairs /= 2;
-
- if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) {
- unsigned long unique_pairs = read_pairs - duplicate_pairs;
+ if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
double m = 1;
double M = 100;
int i;
- if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) {
+ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n");
return estimated_size;
}
- while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) {
+ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
M *= 10;
}
for (i = 0; i < 40; i++) {
double r = (m + M) / 2;
- double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs);
+ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
if (u > 0) {
m = r;
fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size."
" Read pairs %ld should be greater than duplicate pairs %ld,"
" which should both be non zero.\n",
- read_pairs, duplicate_pairs);
+ non_optical_pairs, duplicate_pairs);
}
return estimated_size;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
+ check_list_t dup_list = {NULL, 0, 0};
if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
fprintf(samtools_stderr, "[markdup] out of memory\n");
goto fail;
}
+ if (param->check_chain && !(param->tag || param->opt_dist))
+ param->check_chain = 0;
+
+ if (param->check_chain) {
+ dup_list.size = 128;
+ dup_list.c = NULL;
+
+ if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
+ fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+ goto fail;
+ }
+ }
+
reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
np_duplicate = np_opt_duplicate = 0;
while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
+ int dup_checked = 0;
// do some basic coordinate order checks
if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
prev_tid = in_read->b->core.tid;
in_read->pair_key.single = 1;
in_read->single_key.single = 0;
+ in_read->duplicate = NULL;
+ in_read->dup_checked = 0;
reading++;
// read must not be secondary, supplementary, unmapped or (possibly) failed QC
if (!(in_read->b->core.flag & exclude)) {
examined++;
- in_read->duplicate = NULL;
+
// look at the pairs first
if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
// scores more than one read of the pair
bam1_t *dup = bp->p->b;
- in_read->duplicate = bp->p;
+ if (param->check_chain)
+ in_read->duplicate = bp->p;
+
bp->p = in_read;
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
goto fail;
single_dup++;
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
}
} else {
fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n");
in_read->pair_key = pair_key;
} else if (ret == 0) {
int64_t old_score, new_score, tie_add = 0;
- bam1_t *dup;
- int check_chain = 0;
+ bam1_t *dup = NULL;
bp = &kh_val(pair_hash, k);
if (new_score + tie_add > old_score) { // swap reads
dup = bp->p->b;
- in_read->duplicate = bp->p;
+
+ if (param->check_chain) {
+
+ if (in_read->duplicate) {
+ read_queue_t *current = in_read->duplicate;
+
+ while (current->duplicate) {
+ current = current->duplicate;
+ }
+
+ current->duplicate = bp->p;
+ } else {
+ in_read->duplicate = bp->p;
+ }
+ }
+
bp->p = in_read;
} else {
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ if (in_read->duplicate) {
+ read_queue_t *current = bp->p->duplicate;
+
+ while (current->duplicate) {
+ current = current->duplicate;
+ }
+
+ current->duplicate = in_read->duplicate;
+ }
+
+ in_read->duplicate = bp->p->duplicate;
+ }
+
+ bp->p->duplicate = in_read;
}
- bp->p->duplicate = in_read;
dup = in_read->b;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
goto fail;
- if (check_chain) {
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
- }
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
duplicate++;
} else {
fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n");
int ret;
key_data_t single_key;
in_hash_t *bp;
- int check_chain = 0;
make_single_key(&single_key, in_read->b);
if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
// if matched against one of a pair just mark as duplicate
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
- }
-
- bp->p->duplicate = in_read;
-
- if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
- goto fail;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ in_read->duplicate = bp->p->duplicate;
+ }
- if (check_chain) {
- // check the new duplicate entry in the chain
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
+ bp->p->duplicate = in_read;
}
- // check against the new original
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
goto fail;
} else {
int64_t old_score, new_score;
- bam1_t *dup;
+ bam1_t *dup = NULL;
old_score = calc_score(bp->p->b);
new_score = calc_score(in_read->b);
// to the single hash and mark the other as duplicate
if (new_score > old_score) { // swap reads
dup = bp->p->b;
- in_read->duplicate = bp->p;
+
+ if (param->check_chain)
+ in_read->duplicate = bp->p;
+
bp->p = in_read;
} else {
- if (bp->p->duplicate) {
- in_read->duplicate = bp->p->duplicate;
- check_chain = 1;
+ if (param->check_chain) {
+ if (bp->p->duplicate) {
+ in_read->duplicate = bp->p->duplicate;
+ }
+
+ bp->p->duplicate = in_read;
}
- bp->p->duplicate = in_read;
dup = in_read->b;
}
if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
goto fail;
-
-
- if (check_chain) {
- if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
- goto fail;
- }
-
- if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
- goto fail;
-
-
- }
+ }
single_dup++;
} else {
break;
}
+ if (!dup_checked && param->check_chain) {
+ // check for multiple optical duplicates of the same original read
+
+ if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
+ fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
+ goto fail;
+ }
+
+ dup_checked = 1;
+ }
+
+
+ if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
+ break;
+ }
+
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
goto fail;
}
+ // one last check
+ if (param->tag || param->opt_dist) {
+ if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
+ fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
+ goto fail;
+ }
+ }
+
// write out the end of the list
rq = kl_begin(read_buffer);
while (rq != kl_end(read_buffer)) {
np_duplicate++;
if (param->tag && kh_val(dup_hash, k).name) {
- if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) {
+ if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
goto fail;
}
if (param->opt_dist) {
if (kh_val(dup_hash, k).type) {
- bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ");
+ bam_aux_update_str(b, "dt", 3, "SQ");
np_opt_duplicate++;
} else {
- bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB");
+ bam_aux_update_str(b, "dt", 3, "LB");
}
}
}
fp = samtools_stderr;
}
- els = estimate_library_size(pair, duplicate - optical);
+ els = estimate_library_size(pair, duplicate, optical);
fprintf(fp,
"COMMAND: %s\n"
}
}
+ if (param->check_chain && (param->tag || param->opt_dist))
+ free(dup_list.c);
+
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
kl_destroy(read_queue, read_buffer);
}
kh_destroy(duplicates, dup_hash);
+ if (param->check_chain && (param->tag || param->opt_dist))
+ free(dup_list.c);
+
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
sam_hdr_destroy(header);
fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n"
" TYPE = t measure positions based on template start/end (default).\n"
" s measure positions based on sequence start.\n");
+ fprintf(samtools_stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n");
+ fprintf(samtools_stderr, " -u Output uncompressed data\n");
fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n");
fprintf(samtools_stderr, " --no-PG Do not add a PG line\n");
+ fprintf(samtools_stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n");
fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
int c, ret;
- char wmode[3] = {'w', 'b', 0};
+ char wmode[4] = {'w', 'b', 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
struct stat st;
unsigned int t;
- md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL};
+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"include-fails", no_argument, NULL, 1001},
{"no-PG", no_argument, NULL, 1002},
{"mode", required_argument, NULL, 'm'},
+ {"no-multi-dup", no_argument, NULL, 1003},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
switch (c) {
case 'r': param.remove_dups = 1; break;
case 'l': param.max_length = atoi(optarg); break;
}
break;
+ case 'u': wmode[2] = '0'; break;
case 1001: param.include_fails = 1; break;
case 1002: param.no_pg = 1; break;
+ case 1003: param.check_chain = 0; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
curr = 1 - curr;
pre_end = cur_end;
}
- if (result < -1) goto fail;
+ if (result < -1) goto read_fail;
if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
bam1_t *pre = b[1-curr];
if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
ks_free(&str);
return 0;
+ read_fail:
+ print_error("fixmate", "Couldn't read from input file");
+ goto fail;
+
write_fail:
print_error_errno("fixmate", "Couldn't write to output file");
fail:
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n"
" -m Add mate score tag\n"
+" -u Uncompressed output\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(where, "-.O..@-.");
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- char wmode[3] = {'w', 'b', 0};
+ char wmode[4] = {'w', 'b', 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"no-PG", no_argument, NULL, 1},
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
case 'm': mate_score = 1; break;
+ case 'u': wmode[2] = '0'; break;
case 1: no_pg = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
curr = 1 - curr;
pre_end = cur_end;
}
- if (result < -1) goto fail;
+ if (result < -1) goto read_fail;
if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
bam1_t *pre = b[1-curr];
if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
ks_free(&str);
return 0;
+ read_fail:
+ print_error("fixmate", "Couldn't read from input file");
+ goto fail;
+
write_fail:
print_error_errno("fixmate", "Couldn't write to output file");
fail:
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n"
" -m Add mate score tag\n"
+" -u Uncompressed output\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(where, "-.O..@-.");
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- char wmode[3] = {'w', 'b', 0};
+ char wmode[4] = {'w', 'b', 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"no-PG", no_argument, NULL, 1},
// parse args
if (argc == 1) { usage(samtools_stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
case 'c': add_ct = 1; break;
case 'm': mate_score = 1; break;
+ case 'u': wmode[2] = '0'; break;
case 1: no_pg = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <string.h>
#include <ctype.h>
#include <limits.h>
+#include <errno.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
-void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode)
+static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
+ hts_pos_t ref_len, int flag, int max_nm,
+ int quiet_mode, uint32_t *skipped)
{
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
- int i, y, u = 0;
- hts_pos_t x;
- kstring_t *str;
+ int i, qpos, matched = 0;
+ hts_pos_t rpos;
+ kstring_t str = KS_INITIALIZE;
int32_t old_nm_i = -1, nm = 0;
+ uint32_t err = 0;
- str = (kstring_t*)calloc(1, sizeof(kstring_t));
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (c->l_qseq == 0) {
+ if (!quiet_mode) {
+ if (ref_name) {
+ fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
+ "record for '%s' at %s:%"PRIhts_pos", skipped\n",
+ bam_get_qname(b), ref_name, c->pos + 1);
+ } else {
+ fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
+ "record for '%s', skipped", bam_get_qname(b));
+ }
+ }
+ if (skipped) (*skipped)++;
+ return 0;
+ }
+
+ for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+ int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+ for (j = 0; j < oplen; ++j) {
+ int c1, c2, z = qpos + j;
+ if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+ break; // out of bounds
+ c1 = bam_seqi(seq, z);
+ c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
- ++u;
+ ++matched;
} else {
- kputw(u, str); kputc(toupper(ref[x+j]), str);
- u = 0; ++nm;
+ err |= kputw(matched, &str) < 0;
+ err |= kputc(toupper(ref[rpos+j]), &str) < 0;
+ matched = 0; ++nm;
}
}
- if (j < l) break;
- x += l; y += l;
+ if (j < oplen) break;
+ rpos += oplen; qpos += oplen;
} else if (op == BAM_CDEL) {
- kputw(u, str); kputc('^', str);
- for (j = 0; j < l; ++j) {
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- kputc(toupper(ref[x+j]), str);
+ err |= kputw(matched, &str) < 0;
+ err |= kputc('^', &str) < 0;
+ for (j = 0; j < oplen; ++j) {
+ if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
+ err |= kputc(toupper(ref[rpos+j]), &str) < 0;
}
- u = 0;
- x += j; nm += j;
- if (j < l) break;
+ matched = 0;
+ rpos += j; nm += j;
+ if (j < oplen) break;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
- y += l;
- if (op == BAM_CINS) nm += l;
+ qpos += oplen;
+ if (op == BAM_CINS) nm += oplen;
} else if (op == BAM_CREF_SKIP) {
- x += l;
+ rpos += oplen;
}
}
- kputw(u, str);
+ err |= kputw(matched, &str) < 0;
+ if (err) {
+ print_error_errno("calmd", "Couldn't build new MD string");
+ goto fail;
+ }
// apply max_nm
if (max_nm > 0 && nm >= max_nm) {
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+ int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+ for (j = 0; j < oplen; ++j) {
+ int c1, c2, z = qpos + j;
+ if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+ break; // out of bounds
+ c1 = bam_seqi(seq, z);
+ c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
seq[z/2] |= (z&1)? 0x0f : 0xf0;
bam_get_qual(b)[z] = 0;
}
}
- if (j < l) break;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ if (j < oplen) break;
+ rpos += oplen; qpos += oplen;
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
}
}
// update NM
if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
uint8_t *old_nm = bam_aux_get(b, "NM");
if (old_nm) old_nm_i = bam_aux2i(old_nm);
- if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ if (!old_nm) {
+ if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+ goto aux_fail;
+ }
else if (nm != old_nm_i) {
if (!quiet_mode) {
fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
}
- bam_aux_del(b, old_nm);
- bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
+ if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+ goto aux_fail;
}
}
// update MD
if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
uint8_t *old_md = bam_aux_get(b, "MD");
- if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
- else {
+ if (!old_md) {
+ if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+ goto aux_fail;
+ } else {
int is_diff = 0;
- if (strlen((char*)old_md+1) == str->l) {
- for (i = 0; i < str->l; ++i)
- if (toupper(old_md[i+1]) != toupper(str->s[i]))
+ if (strlen((char*)old_md+1) == str.l) {
+ for (i = 0; i < str.l; ++i)
+ if (toupper(old_md[i+1]) != toupper(str.s[i]))
break;
- if (i < str->l) is_diff = 1;
+ if (i < str.l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
if (!quiet_mode) {
- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
+ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
}
- bam_aux_del(b, old_md);
- bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ if (bam_aux_del(b, old_md) < 0) goto aux_fail;
+ if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+ goto aux_fail;
}
}
}
if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
}
- free(str->s); free(str);
+ free(str.s);
+ return 0;
+
+ aux_fail:
+ if (errno == ENOMEM) {
+ print_error("calmd", "Couldn't add aux tag (too long)");
+ } else if (errno == EINVAL) {
+ print_error("calmd", "Corrupt aux data");
+ } else {
+ print_error_errno("calmd", "Couldn't add aux tag");
+ }
+ fail:
+ free(str.s);
+ return -1;
}
-void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
+int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
{
- bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode);
+ return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
}
int calmd_usage() {
sam_hdr_t *header = NULL;
faidx_t *fai = NULL;
char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+ const char *ref_name = NULL;
bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ uint32_t skipped = 0;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
- ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len);
+ ref = NULL;
+ len = 0;
+ ref_name = sam_hdr_tid2name(header, b->core.tid);
+ if (ref_name) {
+ ref = fai_fetch64(fai, ref_name, &len);
+ }
tid = b->core.tid;
if (ref == 0) { // FIXME: Should this always be fatal?
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
- sam_hdr_tid2name(header, tid));
+ ref_name ? ref_name : "(unknown)");
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
+ if (is_realn) {
+ if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
+ print_error_errno("calmd", "BAQ alignment failed");
+ goto fail;
+ }
+ }
if (capQ > 10) {
int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
- if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode);
+ if (ref) {
+ if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
+ quiet_mode, &skipped) < 0)
+ goto fail;
+ }
}
if (sam_write1(fpout, header, b) < 0) {
print_error_errno("calmd", "failed to write to output file");
fprintf(stderr, "[bam_fillmd] Error reading input.\n");
goto fail;
}
+
+ if (skipped) {
+ fprintf(stderr, "[calmd] Warning: %"PRIu32" records skipped due "
+ "to no query sequence\n",
+ skipped);
+ }
+
bam_destroy1(b);
sam_hdr_destroy(header);
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <string.h>
#include <ctype.h>
#include <limits.h>
+#include <errno.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
-void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode)
+static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
+ hts_pos_t ref_len, int flag, int max_nm,
+ int quiet_mode, uint32_t *skipped)
{
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
bam1_core_t *c = &b->core;
- int i, y, u = 0;
- hts_pos_t x;
- kstring_t *str;
+ int i, qpos, matched = 0;
+ hts_pos_t rpos;
+ kstring_t str = KS_INITIALIZE;
int32_t old_nm_i = -1, nm = 0;
+ uint32_t err = 0;
- str = (kstring_t*)calloc(1, sizeof(kstring_t));
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (c->l_qseq == 0) {
+ if (!quiet_mode) {
+ if (ref_name) {
+ fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment "
+ "record for '%s' at %s:%"PRIhts_pos", skipped\n",
+ bam_get_qname(b), ref_name, c->pos + 1);
+ } else {
+ fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment "
+ "record for '%s', skipped", bam_get_qname(b));
+ }
+ }
+ if (skipped) (*skipped)++;
+ return 0;
+ }
+
+ for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+ int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+ for (j = 0; j < oplen; ++j) {
+ int c1, c2, z = qpos + j;
+ if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+ break; // out of bounds
+ c1 = bam_seqi(seq, z);
+ c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
- ++u;
+ ++matched;
} else {
- kputw(u, str); kputc(toupper(ref[x+j]), str);
- u = 0; ++nm;
+ err |= kputw(matched, &str) < 0;
+ err |= kputc(toupper(ref[rpos+j]), &str) < 0;
+ matched = 0; ++nm;
}
}
- if (j < l) break;
- x += l; y += l;
+ if (j < oplen) break;
+ rpos += oplen; qpos += oplen;
} else if (op == BAM_CDEL) {
- kputw(u, str); kputc('^', str);
- for (j = 0; j < l; ++j) {
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- kputc(toupper(ref[x+j]), str);
+ err |= kputw(matched, &str) < 0;
+ err |= kputc('^', &str) < 0;
+ for (j = 0; j < oplen; ++j) {
+ if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
+ err |= kputc(toupper(ref[rpos+j]), &str) < 0;
}
- u = 0;
- x += j; nm += j;
- if (j < l) break;
+ matched = 0;
+ rpos += j; nm += j;
+ if (j < oplen) break;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
- y += l;
- if (op == BAM_CINS) nm += l;
+ qpos += oplen;
+ if (op == BAM_CINS) nm += oplen;
} else if (op == BAM_CREF_SKIP) {
- x += l;
+ rpos += oplen;
}
}
- kputw(u, str);
+ err |= kputw(matched, &str) < 0;
+ if (err) {
+ print_error_errno("calmd", "Couldn't build new MD string");
+ goto fail;
+ }
// apply max_nm
if (max_nm > 0 && nm >= max_nm) {
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+ int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+ for (j = 0; j < oplen; ++j) {
+ int c1, c2, z = qpos + j;
+ if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+ break; // out of bounds
+ c1 = bam_seqi(seq, z);
+ c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
seq[z/2] |= (z&1)? 0x0f : 0xf0;
bam_get_qual(b)[z] = 0;
}
}
- if (j < l) break;
- x += l; y += l;
- } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ if (j < oplen) break;
+ rpos += oplen; qpos += oplen;
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
}
}
// update NM
if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
uint8_t *old_nm = bam_aux_get(b, "NM");
if (old_nm) old_nm_i = bam_aux2i(old_nm);
- if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ if (!old_nm) {
+ if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+ goto aux_fail;
+ }
else if (nm != old_nm_i) {
if (!quiet_mode) {
fprintf(samtools_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
}
- bam_aux_del(b, old_nm);
- bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+ if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
+ if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+ goto aux_fail;
}
}
// update MD
if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
uint8_t *old_md = bam_aux_get(b, "MD");
- if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
- else {
+ if (!old_md) {
+ if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+ goto aux_fail;
+ } else {
int is_diff = 0;
- if (strlen((char*)old_md+1) == str->l) {
- for (i = 0; i < str->l; ++i)
- if (toupper(old_md[i+1]) != toupper(str->s[i]))
+ if (strlen((char*)old_md+1) == str.l) {
+ for (i = 0; i < str.l; ++i)
+ if (toupper(old_md[i+1]) != toupper(str.s[i]))
break;
- if (i < str->l) is_diff = 1;
+ if (i < str.l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
if (!quiet_mode) {
- fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
+ fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
}
- bam_aux_del(b, old_md);
- bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ if (bam_aux_del(b, old_md) < 0) goto aux_fail;
+ if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+ goto aux_fail;
}
}
}
if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
}
- free(str->s); free(str);
+ free(str.s);
+ return 0;
+
+ aux_fail:
+ if (errno == ENOMEM) {
+ print_error("calmd", "Couldn't add aux tag (too long)");
+ } else if (errno == EINVAL) {
+ print_error("calmd", "Corrupt aux data");
+ } else {
+ print_error_errno("calmd", "Couldn't add aux tag");
+ }
+ fail:
+ free(str.s);
+ return -1;
}
-void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
+int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
{
- bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode);
+ return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
}
int calmd_usage() {
sam_hdr_t *header = NULL;
faidx_t *fai = NULL;
char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+ const char *ref_name = NULL;
bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ uint32_t skipped = 0;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
- ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len);
+ ref = NULL;
+ len = 0;
+ ref_name = sam_hdr_tid2name(header, b->core.tid);
+ if (ref_name) {
+ ref = fai_fetch64(fai, ref_name, &len);
+ }
tid = b->core.tid;
if (ref == 0) { // FIXME: Should this always be fatal?
fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
- sam_hdr_tid2name(header, tid));
+ ref_name ? ref_name : "(unknown)");
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
+ if (is_realn) {
+ if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
+ print_error_errno("calmd", "BAQ alignment failed");
+ goto fail;
+ }
+ }
if (capQ > 10) {
int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
- if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode);
+ if (ref) {
+ if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
+ quiet_mode, &skipped) < 0)
+ goto fail;
+ }
}
if (sam_write1(fpout, header, b) < 0) {
print_error_errno("calmd", "failed to write to output file");
fprintf(samtools_stderr, "[bam_fillmd] Error reading input.\n");
goto fail;
}
+
+ if (skipped) {
+ fprintf(samtools_stderr, "[calmd] Warning: %"PRIu32" records skipped due "
+ "to no query sequence\n",
+ skipped);
+ }
+
bam_destroy1(b);
sam_hdr_destroy(header);
/* bam_plcmd.c -- mpileup subcommand.
- Copyright (C) 2008-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
int del_len = -p->indel;
if (p->indel > 0) {
int len = bam_plp_insertion(p, ks, &del_len);
- if (len < 0)
+ if (len < 0) {
+ print_error("mpileup", "bam_plp_insertion() failed");
return -1;
+ }
putc('+', fp); printw(len, fp);
if (bam_is_rev(p->b)) {
char pad = rev_del ? '#' : '*';
#define MPLP_REDO_BAQ (1<<6)
#define MPLP_ILLUMINA13 (1<<7)
#define MPLP_IGNORE_RG (1<<8)
-#define MPLP_PRINT_QPOS (1<<9)
-#define MPLP_PER_SAMPLE (1<<11)
-#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PER_SAMPLE (1<<9)
+#define MPLP_SMART_OVERLAPS (1<<10)
+#define MPLP_PRINT_MAPQ_CHAR (1<<11)
+#define MPLP_PRINT_QPOS (1<<12)
#define MPLP_PRINT_QNAME (1<<13)
#define MPLP_PRINT_FLAG (1<<14)
#define MPLP_PRINT_RNAME (1<<15)
fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
fputs("\t0\t*\t*", fp);
- if (conf->flag & MPLP_PRINT_QPOS)
- fputs("\t*", fp);
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value)
fputs("\t*", fp);
fprintf(pileup_fp, "\t%d\t", cnt);
if (n_plp[i] == 0) {
fputs("*\t*", pileup_fp);
- if (conf->flag & MPLP_PRINT_QPOS)
- fputs("\t*", pileup_fp);
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value)
fputs("\t*", pileup_fp);
}
if (!n) putc('*', pileup_fp);
- /* Print mpileup positions */
- if (conf->flag & MPLP_PRINT_QPOS) {
- n = 0;
- putc('\t', pileup_fp);
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- int c = p->qpos < p->b->core.l_qseq
- ? bam_get_qual(p->b)[p->qpos]
- : 0;
- if ( c < conf->min_baseQ ) continue;
- if (n > 0) putc(',', pileup_fp);
- n++;
- fprintf(pileup_fp, "%d", p->qpos + 1);
- }
- if (!n) putc('*', pileup_fp);
- }
-
/* Print selected columns */
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value) {
n = 0;
? bam_get_qual(p->b)[p->qpos]
: 0;
if ( c < conf->min_baseQ ) continue;
- if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp);
+ if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp);
n++;
switch (flag_value) {
+ case MPLP_PRINT_MAPQ_CHAR:
+ c = p->b->core.qual + 33;
+ if (c > 126) c = 126;
+ putc(c, pileup_fp);
+ break;
+ case MPLP_PRINT_QPOS:
+ fprintf(pileup_fp, "%d", p->qpos + 1);
+ break;
case MPLP_PRINT_QNAME:
fputs(bam_get_qname(p->b), pileup_fp);
break;
fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1);
break;
case MPLP_PRINT_MAPQ:
- c = p->b->core.qual + 33;
- if (c > 126) c = 126;
- putc(c, pileup_fp);
+ fprintf(pileup_fp, "%d", p->b->core.qual);
break;
case MPLP_PRINT_RNEXT:
if (p->b->core.mtid >= 0)
}
}
+ if (ret < 0) {
+ print_error("mpileup", "error reading from input file");
+ ret = EXIT_FAILURE;
+ goto fail;
+ }
+
if (conf->all && !(conf->flag & MPLP_BCF)) {
// Handle terminating region
if (last_tid < 0 && conf->reg && conf->all > 1) {
fprintf(fp,
" -r, --region REG region in which pileup is generated\n"
" -R, --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
" -x, --ignore-overlaps disable read-pair overlap detection\n"
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
- case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
+ case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
/* bam_plcmd.c -- mpileup subcommand.
- Copyright (C) 2008-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
int del_len = -p->indel;
if (p->indel > 0) {
int len = bam_plp_insertion(p, ks, &del_len);
- if (len < 0)
+ if (len < 0) {
+ print_error("mpileup", "bam_plp_insertion() failed");
return -1;
+ }
putc('+', fp); printw(len, fp);
if (bam_is_rev(p->b)) {
char pad = rev_del ? '#' : '*';
#define MPLP_REDO_BAQ (1<<6)
#define MPLP_ILLUMINA13 (1<<7)
#define MPLP_IGNORE_RG (1<<8)
-#define MPLP_PRINT_QPOS (1<<9)
-#define MPLP_PER_SAMPLE (1<<11)
-#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PER_SAMPLE (1<<9)
+#define MPLP_SMART_OVERLAPS (1<<10)
+#define MPLP_PRINT_MAPQ_CHAR (1<<11)
+#define MPLP_PRINT_QPOS (1<<12)
#define MPLP_PRINT_QNAME (1<<13)
#define MPLP_PRINT_FLAG (1<<14)
#define MPLP_PRINT_RNAME (1<<15)
fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
fputs("\t0\t*\t*", fp);
- if (conf->flag & MPLP_PRINT_QPOS)
- fputs("\t*", fp);
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value)
fputs("\t*", fp);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
fprintf(samtools_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
if (m->n_plp[id] == m->m_plp[id]) {
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
if (n == 0) {
fprintf(samtools_stderr,"[%s] no input file/data given\n", __func__);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
// read the header of each file in the list and initialize data
if ( !data[i]->fp )
{
fprintf(samtools_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
fprintf(samtools_stderr, "[%s] failed to process %s: %s\n",
__func__, conf->fai_fname, strerror(errno));
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
data[i]->conf = conf;
data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
if ( !h_tmp ) {
fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp));
if (conf->flag & MPLP_BCF) {
if (idx == NULL) {
fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
fprintf(samtools_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
hts_idx_destroy(idx);
bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
if (bcf_fp == NULL) {
fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
// BCF header creation
if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) {
print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"",
conf->output_fname? conf->output_fname : "standard output");
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
// End of BCF header creation
if (pileup_fp == NULL) {
fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
}
if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"",
conf->output_fname?conf->output_fname:"standard output");
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"",
conf->output_fname?conf->output_fname:"standard output");
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
}
}
fprintf(pileup_fp, "\t%d\t", cnt);
if (n_plp[i] == 0) {
fputs("*\t*", pileup_fp);
- if (conf->flag & MPLP_PRINT_QPOS)
- fputs("\t*", pileup_fp);
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value)
fputs("\t*", pileup_fp);
}
if (!n) putc('*', pileup_fp);
- /* Print mpileup positions */
- if (conf->flag & MPLP_PRINT_QPOS) {
- n = 0;
- putc('\t', pileup_fp);
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- int c = p->qpos < p->b->core.l_qseq
- ? bam_get_qual(p->b)[p->qpos]
- : 0;
- if ( c < conf->min_baseQ ) continue;
- if (n > 0) putc(',', pileup_fp);
- n++;
- fprintf(pileup_fp, "%d", p->qpos + 1);
- }
- if (!n) putc('*', pileup_fp);
- }
-
/* Print selected columns */
- int flag_value = MPLP_PRINT_QNAME;
+ int flag_value = MPLP_PRINT_MAPQ_CHAR;
while(flag_value < MPLP_PRINT_QUAL + 1) {
if (conf->flag & flag_value) {
n = 0;
? bam_get_qual(p->b)[p->qpos]
: 0;
if ( c < conf->min_baseQ ) continue;
- if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp);
+ if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp);
n++;
switch (flag_value) {
+ case MPLP_PRINT_MAPQ_CHAR:
+ c = p->b->core.qual + 33;
+ if (c > 126) c = 126;
+ putc(c, pileup_fp);
+ break;
+ case MPLP_PRINT_QPOS:
+ fprintf(pileup_fp, "%d", p->qpos + 1);
+ break;
case MPLP_PRINT_QNAME:
fputs(bam_get_qname(p->b), pileup_fp);
break;
fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1);
break;
case MPLP_PRINT_MAPQ:
- c = p->b->core.qual + 33;
- if (c > 126) c = 126;
- putc(c, pileup_fp);
+ fprintf(pileup_fp, "%d", p->b->core.qual);
break;
case MPLP_PRINT_RNEXT:
if (p->b->core.mtid >= 0)
}
}
+ if (ret < 0) {
+ print_error("mpileup", "error reading from input file");
+ ret = EXIT_FAILURE;
+ goto fail;
+ }
+
if (conf->all && !(conf->flag & MPLP_BCF)) {
// Handle terminating region
if (last_tid < 0 && conf->reg && conf->all > 1) {
else
{
fprintf(samtools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- exit(EXIT_FAILURE);
+ samtools_exit(EXIT_FAILURE);
}
free(tags[i]);
}
fprintf(fp,
" -r, --region REG region in which pileup is generated\n"
" -R, --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
" -x, --ignore-overlaps disable read-pair overlap detection\n"
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
- case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
+ case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
" -i, --in-place Modify the CRAM file directly, if possible.\n"
" (Defaults to outputting to samtools_stdout.)\n"
" -c, --command CMD Pass the header in SAM format to external program CMD.\n");
- exit(ret);
+ samtools_exit(ret);
}
static sam_hdr_t* external_reheader(samFile* in, const char* external) {
return h;
}
-int main_reheader(int argc, char *argv[])
+int samtools_main_reheader(int argc, char *argv[])
{
int inplace = 0, r, no_pg = 0, c, skip_header = 0;
sam_hdr_t *h;
p->discarded = 0;
p->endpos = endpos; p->score = score;
if (p->b == 0) p->b = bam_init1();
- if (!p->b) { perror(NULL); exit(EXIT_FAILURE); }
- if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); }
+ if (!p->b) { perror(NULL); samtools_exit(EXIT_FAILURE); }
+ if (bam_copy1(p->b, b) == NULL) { perror(NULL); samtools_exit(EXIT_FAILURE); }
return p;
}
} else { // replace
p->score = score; p->endpos = endpos;
if (bam_copy1(p->b, b) == NULL) {
- perror(NULL); exit(EXIT_FAILURE);
+ perror(NULL); samtools_exit(EXIT_FAILURE);
}
}
} // otherwise, discard the alignment
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2019 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
#include <time.h>
+#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>
#include <pthread.h>
+#include <inttypes.h>
#include "htslib/ksort.h"
#include "htslib/hts_os.h"
#include "htslib/khash.h"
#include "htslib/hts_endian.h"
#include "sam_opts.h"
#include "samtools.h"
+#include "bedidx.h"
// Struct which contains the a record, and the pointer to the sort tag (if any) or
static int g_is_by_qname = 0;
static int g_is_by_tag = 0;
+static int g_is_by_minhash = 0;
static char g_sort_tag[2] = {0,0};
static int strnum_cmp(const char *_a, const char *_b)
} heap1_t;
static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
// Function to compare reads in the heap and determine which one is < the other
+// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
+// is strictly 0 or 1 only.
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
if (!a.entry.bam_record)
int t;
t = bam1_cmp_by_tag(a.entry, b.entry);
if (t != 0) return t > 0;
+ } else if (g_is_by_minhash) {
+ int t = bam1_cmp_by_minhash(a.entry, b.entry);
+ if (t != 0) return t > 0;
} else if (g_is_by_qname) {
int t, fa, fb;
t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
id_len = id_end - idp;
if (id_len < transformed_id.l) {
- if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len))
+ if (ks_resize(&new_hdr_line, new_hdr_line.l
+ + transformed_id.l - id_len + 1/*nul*/))
goto fail;
}
if (id_len != transformed_id.l) {
// Get translated header lines and fill in map for @PG records
pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
tbl->pg_trans, NULL);
+ if (!pg_list) goto fail;
// Fix-up PG: tags in the new @RG records and add to output
if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
#define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only)
+
+/*
+ * Makes a copy of a region list, duplicating each entry's intervals array.
+ *
+ * The "reg" pointers are copied as-is, so they are shared with the source
+ * list rather than duplicated.
+ *
+ * @param rl  Region list to copy (may be NULL)
+ * @param rn  Number of entries in rl
+ *
+ * @return a newly allocated list of rn entries, to be released with
+ *         hts_reglist_free(); NULL if rl is NULL or on allocation failure.
+ */
+static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) {
+    if (!rl)
+        return NULL;
+
+    hts_reglist_t *new_rl = calloc(rn, sizeof(hts_reglist_t));
+    if (!new_rl)
+        return NULL;
+
+    int i;
+    for (i=0; i < rn; i++) {
+        size_t n_intv = rl[i].count;
+
+        new_rl[i].tid = rl[i].tid;
+        new_rl[i].count = rl[i].count;
+        new_rl[i].min_beg = rl[i].min_beg;
+        new_rl[i].max_end = rl[i].max_end;
+
+        new_rl[i].reg = rl[i].reg;
+
+        // Always request at least one element: malloc(0) is allowed to
+        // return NULL, which must not be mistaken for running out of memory.
+        new_rl[i].intervals = malloc((n_intv ? n_intv : 1) * sizeof(hts_pair_pos_t));
+        if (!new_rl[i].intervals) {
+            // Entries 0..i-1 own an intervals array; free those and the list
+            hts_reglist_free(new_rl, i);
+            return NULL;
+        }
+        // Skip the copy for an empty entry, whose source array may be NULL
+        if (n_intv)
+            memcpy(new_rl[i].intervals, rl[i].intervals, n_intv * sizeof(hts_pair_pos_t));
+    }
+
+    return new_rl;
+}
+
/*
* How merging is handled
*
- * If a hheader is defined use we will use that as our output header
+ * If a header is defined we will use that as our output header
* otherwise we use the first header from the first input file.
*
* Now go through each file and create a translation table for that file for:
*/
int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
const char *headers, int n, char * const *fn, char * const *fn_idx,
- int flag, const char *reg, int n_threads, const char *cmd,
- const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index,
- char *arg_list, int no_pg)
+ const char *fn_bed, int flag, const char *reg, int n_threads,
+ const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
+ int write_index, char *arg_list, int no_pg)
{
samFile *fpout, **fp = NULL;
heap1_t *heap = NULL;
trans_tbl_t *translation_tbl = NULL;
int *rtrans = NULL;
char *out_idx_fn = NULL;
+ void *hreg = NULL;
+ hts_reglist_t *lreg = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
}
if (hin) {
- // Popluate merged_hdr from the pre-prepared header
+ // Populate merged_hdr from the pre-prepared header
trans_tbl_t dummy;
int res;
res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
RG[i]))
return -1; // FIXME: memory leak
- // TODO sam_itr_next() doesn't yet work for SAM files,
- // so for those keep the headers around for use with sam_read1()
- if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
- else { sam_hdr_destroy(hin); hdr[i] = NULL; }
+ hdr[i] = hin;
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
if (!hout) return -1; // FIXME: memory leak
// If we're only merging a specified region move our iters to start at that point
- if (reg) {
- int tid;
- hts_pos_t beg, end;
+ int tid, nreg;
+ hts_pos_t beg, end;
+ if (fn_bed) {
+ hreg = bed_read(fn_bed);
+ if (!hreg) {
+ fprintf(stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed);
+ goto fail;
+ }
+ bed_unify(hreg);
+ lreg = bed_reglist(hreg, ALL, &nreg);
+ if (!lreg || !nreg) {
+ fprintf(stderr, "[%s] Null or empty region list\n", __func__);
+ goto fail;
+ }
+ } else if (reg) {
rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl);
if (!rtrans) goto mem_fail;
fprintf(stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg);
goto fail;
}
+
+ }
+
+ if (reg || fn_bed) {
+ hts_idx_t *reg_idx = NULL;
for (i = 0; i < n; ++i) {
- hts_idx_t *idx = NULL;
- // If index filename has not been specfied, look in BAM folder
+
+ // If index filename has not been specified, look in the BAM folder
if (fn_idx != NULL) {
- idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
+ reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
} else {
- idx = sam_index_load(fp[i], fn[i]);
+ reg_idx = sam_index_load(fp[i], fn[i]);
}
- // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
- int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
- if (idx == NULL) {
- fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+ if (reg_idx == NULL) {
+ fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
__func__, fn[i]);
+ free(rtrans);
+ rtrans = NULL;
goto fail;
}
- if (mapped_tid != INT32_MIN) {
- iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
+
+ int mapped_tid = INT32_MIN;
+ if (fn_bed) {
+ hts_reglist_t *rl = duplicate_reglist(lreg, nreg);
+ iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg);
} else {
- iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
+ // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
+ mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
+ if (mapped_tid != INT32_MIN) {
+ iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end);
+ } else {
+ iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0);
+ }
}
- hts_idx_destroy(idx);
+
if (iter[i] == NULL) {
- if (mapped_tid != INT32_MIN) {
- fprintf(stderr,
- "[%s] failed to get iterator over "
- "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
- __func__, fn[i], mapped_tid, beg, end);
+ if (fn_bed) {
+ fprintf(stderr, "[%s] failed to get multi-region iterator "
+ "{%s, %s}\n", __func__, fn[i], fn_bed);
} else {
- fprintf(stderr,
- "[%s] failed to get iterator over "
- "{%s, HTS_IDX_NONE, 0, 0}\n",
- __func__, fn[i]);
+ if (mapped_tid != INT32_MIN) {
+ fprintf(stderr,
+ "[%s] failed to get iterator over "
+ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
+ __func__, fn[i], mapped_tid, beg, end);
+ } else {
+ fprintf(stderr,
+ "[%s] failed to get iterator over "
+ "{%s, HTS_IDX_NONE, 0, 0}\n",
+ __func__, fn[i]);
+ }
}
+ hts_idx_destroy(reg_idx);
+ free(rtrans);
+ rtrans = NULL;
goto fail;
}
+
+ hts_idx_destroy(reg_idx);
}
+
free(rtrans);
rtrans = NULL;
- } else {
- for (i = 0; i < n; ++i) {
- if (hdr[i] == NULL) {
- iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) {
- fprintf(stderr, "[%s] failed to get iterator\n", __func__);
- goto fail;
- }
- }
- else iter[i] = NULL;
- }
}
// Load the first read from each file into the heap
sam_hdr_destroy(hin);
sam_hdr_destroy(hout);
free_merged_header(merged_hdr);
+ hts_reglist_free(lreg, nreg);
+ bed_destroy(hreg);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
print_error(cmd, "error closing output file");
free(RG);
free(translation_tbl);
free(hdr);
+ hts_reglist_free(lreg, nreg);
+ bed_destroy(hreg);
free(iter);
free(heap);
free(fp);
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
}
static void merge_usage(FILE *to)
{
fprintf(to,
-"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"Usage: samtools merge [options] -o <out.bam> [options] <in1.bam> ... <inN.bam>\n"
+" or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
"\n"
"Options:\n"
" -n Input files are sorted by read name\n"
" -r Attach RG tag (inferred from file names)\n"
" -u Uncompressed BAM output\n"
" -f Overwrite the output BAM if exist\n"
+" -o FILE Specify output file via option instead of <out.bam> argument\n"
" -1 Compress level 1\n"
" -l INT Compression level, from 0 to 9 [-1]\n"
" -R STR Merge file in the specified region STR [all]\n"
" -s VALUE Override random seed\n"
" -b FILE List of input BAM filenames, one per line [null]\n"
" -X Use customized index files\n"
+" -L FILE Specify a BED file for multiple region filtering [null]\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(to, "-.O..@..");
}
{
int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
char *fn_headers = NULL, *reg = NULL, mode[12];
- char *sort_tag = NULL, *arg_list = NULL;
+ char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
long random_seed = (long)time(NULL);
char** fn = NULL;
- char** fn_idx = NULL;
+ char** fn_idx = NULL, *fn_bed = NULL;
int fn_size = 0, no_pg = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
return 0;
}
- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
case 'h': fn_headers = optarg; break;
case 'n': is_by_qname = 1; break;
+ case 'o': fnout = optarg; break;
case 't': sort_tag = optarg; break;
case '1': flag |= MERGE_LEVEL1; level = 1; break;
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
case 'X': has_index_file = 1; break; // -X flag for index filename
+ case 'L': fn_bed = optarg; break;
case 'b': {
// load the list of files to read
if (has_index_file) {
case '?': merge_usage(stderr); return 1;
}
}
- if ( argc - optind < 1 ) {
+
+ if (fnout == NULL && argc - optind >= 1) {
+ fnout = argv[optind];
+ optind++;
+ }
+ if (fnout == NULL) {
print_error("merge", "You must at least specify the output file");
merge_usage(stderr);
return 1;
return 1;
}
- srand48(random_seed);
- if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
- FILE *fp = fopen(argv[optind], "rb");
- if (fp != NULL) {
- fclose(fp);
- fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
- return 1;
+ hts_srand48(random_seed);
+ if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) {
+ struct stat sbuf;
+ if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) {
+ fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout);
+ ret = 1;
+ goto end;
}
}
int nargcfiles = 0;
if (has_index_file) { // Calculate # of input BAM files
- if ((argc - optind - 1) % 2 != 0) {
+ if ((argc - optind) % 2 != 0) {
fprintf(stderr, "Odd number of filenames detected! Each BAM file should have an index file\n");
- return 1;
+ ret = 1;
+ goto end;
}
- nargcfiles = (argc - optind - 1) / 2;
+ nargcfiles = (argc - optind) / 2;
} else {
- nargcfiles = argc - optind - 1;
+ nargcfiles = argc - optind;
}
if (nargcfiles > 0) {
// Add argc files to end of array
fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*));
if (fn == NULL) { ret = 1; goto end; }
- memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
+ memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*));
if(has_index_file) {
fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*));
if (fn_idx == NULL) { ret = 1; goto end; }
- memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*));
+ memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*));
}
}
if (fn_size+nargcfiles < 1) {
print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(stderr);
- free(fn_idx);
- return 1;
+ ret = 1;
+ goto end;
+ }
+
+ if (reg && fn_bed) {
+ print_error("merge", "You must specify either a BED file or a region");
+ ret = 1;
+ goto end;
}
strcpy(mode, "wb");
- sam_open_mode(mode+1, argv[optind], NULL);
+ sam_open_mode(mode+1, fnout, NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads,
+ if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+ fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
"merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
ret = 1;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
+ if (g_is_by_minhash && b->core.tid == -1) {
+ // Remove the cached minhash value
+ b->core.pos = -1;
+ b->core.mpos = -1;
+ b->core.isize = 0;
+ }
if (sam_write1(fpout, hout, b) < 0) {
print_error_errno(cmd, "failed writing to \"%s\"", out);
goto fail;
}
}
+// Orders two records by their cached minimiser hash.
+//
+// The 64-bit key is rebuilt from core.pos (upper 32 bits) and core.mpos
+// (lower 32 bits); ties on the key are broken using core.isize.  Keeping the
+// key in these fields permits it to survive being written to a temporary
+// file and read back.
+//
+// Records with a reference (tid != -1) on either side, and records that tie
+// on both key and isize, are ordered by bam1_cmp_core() instead.
+//
+// Returns <0, 0 or >0.  A missing record in "a" yields 1; otherwise a
+// missing record in "b" yields 0.
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
+{
+    const bam1_t *rec_a = a.bam_record;
+    const bam1_t *rec_b = b.bam_record;
+
+    if (!rec_a) return 1;
+    if (!rec_b) return 0;
+
+    if (rec_a->core.tid == -1 && rec_b->core.tid == -1) {
+        const uint64_t key_a =
+            (((uint64_t)rec_a->core.pos)<<32)|(uint32_t)rec_a->core.mpos;
+        const uint64_t key_b =
+            (((uint64_t)rec_b->core.pos)<<32)|(uint32_t)rec_b->core.mpos;
+
+        if (key_a != key_b)                         // by hash
+            return key_a < key_b ? -1 : 1;
+
+        if (rec_a->core.isize != rec_b->core.isize) // by hash location in seq
+            return rec_a->core.isize < rec_b->core.isize ? -1 : 1;
+    }
+
+    return bam1_cmp_core(a,b);
+}
+
// Function to compare reads and determine which one is < the other
// Handle sort-by-pos, sort-by-name, or sort-by-tag
static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
if (g_is_by_tag) {
return bam1_cmp_by_tag(a, b) < 0;
+ } else if (g_is_by_minhash) {
+ return bam1_cmp_by_minhash(a, b) < 0;
} else {
return bam1_cmp_core(a,b) < 0;
}
// -1 for failure
static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf,
const sam_hdr_t *h, int n_threads, const htsFormat *fmt,
- char *arg_list, int no_pg, int write_index)
+ int clear_minhash, char *arg_list, int no_pg, int write_index)
{
size_t i;
samFile* fp;
fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return -1;
- if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools",
- "VN", samtools_version(),
+ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
NULL)) {
goto fail;
}
- if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail;
+ if (sam_hdr_write(fp, h) != 0) goto fail;
- if (write_index) {
+ if (write_index)
if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail;
- }
if (n_threads > 1) hts_set_threads(fp, n_threads);
for (i = 0; i < l; ++i) {
- if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail;
+ bam1_t *b = buf[i].bam_record;
+ if (clear_minhash && b->core.tid == -1) {
+ // Remove the cached minhash value
+ b->core.pos = -1;
+ b->core.mpos = -1;
+ b->core.isize = 0;
+ }
+ if (sam_write1(fp, h, b) < 0) goto fail;
}
if (write_index) {
return ret;
}
+/*
+ * Computes the minhash of a sequence using both forward and reverse strands.
+ *
+ * This is used as a sort key for unmapped data, to collate like sequences
+ * together and to improve compression ratio.
+ *
+ * Every k-mer of the read is hashed twice: as read (forward) and as its
+ * reverse complement.  Each hash is XORed with a fixed constant before
+ * comparison, and the smallest value seen over both strands is returned.
+ *
+ * @param b     Alignment record whose sequence is hashed
+ * @param kmer  k-mer length in bases, at 2 bits per base (the shifts below
+ *              need 1 <= kmer <= 32)
+ * @param pos   If non-NULL, filled out with the location of the winning
+ *              hash key in the sequence (0 if the read is shorter than kmer)
+ * @param rev   If non-NULL, set to 1 when the minimum came from the reverse
+ *              strand hash and to 0 otherwise
+ *
+ * @return the minimum hash, or UINT64_MAX if the read has fewer than kmer
+ *         bases.
+ */
+static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
+    uint64_t hashr = 0, minhashr = UINT64_MAX;
+    int minhashpf = 0, minhashpr = 0, i;
+    // Build the mask in 64 bits: a "long" constant is only 32 bits wide on
+    // some platforms, and shifting by the full width of the type (kmer of
+    // 32) is undefined, so that case is handled explicitly.
+    uint64_t mask = kmer < 32 ? (((uint64_t)1)<<(2*kmer))-1 : UINT64_MAX;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+    // Anything other than A, C, G or T is given the value X.
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X,
+    };
+    // Complemented bases enter the reverse hash at the top of the k-mer
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Punt homopolymers somewhere central in the hash space
+#define XOR (0xdead7878beef7878 & mask)
+
+    // Initialise hash keys
+    for (i = 0; i < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        hashf = (hashf<<2) | L[base];
+        hashr = (hashr>>2) | R[base];
+    }
+
+    // Loop to find minimum
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+
+        hashf = ((hashf<<2) | L[base]) & mask;
+        hashr = (hashr>>2) | R[base];
+
+        if (minhashf > (hashf^XOR))
+            minhashf = (hashf^XOR), minhashpf = i;
+        if (minhashr > (hashr^XOR))
+            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+
+    }
+
+    // Ties go to the forward strand
+    if (minhashf <= minhashr) {
+        if (rev) *rev = 0;
+        if (pos) *pos = minhashpf;
+        return minhashf;
+    } else {
+        if (rev) *rev = 1;
+        if (pos) *pos = minhashpr;
+        return minhashr;
+    }
+}
+
+//--- Start of candidates to punt to htslib
+/*!
+ * @abstract
+ * Decodes the packed sequence of a bam record (in its current alignment
+ * orientation) into text, one character per base, and nul terminates it.
+ *
+ * @param b    The bam structure to read the sequence from
+ * @param buf  Output buffer; must hold at least b->core.l_qseq+1 bytes
+ */
+static void bam_to_seq(bam1_t *b, char *buf) {
+    const uint8_t *packed = bam_get_seq(b);
+    const int n_bases = b->core.l_qseq;
+    char *out = buf;
+    int idx = 0;
+
+    while (idx < n_bases) {
+        *out++ = seq_nt16_str[bam_seqi(packed, idx)];
+        idx++;
+    }
+    *out = 0;
+}
+
+/*!
+ * @abstract
+ * Packs a text sequence of b->core.l_qseq characters into the sequence
+ * field of a BAM record, overwriting what was there.
+ *
+ * The record is not resized: if the new sequence has a different length the
+ * caller must first make room for it by updating the bam1_t struct.
+ *
+ * @param b    The bam structure to write into
+ * @param buf  Source text; must hold at least b->core.l_qseq bytes
+ */
+static void seq_to_bam(bam1_t *b, char *buf) {
+    uint8_t *packed = bam_get_seq(b);
+    const int n_bases = b->core.l_qseq;
+    int idx;
+
+    for (idx = 0; idx < n_bases; ++idx) {
+        const unsigned char base = (unsigned char)buf[idx];
+        bam_set_seqi(packed, idx, seq_nt16_table[base]);
+    }
+}
+
+/*!
+ * @abstract  Reverse complements a BAM record.
+ *
+ * The sequence is reversed and complemented, the quality values are
+ * reversed, and flag bit 0x10 (reverse strand in the SAM flag field) is
+ * toggled.
+ *
+ * It's possible to do this inline, but complex due to the 4-bit sequence
+ * encoding.  For now I take the dumb approach: decode to text, reverse
+ * complement the text, and encode it again.
+ *
+ * @param b  Pointer to a BAM alignment
+ *
+ * @return 0 on success, -1 on failure (ENOMEM)
+ */
+static int reverse_complement(bam1_t *b) {
+    // Complement of each character.  Letters that are not nucleotide codes
+    // map to themselves and '=' is kept as '='; all other bytes outside the
+    // 0x40-0x7f range become 'N'.
+    static const char comp[256] = {
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20
+        //                                                     =
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','=','N','N',//30
+
+        //@   A   B   C     D   E   F   G     H   I   J   K     L   M   N   O
+        '@','T','V','G', 'H','E','F','C', 'D','I','J','M', 'L','K','N','O',//40
+        //P   Q   R   S     T   U   V   W     X   Y   Z   [     \   ]   ^   _
+        'P','Q','Y','S', 'A','A','B','W', 'X','R','Z','[','\\',']','^','_',//50
+        //`   a   b   c     d   e   f   g     h   i   j   k     l   m   n   o
+        '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60
+        //p   q   r   s     t   u   v   w     x   y   z   {     |   }   ~ DEL
+        'p','q','y','s', 'a','a','b','w', 'x','r','z','{', '|','}','~',127,//70
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//E0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0
+    };
+    char seq_[10000], *seq = seq_;
+    uint8_t *qual = bam_get_qual(b);
+    int i, j;
+
+    // The stack buffer holds the sequence plus its nul terminator; anything
+    // longer than that needs a heap allocation.
+    if (b->core.l_qseq >= 10000)
+        if (!(seq = malloc(b->core.l_qseq+1)))
+            return -1;
+
+    bam_to_seq(b, seq);
+
+    // Swap from both ends towards the middle, complementing bases as we go
+    for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) {
+        unsigned char tmp = seq[i];
+        seq[i] = comp[(unsigned char)seq[j]];
+        seq[j] = comp[tmp];
+        tmp = qual[i];
+        qual[i] = qual[j];
+        qual[j] = tmp;
+    }
+    // Odd length: the middle base stays in place but is still complemented
+    if (i == j)
+        seq[i] = comp[(unsigned char)seq[i]];
+
+    seq_to_bam(b, seq);
+
+    if (seq != seq_)
+        free(seq);
+
+    b->core.flag ^= 0x10;
+
+    return 0;
+}
+//--- End of candidates to punt to htslib
+
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
w->error = 0;
- if (!g_is_by_qname && !g_is_by_tag) {
+ if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
w->error = errno;
return NULL;
}
} else {
+ if (g_is_by_minhash) {
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ bam1_t *b = w->buf[i].bam_record;
+ if (b->core.tid != -1)
+ continue;
+
+ int pos = 0, rev = 0;
+ uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
+ if (rev)
+ reverse_complement(b);
+
+ // Store 64-bit hash in unmapped pos and mpos fields.
+ // The position of hash is in isize, which we use for
+ // resolving ties when sorting by hash key.
+ // These are unused for completely unmapped data and
+ // will be reset during final output.
+ b->core.pos = mh>>31;
+ b->core.mpos = mh&0x7fffffff;
+ b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ }
+ }
ks_mergesort(sort, w->buf_len, w->buf, 0);
}
return 0;
}
- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0)
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
w->error = errno;
} else {
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0)
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
w->error = errno;
}
return n_files + n_threads;
}
+
/*!
@abstract Sort an unsorted BAM file based on the chromosome order
and the leftmost position of an alignment
*/
int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
const char *fnout, const char *modeout,
- size_t _max_mem, int n_threads,
+ size_t _max_mem, int by_minimiser, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
if (n_threads < 2) n_threads = 1;
g_is_by_qname = is_by_qname;
+ g_is_by_minhash = by_minimiser;
if (sort_by_tag) {
g_is_by_tag = 1;
g_sort_tag[0] = sort_by_tag[0];
else
new_so = "coordinate";
- if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
- && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
- ) {
- print_error("sort", "failed to change sort order header to '%s'\n", new_so);
- goto err;
+ if (by_minimiser) {
+ const char *new_ss = "coordinate:minhash";
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "SS", new_ss, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n",
+ new_so, new_ss);
+ goto err;
+ }
+ } else {
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ goto err;
+ }
}
if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
// write the final output
if (n_files == 0 && num_in_mem < 2) { // a single block
- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) {
+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
+ g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
goto err;
}
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
"Usage: samtools sort [options...] [in.bam]\n"
"Options:\n"
" -l INT Set compression level, from 0 (uncompressed) to 9 (best)\n"
+" -u Output uncompressed data (equivalent to -l 0)\n"
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
-" -n Sort by read name\n"
+" -M Use minimiser for clustering unaligned/unplaced reads\n"
+" -K INT Kmer size to use for minimiser [20]\n"
+" -n Sort by read name (not compatible with samtools index command)\n"
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n");
- sam_global_opt_help(fp, "-.O..@-.");
+ sam_global_opt_help(fp, "-.O..@..");
}
static void complain_about_memory_setting(size_t max_mem) {
{
size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
+ int by_minimiser = 0, minimiser_kmer = 20;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
case 'n': is_by_qname = 1; break;
}
case 'T': kputs(optarg, &tmpprefix); break;
case 'l': level = atoi(optarg); break;
- case 1: no_pg = 1; break;
+ case 'u': level = 0; break;
+ case 1: no_pg = 1; break;
+ case 'M': by_minimiser = 1; break;
+ case 'K':
+ minimiser_kmer = atoi(optarg);
+ if (minimiser_kmer < 1)
+ minimiser_kmer = 1;
+ else if (minimiser_kmer > 31)
+ minimiser_kmer = 31;
+ break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
}
ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
+ tmpprefix.s, fnout, modeout, max_mem,
+ by_minimiser * minimiser_kmer, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
if (ret >= 0)
ret = EXIT_SUCCESS;
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2019 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
#include <time.h>
+#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>
#include <pthread.h>
+#include <inttypes.h>
#include "htslib/ksort.h"
#include "htslib/hts_os.h"
#include "htslib/khash.h"
#include "htslib/hts_endian.h"
#include "sam_opts.h"
#include "samtools.h"
+#include "bedidx.h"
// Struct which contains the a record, and the pointer to the sort tag (if any) or
static int g_is_by_qname = 0;
static int g_is_by_tag = 0;
+static int g_is_by_minhash = 0;
static char g_sort_tag[2] = {0,0};
static int strnum_cmp(const char *_a, const char *_b)
} heap1_t;
static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
// Function to compare reads in the heap and determine which one is < the other
+// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
+// is strictly 0 or 1 only.
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
if (!a.entry.bam_record)
int t;
t = bam1_cmp_by_tag(a.entry, b.entry);
if (t != 0) return t > 0;
+ } else if (g_is_by_minhash) {
+ int t = bam1_cmp_by_minhash(a.entry, b.entry);
+ if (t != 0) return t > 0;
} else if (g_is_by_qname) {
int t, fa, fb;
t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
id_len = id_end - idp;
if (id_len < transformed_id.l) {
- if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len))
+ if (ks_resize(&new_hdr_line, new_hdr_line.l
+ + transformed_id.l - id_len + 1/*nul*/))
goto fail;
}
if (id_len != transformed_id.l) {
// Get translated header lines and fill in map for @PG records
pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
tbl->pg_trans, NULL);
+ if (!pg_list) goto fail;
// Fix-up PG: tags in the new @RG records and add to output
if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
#define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
#define MERGE_FIRST_CO 64 // Use only first file's @CO headers (sort cmd only)
+
+static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { // Copy an rn-entry region list
+ if (!rl)
+ return NULL;
+ /* Returns a newly allocated array of rn entries, or NULL when rl is
+  * NULL or an allocation fails.  Each entry receives its own copy of
+  * the intervals buffer; the reg pointer is copied as-is, so it stays
+  * shared with the source list.
+  */
+ hts_reglist_t *new_rl = calloc(rn, sizeof(hts_reglist_t));
+ if (!new_rl)
+ return NULL;
+ // Entries are zero-initialised by calloc, then filled one at a time.
+ int i;
+ for (i=0; i < rn; i++) {
+ new_rl[i].tid = rl[i].tid;
+ new_rl[i].count = rl[i].count;
+ new_rl[i].min_beg = rl[i].min_beg;
+ new_rl[i].max_end = rl[i].max_end;
+ // reg: pointer copy only.  intervals: fresh buffer of count pairs.
+ new_rl[i].reg = rl[i].reg;
+ new_rl[i].intervals = malloc(new_rl[i].count * sizeof(hts_pair_pos_t));
+ if (!new_rl[i].intervals) {
+ hts_reglist_free(new_rl, i); // hand back the i entries completed so far
+ return NULL;
+ }
+ memcpy(new_rl[i].intervals, rl[i].intervals, new_rl[i].count * sizeof(hts_pair_pos_t));
+ }
+
+ return new_rl;
+}
+
/*
* How merging is handled
*
- * If a hheader is defined use we will use that as our output header
+ * If a header is defined we will use that as our output header
* otherwise we use the first header from the first input file.
*
* Now go through each file and create a translation table for that file for:
*/
int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
const char *headers, int n, char * const *fn, char * const *fn_idx,
- int flag, const char *reg, int n_threads, const char *cmd,
- const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index,
- char *arg_list, int no_pg)
+ const char *fn_bed, int flag, const char *reg, int n_threads,
+ const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
+ int write_index, char *arg_list, int no_pg)
{
samFile *fpout, **fp = NULL;
heap1_t *heap = NULL;
trans_tbl_t *translation_tbl = NULL;
int *rtrans = NULL;
char *out_idx_fn = NULL;
+ void *hreg = NULL;
+ hts_reglist_t *lreg = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
}
if (hin) {
- // Popluate merged_hdr from the pre-prepared header
+ // Populate merged_hdr from the pre-prepared header
trans_tbl_t dummy;
int res;
res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
RG[i]))
return -1; // FIXME: memory leak
- // TODO sam_itr_next() doesn't yet work for SAM files,
- // so for those keep the headers around for use with sam_read1()
- if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
- else { sam_hdr_destroy(hin); hdr[i] = NULL; }
+ hdr[i] = hin;
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
if (!hout) return -1; // FIXME: memory leak
// If we're only merging a specified region move our iters to start at that point
- if (reg) {
- int tid;
- hts_pos_t beg, end;
+ int tid, nreg;
+ hts_pos_t beg, end;
+ if (fn_bed) {
+ hreg = bed_read(fn_bed);
+ if (!hreg) {
+ fprintf(samtools_stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed);
+ goto fail;
+ }
+ bed_unify(hreg);
+ lreg = bed_reglist(hreg, ALL, &nreg);
+ if (!lreg || !nreg) {
+ fprintf(samtools_stderr, "[%s] Null or empty region list\n", __func__);
+ goto fail;
+ }
+ } else if (reg) {
rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl);
if (!rtrans) goto mem_fail;
fprintf(samtools_stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg);
goto fail;
}
+
+ }
+
+ if (reg || fn_bed) {
+ hts_idx_t *reg_idx = NULL;
for (i = 0; i < n; ++i) {
- hts_idx_t *idx = NULL;
- // If index filename has not been specfied, look in BAM folder
+
+ // If index filename has not been specified, look in the BAM folder
if (fn_idx != NULL) {
- idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
+ reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
} else {
- idx = sam_index_load(fp[i], fn[i]);
+ reg_idx = sam_index_load(fp[i], fn[i]);
}
- // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
- int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
- if (idx == NULL) {
- fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+ if (reg_idx == NULL) {
+ fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
__func__, fn[i]);
+ free(rtrans);
+ rtrans = NULL;
goto fail;
}
- if (mapped_tid != INT32_MIN) {
- iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
+
+ int mapped_tid = INT32_MIN;
+ if (fn_bed) {
+ hts_reglist_t *rl = duplicate_reglist(lreg, nreg);
+ iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg);
} else {
- iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
+ // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
+ mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
+ if (mapped_tid != INT32_MIN) {
+ iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end);
+ } else {
+ iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0);
+ }
}
- hts_idx_destroy(idx);
+
if (iter[i] == NULL) {
- if (mapped_tid != INT32_MIN) {
- fprintf(samtools_stderr,
- "[%s] failed to get iterator over "
- "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
- __func__, fn[i], mapped_tid, beg, end);
+ if (fn_bed) {
+ fprintf(samtools_stderr, "[%s] failed to get multi-region iterator "
+ "{%s, %s}\n", __func__, fn[i], fn_bed);
} else {
- fprintf(samtools_stderr,
- "[%s] failed to get iterator over "
- "{%s, HTS_IDX_NONE, 0, 0}\n",
- __func__, fn[i]);
+ if (mapped_tid != INT32_MIN) {
+ fprintf(samtools_stderr,
+ "[%s] failed to get iterator over "
+ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
+ __func__, fn[i], mapped_tid, beg, end);
+ } else {
+ fprintf(samtools_stderr,
+ "[%s] failed to get iterator over "
+ "{%s, HTS_IDX_NONE, 0, 0}\n",
+ __func__, fn[i]);
+ }
}
+ hts_idx_destroy(reg_idx);
+ free(rtrans);
+ rtrans = NULL;
goto fail;
}
+
+ hts_idx_destroy(reg_idx);
}
+
free(rtrans);
rtrans = NULL;
- } else {
- for (i = 0; i < n; ++i) {
- if (hdr[i] == NULL) {
- iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
- if (iter[i] == NULL) {
- fprintf(samtools_stderr, "[%s] failed to get iterator\n", __func__);
- goto fail;
- }
- }
- else iter[i] = NULL;
- }
}
// Load the first read from each file into the heap
sam_hdr_destroy(hin);
sam_hdr_destroy(hout);
free_merged_header(merged_hdr);
+ hts_reglist_free(lreg, nreg);
+ bed_destroy(hreg);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
print_error(cmd, "error closing output file");
free(RG);
free(translation_tbl);
free(hdr);
+ hts_reglist_free(lreg, nreg);
+ bed_destroy(hreg);
free(iter);
free(heap);
free(fp);
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
}
static void merge_usage(FILE *to)
{
fprintf(to,
-"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"Usage: samtools merge [options] -o <out.bam> [options] <in1.bam> ... <inN.bam>\n"
+" or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
"\n"
"Options:\n"
" -n Input files are sorted by read name\n"
" -r Attach RG tag (inferred from file names)\n"
" -u Uncompressed BAM output\n"
" -f Overwrite the output BAM if exist\n"
+" -o FILE Specify output file via option instead of <out.bam> argument\n"
" -1 Compress level 1\n"
" -l INT Compression level, from 0 to 9 [-1]\n"
" -R STR Merge file in the specified region STR [all]\n"
" -s VALUE Override random seed\n"
" -b FILE List of input BAM filenames, one per line [null]\n"
" -X Use customized index files\n"
+" -L FILE Specify a BED file for multiple region filtering [null]\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(to, "-.O..@..");
}
{
int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
char *fn_headers = NULL, *reg = NULL, mode[12];
- char *sort_tag = NULL, *arg_list = NULL;
+ char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
long random_seed = (long)time(NULL);
char** fn = NULL;
- char** fn_idx = NULL;
+ char** fn_idx = NULL, *fn_bed = NULL;
int fn_size = 0, no_pg = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
return 0;
}
- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
case 'h': fn_headers = optarg; break;
case 'n': is_by_qname = 1; break;
+ case 'o': fnout = optarg; break;
case 't': sort_tag = optarg; break;
case '1': flag |= MERGE_LEVEL1; level = 1; break;
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
case 'X': has_index_file = 1; break; // -X flag for index filename
+ case 'L': fn_bed = optarg; break;
case 'b': {
// load the list of files to read
if (has_index_file) {
case '?': merge_usage(samtools_stderr); return 1;
}
}
- if ( argc - optind < 1 ) {
+
+ if (fnout == NULL && argc - optind >= 1) {
+ fnout = argv[optind];
+ optind++;
+ }
+ if (fnout == NULL) {
print_error("merge", "You must at least specify the output file");
merge_usage(samtools_stderr);
return 1;
return 1;
}
- srand48(random_seed);
- if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
- FILE *fp = fopen(argv[optind], "rb");
- if (fp != NULL) {
- fclose(fp);
- fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
- return 1;
+ hts_srand48(random_seed);
+ if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) {
+ struct stat sbuf;
+ if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) {
+ fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout);
+ ret = 1;
+ goto end;
}
}
int nargcfiles = 0;
if (has_index_file) { // Calculate # of input BAM files
- if ((argc - optind - 1) % 2 != 0) {
+ if ((argc - optind) % 2 != 0) {
fprintf(samtools_stderr, "Odd number of filenames detected! Each BAM file should have an index file\n");
- return 1;
+ ret = 1;
+ goto end;
}
- nargcfiles = (argc - optind - 1) / 2;
+ nargcfiles = (argc - optind) / 2;
} else {
- nargcfiles = argc - optind - 1;
+ nargcfiles = argc - optind;
}
if (nargcfiles > 0) {
// Add argc files to end of array
fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*));
if (fn == NULL) { ret = 1; goto end; }
- memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
+ memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*));
if(has_index_file) {
fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*));
if (fn_idx == NULL) { ret = 1; goto end; }
- memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*));
+ memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*));
}
}
if (fn_size+nargcfiles < 1) {
print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(samtools_stderr);
- free(fn_idx);
- return 1;
+ ret = 1;
+ goto end;
+ }
+
+ if (reg && fn_bed) {
+ print_error("merge", "You must specify either a BED file or a region");
+ ret = 1;
+ goto end;
}
strcpy(mode, "wb");
- sam_open_mode(mode+1, argv[optind], NULL);
+ sam_open_mode(mode+1, fnout, NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads,
+ if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+ fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
"merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
ret = 1;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
+ if (g_is_by_minhash && b->core.tid == -1) {
+ // Remove the cached minhash value
+ b->core.pos = -1;
+ b->core.mpos = -1;
+ b->core.isize = 0;
+ }
if (sam_write1(fpout, hout, b) < 0) {
print_error_errno(cmd, "failed writing to \"%s\"", out);
goto fail;
}
}
+// Compare two records by minimiser hash: -1/1 on a hash or isize difference, else bam1_cmp_core's result.
+// Only records with tid == -1 on both sides are compared by hash; any other pair goes to bam1_cmp_core.
+//
+// The sort key is rebuilt from the bam core.pos (high part) and core.mpos (low part) fields; core.isize breaks ties.
+// This permits it to survive writing to temporary file and coming back.
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
+{
+ const bam1_t *A = a.bam_record;
+ const bam1_t *B = b.bam_record;
+ // Missing records: a NULL A yields 1, a NULL B (with A present) yields 0.
+ if (!A) return 1;
+ if (!B) return 0;
+ // If either record has a reference id, use the ordinary core comparison.
+ if (A->core.tid != -1 || B->core.tid != -1)
+ return bam1_cmp_core(a,b);
+ // worker() stores pos = mh>>31 and mpos = mh&0x7fffffff; joining at bit 32 here gives a different number with the same ordering.
+ const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
+ const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+
+ if (m_a < m_b) // by hash
+ return -1;
+ else if (m_a > m_b)
+ return 1;
+ else if (A->core.isize < B->core.isize) // by hash location in seq
+ return -1;
+ else if (A->core.isize > B->core.isize)
+ return 1;
+ else
+ return bam1_cmp_core(a,b);
+}
+
// Function to compare reads and determine which one is < the other
// Handle sort-by-pos, sort-by-name, or sort-by-tag
static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
if (g_is_by_tag) {
return bam1_cmp_by_tag(a, b) < 0;
+ } else if (g_is_by_minhash) {
+ return bam1_cmp_by_minhash(a, b) < 0;
} else {
return bam1_cmp_core(a,b) < 0;
}
// -1 for failure
static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf,
const sam_hdr_t *h, int n_threads, const htsFormat *fmt,
- char *arg_list, int no_pg, int write_index)
+ int clear_minhash, char *arg_list, int no_pg, int write_index)
{
size_t i;
samFile* fp;
fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return -1;
- if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools",
- "VN", samtools_version(),
+ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
NULL)) {
goto fail;
}
- if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail;
+ if (sam_hdr_write(fp, h) != 0) goto fail;
- if (write_index) {
+ if (write_index)
if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail;
- }
if (n_threads > 1) hts_set_threads(fp, n_threads);
for (i = 0; i < l; ++i) {
- if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail;
+ bam1_t *b = buf[i].bam_record;
+ if (clear_minhash && b->core.tid == -1) {
+ // Remove the cached minhash value
+ b->core.pos = -1;
+ b->core.mpos = -1;
+ b->core.isize = 0;
+ }
+ if (sam_write1(fp, h, b) < 0) goto fail;
}
if (write_index) {
return ret;
}
+/*
+ * Computes the minhash of a sequence using both forward and reverse strands.
+ *
+ * This is used as a sort key for unmapped data, to collate like sequences
+ * together and to improve compression ratio.
+ *
+ * Every k-mer is packed at 2 bits per base (A,C,G,T = 0,1,2,3 on the
+ * forward strand, complemented on the reverse strand; any other base
+ * code is treated as 0), XORed with a fixed constant, and the smallest
+ * resulting value over both strands is kept.
+ *
+ * @param b     The bam record whose sequence is hashed
+ * @param kmer  K-mer length.  The mask below needs 2*kmer < 64; the sort
+ *              command clamps its -K option to 1..31.
+ * @param pos   If non-NULL, filled out with the location of the winning
+ *              hash key in the sequence (index of the k-mer's last base,
+ *              counted on the strand that won).
+ * @param rev   If non-NULL, set to 1 when the reverse strand produced the
+ *              minimum, otherwise 0.
+ *
+ * @return The minimum hash.  If the sequence is shorter than kmer the
+ *         search loop never runs and UINT64_MAX is returned with *pos = 0
+ *         and *rev = 0.
+ */
+static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
+    uint64_t hashr = 0, minhashr = UINT64_MAX;
+    int minhashpf = 0, minhashpr = 0, i;
+    // Build the mask in 64-bit arithmetic.  2*kmer can be as large as 62,
+    // which is an undefined shift for "1L" wherever long is 32 bits wide.
+    uint64_t mask = ((uint64_t)1 << (2*kmer)) - 1;
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X, 2,X,X,X, 3,X,X,X, X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X, 1,X,X,X, 0,X,X,X, X,X,X,X,
+    };
+    // The reverse-strand hash grows from the top: each complemented base
+    // enters at the most significant 2 bits of the k-mer.
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1);
+
+    // Punt homopolymers somewhere central in the hash space
+#define XOR (0xdead7878beef7878 & mask)
+
+    // Initialise hash keys
+    for (i = 0; i < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        hashf = (hashf<<2) | L[base];
+        hashr = (hashr>>2) | R[base];
+    }
+
+    // Loop to find minimum
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+
+        hashf = ((hashf<<2) | L[base]) & mask;
+        hashr = (hashr>>2) | R[base];
+
+        if (minhashf > (hashf^XOR))
+            minhashf = (hashf^XOR), minhashpf = i;
+        if (minhashr > (hashr^XOR))
+            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+
+    }
+
+    // Ties go to the forward strand, so no reverse-complement is requested.
+    if (minhashf <= minhashr) {
+        if (rev) *rev = 0;
+        if (pos) *pos = minhashpf;
+        return minhashf;
+    } else {
+        if (rev) *rev = 1;
+        if (pos) *pos = minhashpr;
+        return minhashr;
+    }
+}
+
+//--- Start of candidates to punt to htslib
+/*!
+ * @abstract
+ * Extracts the sequence (in current alignment orientation) from
+ * a bam record and places it in buf, which is nul terminated.
+ *
+ * Each base code read with bam_seqi() is turned into a character
+ * through the seq_nt16_str table; exactly b->core.l_qseq characters
+ * are written, followed by the terminating nul.
+ *
+ * @param b The bam structure
+ * @param buf A buffer at least b->core.l_qseq+1 bytes long
+ */
+static void bam_to_seq(bam1_t *b, char *buf) {
+ int i;
+ uint8_t *seq = bam_get_seq(b);
+ for (i = 0; i < b->core.l_qseq; i++)
+ buf[i] = seq_nt16_str[bam_seqi(seq, i)];
+ buf[i] = 0;
+}
+
+/*!
+ * @abstract
+ * Writes a new sequence, of length b->core.l_qseq, to a BAM record.
+ *
+ * If a sequence of a new length is required the caller must first make
+ * room for it by updating the bam1_t struct.
+ *
+ * Each character of buf is translated through seq_nt16_table and stored
+ * with bam_set_seqi(); only the sequence bytes of the record are written
+ * here, and buf does not need to be nul terminated.
+ *
+ * @param b The bam structure
+ * @param buf A buffer at least b->core.l_qseq bytes long
+ */
+static void seq_to_bam(bam1_t *b, char *buf) {
+ int i;
+ uint8_t *seq = bam_get_seq(b);
+ for (i = 0; i < b->core.l_qseq; i++)
+ bam_set_seqi(seq, i, seq_nt16_table[(unsigned char)buf[i]]);
+}
+
+/*!
+ * @abstract  Reverse complements a BAM record.
+ *
+ * It's possible to do this inline, but complex due to the 4-bit sequence
+ * encoding.  For now I take the dumb approach: expand the sequence to
+ * text, reverse and complement the text, then pack it back.
+ *
+ * The quality values are reversed in step with the bases and the 0x10
+ * (reverse strand) flag bit is toggled.
+ *
+ * @param b  Pointer to a BAM alignment
+ *
+ * @return   0 on success, -1 on failure (ENOMEM)
+ */
+static int reverse_complement(bam1_t *b) {
+    // Complement table indexed by character.  Letters with an IUPAC
+    // complement map to it (A<->T, C<->G, R<->Y, K<->M, B<->V, D<->H,
+    // U->A); every other printable character from '@' upwards maps to
+    // itself, and everything else maps to 'N'.
+    static const char comp[256] = {
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//30
+
+        //@   A*  B*  C*   D*  E   F   G*   H*  I   J   K*   L   M*  N   O
+        '@','T','V','G', 'H','E','F','C', 'D','I','J','M', 'L','K','N','O',//40
+        //P   Q   R*  S    T*  U*  V*  W    X   Y*  Z   (punctuation unchanged)
+        'P','Q','Y','S', 'A','A','B','W', 'X','R','Z','[', '\\',']','^','_',//50
+        //`   a*  b*  c*   d*  e   f   g*   h*  i   j   k*   l   m*  n   o
+        '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60
+        //p   q   r*  s    t*  u*  v*  w    x   y*  z   (punctuation unchanged)
+        'p','q','y','s', 'a','a','b','w', 'x','r','z','{', '|','}','~',127,//70
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//E0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0
+    };
+    char seq_[10000], *seq = seq_;
+    uint8_t *qual = bam_get_qual(b);
+    int i, j;
+
+    // The stack buffer holds up to 9999 bases plus the nul; longer reads
+    // get a heap buffer instead.
+    if (b->core.l_qseq >= 10000)
+        if (!(seq = malloc(b->core.l_qseq+1)))
+            return -1;
+
+    bam_to_seq(b, seq);
+
+    // Walk inwards from both ends, swapping and complementing the bases
+    // and swapping the matching quality values.
+    for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) {
+        unsigned char tmp = seq[i];
+        seq[i] = comp[(unsigned char)seq[j]];
+        seq[j] = comp[tmp];
+        tmp = qual[i];
+        qual[i] = qual[j];
+        qual[j] = tmp;
+    }
+    // Odd length: the middle base stays in place but is still complemented.
+    if (i ==j)
+        seq[i] = comp[(unsigned char)seq[i]];
+
+    seq_to_bam(b, seq);
+
+    if (seq != seq_)
+        free(seq);
+
+    b->core.flag ^= 0x10;
+
+    return 0;
+}
+//--- End of candidates to punt to htslib
+
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
w->error = 0;
- if (!g_is_by_qname && !g_is_by_tag) {
+ if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
w->error = errno;
return NULL;
}
} else {
+ if (g_is_by_minhash) {
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ bam1_t *b = w->buf[i].bam_record;
+ if (b->core.tid != -1)
+ continue;
+
+ int pos = 0, rev = 0;
+ uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
+ if (rev)
+ reverse_complement(b);
+
+ // Store 64-bit hash in unmapped pos and mpos fields.
+ // The position of hash is in isize, which we use for
+ // resolving ties when sorting by hash key.
+ // These are unused for completely unmapped data and
+ // will be reset during final output.
+ b->core.pos = mh>>31;
+ b->core.mpos = mh&0x7fffffff;
+ b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ }
+ }
ks_mergesort(sort, w->buf_len, w->buf, 0);
}
return 0;
}
- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0)
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
w->error = errno;
} else {
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0)
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
w->error = errno;
}
return n_files + n_threads;
}
+
/*!
@abstract Sort an unsorted BAM file based on the chromosome order
and the leftmost position of an alignment
*/
int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
const char *fnout, const char *modeout,
- size_t _max_mem, int n_threads,
+ size_t _max_mem, int by_minimiser, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
if (n_threads < 2) n_threads = 1;
g_is_by_qname = is_by_qname;
+ g_is_by_minhash = by_minimiser;
if (sort_by_tag) {
g_is_by_tag = 1;
g_sort_tag[0] = sort_by_tag[0];
else
new_so = "coordinate";
- if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
- && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
- ) {
- print_error("sort", "failed to change sort order header to '%s'\n", new_so);
- goto err;
+ if (by_minimiser) {
+ const char *new_ss = "coordinate:minhash";
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "SS", new_ss, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n",
+ new_so, new_ss);
+ goto err;
+ }
+ } else {
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ goto err;
+ }
}
if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
// write the final output
if (n_files == 0 && num_in_mem < 2) { // a single block
- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) {
+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
+ g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
goto err;
}
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
"Usage: samtools sort [options...] [in.bam]\n"
"Options:\n"
" -l INT Set compression level, from 0 (uncompressed) to 9 (best)\n"
+" -u Output uncompressed data (equivalent to -l 0)\n"
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
-" -n Sort by read name\n"
+" -M Use minimiser for clustering unaligned/unplaced reads\n"
+" -K INT Kmer size to use for minimiser [20]\n"
+" -n Sort by read name (not compatible with samtools index command)\n"
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n");
- sam_global_opt_help(fp, "-.O..@-.");
+ sam_global_opt_help(fp, "-.O..@..");
}
static void complain_about_memory_setting(size_t max_mem) {
{
size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
+ int by_minimiser = 0, minimiser_kmer = 20;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
case 'n': is_by_qname = 1; break;
}
case 'T': kputs(optarg, &tmpprefix); break;
case 'l': level = atoi(optarg); break;
- case 1: no_pg = 1; break;
+ case 'u': level = 0; break;
+ case 1: no_pg = 1; break;
+ case 'M': by_minimiser = 1; break;
+ case 'K':
+ minimiser_kmer = atoi(optarg);
+ if (minimiser_kmer < 1)
+ minimiser_kmer = 1;
+ else if (minimiser_kmer > 31)
+ minimiser_kmer = 31;
+ break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
}
ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
+ tmpprefix.s, fnout, modeout, max_mem,
+ by_minimiser * minimiser_kmer, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
if (ret >= 0)
ret = EXIT_SUCCESS;
/* bam_stat.c -- flagstat subcommand.
- Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
long long n_dup[2];
long long n_diffchr[2], n_diffhigh[2];
long long n_secondary[2], n_supp[2];
+ long long n_primary[2], n_pmapped[2], n_pdup[2];
} bam_flagstat_t;
-#define flagstat_loop(s, c) do { \
- int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \
- ++(s)->n_reads[w]; \
- if ((c)->flag & BAM_FSECONDARY ) { \
- ++(s)->n_secondary[w]; \
- } else if ((c)->flag & BAM_FSUPPLEMENTARY ) { \
- ++(s)->n_supp[w]; \
- } else if ((c)->flag & BAM_FPAIRED) { \
- ++(s)->n_pair_all[w]; \
- if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w]; \
- if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \
- if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \
- if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \
- if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
- ++(s)->n_pair_map[w]; \
- if ((c)->mtid != (c)->tid) { \
- ++(s)->n_diffchr[w]; \
- if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \
- } \
- } \
- } \
- if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \
- if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
- } while (0)
+inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+{ /* Update the flagstat counters in s for one record whose core fields are c.
+   * Every counter touched here is indexed by w: 0 when BAM_FQCFAIL is clear,
+   * 1 when it is set. Replaces the former flagstat_loop macro. */
+ int w = (c->flag & BAM_FQCFAIL)? 1 : 0;
+ ++s->n_reads[w];
+ if (c->flag & BAM_FSECONDARY ) {
+ ++s->n_secondary[w];
+ } else if (c->flag & BAM_FSUPPLEMENTARY ) {
+ ++s->n_supp[w];
+ } else { // neither secondary nor supplementary: counted as primary
+ ++s->n_primary[w];
+ // Pairing statistics are gathered only for primary records with BAM_FPAIRED set.
+ if (c->flag & BAM_FPAIRED) {
+ ++s->n_pair_all[w];
+ if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w];
+ if (c->flag & BAM_FREAD1) ++s->n_read1[w];
+ if (c->flag & BAM_FREAD2) ++s->n_read2[w];
+ if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w]; // this read mapped, mate unmapped
+ if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) { // both this read and its mate mapped
+ ++s->n_pair_map[w];
+ if (c->mtid != c->tid) { // mate reference id (mtid) differs from this record (tid)
+ ++s->n_diffchr[w];
+ if (c->qual >= 5) ++s->n_diffhigh[w]; // same case, restricted to qual of at least 5
+ }
+ }
+ }
+ // Mapped and duplicate tallies restricted to primary records.
+ if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w];
+ if (c->flag & BAM_FDUP) ++s->n_pdup[w];
+ }
+ if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; // counted for every record, primary or not
+ if (c->flag & BAM_FDUP) ++s->n_dup[w]; // counted for every record, primary or not
+}
bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
{
while ((ret = sam_read1(fp, h, b)) >= 0)
flagstat_loop(s, c);
bam_destroy1(b);
- if (ret != -1)
- fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+ if (ret != -1) {
+ free(s);
+ return NULL;
+ }
return s;
}
{
char b0[16], b1[16];
printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ printf("%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]);
printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+ printf("%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+ printf("%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
}
static void out_fmt_json(bam_flagstat_t *s) {
- char b0[16], b1[16];
+ char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16];
printf("{\n \"QC-passed reads\": { \n"
" \"total\": %lld, \n"
+ " \"primary\": %lld, \n"
" \"secondary\": %lld, \n"
" \"supplementary\": %lld, \n"
" \"duplicates\": %lld, \n"
+ " \"primary duplicates\": %lld, \n"
" \"mapped\": %lld, \n"
" \"mapped %%\": %s, \n"
+ " \"primary mapped\": %lld, \n"
+ " \"primary mapped %%\": %s, \n"
" \"paired in sequencing\": %lld, \n"
" \"read1\": %lld, \n"
" \"read2\": %lld, \n"
" },"
"\n \"QC-failed reads\": { \n"
" \"total\": %lld, \n"
+ " \"primary\": %lld, \n"
" \"secondary\": %lld, \n"
" \"supplementary\": %lld, \n"
" \"duplicates\": %lld, \n"
+ " \"primary duplicates\": %lld, \n"
" \"mapped\": %lld, \n"
" \"mapped %%\": %s, \n"
+ " \"primary mapped\": %lld, \n"
+ " \"primary mapped %%\": %s, \n"
" \"paired in sequencing\": %lld, \n"
" \"read1\": %lld, \n"
" \"read2\": %lld, \n"
" }\n"
"}\n",
s->n_reads[0],
+ s->n_primary[0],
s->n_secondary[0],
s->n_supp[0],
s->n_dup[0],
+ s->n_pdup[0],
s->n_mapped[0],
percent_json(b0, s->n_mapped[0], s->n_reads[0]),
+ s->n_pmapped[0],
+ percent_json(p0, s->n_pmapped[0], s->n_primary[0]),
s->n_pair_all[0],
s->n_read1[0],
s->n_read2[0],
s->n_pair_good[0],
- percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]),
+ percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]),
s->n_pair_map[0],
s->n_sgltn[0],
- percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]),
+ percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]),
s->n_diffchr[0],
s->n_diffhigh[0],
s->n_reads[1],
+ s->n_primary[1],
s->n_secondary[1],
s->n_supp[1],
s->n_dup[1],
+ s->n_pdup[1],
s->n_mapped[1],
percent_json(b1, s->n_mapped[1], s->n_reads[1]),
+ s->n_pmapped[1],
+ percent_json(p1, s->n_pmapped[1], s->n_primary[1]),
s->n_pair_all[1],
s->n_read1[1],
s->n_read2[1],
s->n_pair_good[1],
- percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]),
+ percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]),
s->n_pair_map[1],
s->n_sgltn[1],
- percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]),
+ percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]),
s->n_diffchr[1],
s->n_diffhigh[1]
);
static void out_fmt_tsv(bam_flagstat_t *s) {
char b0[16], b1[16];
printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ printf("%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]);
printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]);
printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]);
printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]);
+ printf("%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]);
printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+ printf("%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]);
+ printf("%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]);
printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]);
sam_hdr_t *header;
bam_flagstat_t *s;
const char *out_fmt = "default";
- int c;
+ int c, status = EXIT_SUCCESS;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
}
s = bam_flagstat_core(fp, header);
- output_fmt(s, out_fmt);
- free(s);
+ if (s) {
+ output_fmt(s, out_fmt);
+ free(s);
+ }
+ else {
+ print_error("flagstat", "error reading from \"%s\"", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
sam_hdr_destroy(header);
sam_close(fp);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
/* bam_stat.c -- flagstat subcommand.
- Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
long long n_dup[2];
long long n_diffchr[2], n_diffhigh[2];
long long n_secondary[2], n_supp[2];
+ long long n_primary[2], n_pmapped[2], n_pdup[2];
} bam_flagstat_t;
-#define flagstat_loop(s, c) do { \
- int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \
- ++(s)->n_reads[w]; \
- if ((c)->flag & BAM_FSECONDARY ) { \
- ++(s)->n_secondary[w]; \
- } else if ((c)->flag & BAM_FSUPPLEMENTARY ) { \
- ++(s)->n_supp[w]; \
- } else if ((c)->flag & BAM_FPAIRED) { \
- ++(s)->n_pair_all[w]; \
- if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w]; \
- if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \
- if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \
- if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \
- if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
- ++(s)->n_pair_map[w]; \
- if ((c)->mtid != (c)->tid) { \
- ++(s)->n_diffchr[w]; \
- if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \
- } \
- } \
- } \
- if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \
- if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \
- } while (0)
+inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+{ /* Update the flagstat counters in s for one record whose core fields are c.
+   * Every counter touched here is indexed by w: 0 when BAM_FQCFAIL is clear,
+   * 1 when it is set. Replaces the former flagstat_loop macro. */
+ int w = (c->flag & BAM_FQCFAIL)? 1 : 0;
+ ++s->n_reads[w];
+ if (c->flag & BAM_FSECONDARY ) {
+ ++s->n_secondary[w];
+ } else if (c->flag & BAM_FSUPPLEMENTARY ) {
+ ++s->n_supp[w];
+ } else { // neither secondary nor supplementary: counted as primary
+ ++s->n_primary[w];
+ // Pairing statistics are gathered only for primary records with BAM_FPAIRED set.
+ if (c->flag & BAM_FPAIRED) {
+ ++s->n_pair_all[w];
+ if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w];
+ if (c->flag & BAM_FREAD1) ++s->n_read1[w];
+ if (c->flag & BAM_FREAD2) ++s->n_read2[w];
+ if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w]; // this read mapped, mate unmapped
+ if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) { // both this read and its mate mapped
+ ++s->n_pair_map[w];
+ if (c->mtid != c->tid) { // mate reference id (mtid) differs from this record (tid)
+ ++s->n_diffchr[w];
+ if (c->qual >= 5) ++s->n_diffhigh[w]; // same case, restricted to qual of at least 5
+ }
+ }
+ }
+ // Mapped and duplicate tallies restricted to primary records.
+ if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w];
+ if (c->flag & BAM_FDUP) ++s->n_pdup[w];
+ }
+ if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; // counted for every record, primary or not
+ if (c->flag & BAM_FDUP) ++s->n_dup[w]; // counted for every record, primary or not
+}
bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
{
while ((ret = sam_read1(fp, h, b)) >= 0)
flagstat_loop(s, c);
bam_destroy1(b);
- if (ret != -1)
- fprintf(samtools_stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+ if (ret != -1) {
+ free(s);
+ return NULL;
+ }
return s;
}
fprintf(fp, " -O, --");
fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n"
" Specify output format (json, tsv)\n");
- exit(exit_status);
+ samtools_exit(exit_status);
}
static void out_fmt_default(bam_flagstat_t *s)
{
char b0[16], b1[16];
fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ fprintf(samtools_stdout, "%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]);
fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+ fprintf(samtools_stdout, "%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+ fprintf(samtools_stdout, "%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
}
static void out_fmt_json(bam_flagstat_t *s) {
- char b0[16], b1[16];
+ char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16];
fprintf(samtools_stdout, "{\n \"QC-passed reads\": { \n"
" \"total\": %lld, \n"
+ " \"primary\": %lld, \n"
" \"secondary\": %lld, \n"
" \"supplementary\": %lld, \n"
" \"duplicates\": %lld, \n"
+ " \"primary duplicates\": %lld, \n"
" \"mapped\": %lld, \n"
" \"mapped %%\": %s, \n"
+ " \"primary mapped\": %lld, \n"
+ " \"primary mapped %%\": %s, \n"
" \"paired in sequencing\": %lld, \n"
" \"read1\": %lld, \n"
" \"read2\": %lld, \n"
" },"
"\n \"QC-failed reads\": { \n"
" \"total\": %lld, \n"
+ " \"primary\": %lld, \n"
" \"secondary\": %lld, \n"
" \"supplementary\": %lld, \n"
" \"duplicates\": %lld, \n"
+ " \"primary duplicates\": %lld, \n"
" \"mapped\": %lld, \n"
" \"mapped %%\": %s, \n"
+ " \"primary mapped\": %lld, \n"
+ " \"primary mapped %%\": %s, \n"
" \"paired in sequencing\": %lld, \n"
" \"read1\": %lld, \n"
" \"read2\": %lld, \n"
" }\n"
"}\n",
s->n_reads[0],
+ s->n_primary[0],
s->n_secondary[0],
s->n_supp[0],
s->n_dup[0],
+ s->n_pdup[0],
s->n_mapped[0],
percent_json(b0, s->n_mapped[0], s->n_reads[0]),
+ s->n_pmapped[0],
+ percent_json(p0, s->n_pmapped[0], s->n_primary[0]),
s->n_pair_all[0],
s->n_read1[0],
s->n_read2[0],
s->n_pair_good[0],
- percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]),
+ percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]),
s->n_pair_map[0],
s->n_sgltn[0],
- percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]),
+ percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]),
s->n_diffchr[0],
s->n_diffhigh[0],
s->n_reads[1],
+ s->n_primary[1],
s->n_secondary[1],
s->n_supp[1],
s->n_dup[1],
+ s->n_pdup[1],
s->n_mapped[1],
percent_json(b1, s->n_mapped[1], s->n_reads[1]),
+ s->n_pmapped[1],
+ percent_json(p1, s->n_pmapped[1], s->n_primary[1]),
s->n_pair_all[1],
s->n_read1[1],
s->n_read2[1],
s->n_pair_good[1],
- percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]),
+ percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]),
s->n_pair_map[1],
s->n_sgltn[1],
- percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]),
+ percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]),
s->n_diffchr[1],
s->n_diffhigh[1]
);
static void out_fmt_tsv(bam_flagstat_t *s) {
char b0[16], b1[16];
fprintf(samtools_stdout, "%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+ fprintf(samtools_stdout, "%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]);
fprintf(samtools_stdout, "%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]);
fprintf(samtools_stdout, "%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]);
fprintf(samtools_stdout, "%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]);
+ fprintf(samtools_stdout, "%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
fprintf(samtools_stdout, "%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]);
fprintf(samtools_stdout, "%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+ fprintf(samtools_stdout, "%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]);
+ fprintf(samtools_stdout, "%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
fprintf(samtools_stdout, "%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
fprintf(samtools_stdout, "%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]);
fprintf(samtools_stdout, "%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]);
sam_hdr_t *header;
bam_flagstat_t *s;
const char *out_fmt = "default";
- int c;
+ int c, status = EXIT_SUCCESS;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
}
s = bam_flagstat_core(fp, header);
- output_fmt(s, out_fmt);
- free(s);
+ if (s) {
+ output_fmt(s, out_fmt);
+ free(s);
+ }
+ else {
+ print_error("flagstat", "error reading from \"%s\"", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
sam_hdr_destroy(header);
sam_close(fp);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2019 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <string.h>
#include "htslib/hts.h"
+#include "htslib/hfile.h"
#include "samtools.h"
#include "version.h"
int bam_idxstats(int argc, char *argv[]);
int bam_markdup(int argc, char *argv[]);
int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
int main_reheader(int argc, char *argv[]);
int main_cut_target(int argc, char *argv[]);
int main_phase(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int dict_main(int argc, char *argv[]);
int fqidx_main(int argc, char *argv[]);
+int amplicon_clip_main(int argc, char *argv[]);
+int main_ampliconstats(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
const char *samtools_version()
{
return SAMTOOLS_VERSION;
}
+// Returns a string literal of space-terminated key=value tokens chosen at compile time from macros that come out of the config.h file built by autoconf or Makefile.
+const char *samtools_feature_string(void) {
+ const char *fmt =
+ // PACKAGE_URL defined selects build=configure, otherwise build=Makefile.
+#ifdef PACKAGE_URL
+ "build=configure "
+#else
+ "build=Makefile "
+#endif
+ // HAVE_CURSES defined selects curses=yes, otherwise curses=no.
+#ifdef HAVE_CURSES
+ "curses=yes "
+#else
+ "curses=no "
+#endif
+ ;
+ // The adjacent literals above form one string; nothing is formatted at run time.
+ return fmt;
+}
+
+static void long_version(void) { /* Print the detailed version report with printf:
+   * version and copyright banner, samtools build settings, HTSlib build settings,
+   * then each HTSlib hfile plugin followed by the URL schemes it handles.
+   * Returns early, printing nothing further, if a plugin or scheme listing call
+   * returns a negative value. */
+ printf("samtools %s\n"
+ "Using htslib %s\n"
+ "Copyright (C) 2021 Genome Research Ltd.\n",
+ samtools_version(), hts_version());
+ // Samtools build settings; the SAMTOOLS_* names are presumably compile-time macros - confirm where they are defined.
+ printf("\nSamtools compilation details:\n");
+ printf(" Features: %s\n", samtools_feature_string());
+ printf(" CC: %s\n", SAMTOOLS_CC);
+ printf(" CPPFLAGS: %s\n", SAMTOOLS_CPPFLAGS);
+ printf(" CFLAGS: %s\n", SAMTOOLS_CFLAGS);
+ printf(" LDFLAGS: %s\n", SAMTOOLS_LDFLAGS);
+ printf(" HTSDIR: %s\n", SAMTOOLS_HTSDIR);
+ printf(" LIBS: %s\n", SAMTOOLS_LIBS);
+ printf(" CURSES_LIB: %s\n", SAMTOOLS_CURSES_LIB);
+ // The corresponding HTSlib details are obtained by calling into HTSlib.
+ printf("\nHTSlib compilation details:\n");
+ printf(" Features: %s\n", hts_feature_string());
+ printf(" CC: %s\n", hts_test_feature(HTS_FEATURE_CC));
+ printf(" CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS));
+ printf(" CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS));
+ printf(" LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS));
+
+ // Plugins and schemes: both result arrays below are fixed at 100 entries.
+ printf("\nHTSlib URL scheme handlers present:\n");
+ const char *plugins[100];
+ int np = 100, i, j;
+ // NOTE(review): np looks like an in/out argument (capacity in, count out) - confirm against htslib/hfile.h.
+ if (hfile_list_plugins(plugins, &np) < 0)
+ return;
+
+ for (i = 0; i < np; i++) {
+ const char *sc_list[100];
+ int nschemes = 100; // NOTE(review): presumably in/out like np - confirm
+ if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0)
+ return;
+ // One output line per plugin: its name, then its schemes.
+ printf(" %s:\t", plugins[i]);
+ for (j = 0; j < nschemes; j++)
+ printf(" %s%c", sc_list[j], ",\n"[j+1==nschemes]); // comma after every scheme but the last, newline after the last
+ }
+}
+
static void usage(FILE *fp)
{
/* Please improve the grouping */
" targetcut cut fosmid regions (for fosmid pool only)\n"
" addreplacerg adds or replaces RG tags\n"
" markdup mark duplicates\n"
+" ampliconclip clip oligos from the end of reads\n"
"\n"
" -- File operations\n"
" collate shuffle and group alignments by name\n"
" quickcheck quickly check if SAM/BAM/CRAM file appears intact\n"
" fastq converts a BAM to a FASTQ\n"
" fasta converts a BAM to a FASTA\n"
+" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
" idxstats BAM index stats\n"
" phase phase heterozygotes\n"
" stats generate stats (former bamcheck)\n"
+" ampliconstats generate amplicon specific stats\n"
"\n"
" -- Viewing\n"
" flags explain BAM flags\n"
" tview text alignment viewer\n"
" view SAM<->BAM<->CRAM conversion\n"
" depad convert padded BAM to unpadded BAM\n"
+"\n"
+" -- Misc\n"
+" help [cmd] display this help message or help for [cmd]\n"
+" version detailed version information\n"
"\n");
-#ifdef _WIN32
- fprintf(fp,
-"Note: The Windows version of SAMtools is mainly designed for read-only\n"
-" operations, such as viewing the alignments and generating the pileup.\n"
-" Binary files generated by the Windows version may be buggy.\n\n");
-#endif
}
// This is a tricky one, but on Windows the filename wildcard expansion is done by
else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0 ||
strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
return 1;
}
else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
- else if (strcmp(argv[1], "--version") == 0) {
- printf(
-"samtools %s\n"
-"Using htslib %s\n"
-"Copyright (C) 2019 Genome Research Ltd.\n",
- samtools_version(), hts_version());
+ else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "version") == 0 || \
+ strcmp(argv[1], "--version") == 0) {
+ long_version();
}
else if (strcmp(argv[1], "--version-only") == 0) {
printf("%s+htslib-%s\n", samtools_version(), hts_version());
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2019 Genome Research Ltd.
+ Copyright (C) 2008-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <string.h>
#include "htslib/hts.h"
+#include "htslib/hfile.h"
#include "samtools.h"
#include "version.h"
+#include "samtools_config_vars.h"
int bam_taf2baf(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
int bam_idxstats(int argc, char *argv[]);
int bam_markdup(int argc, char *argv[]);
int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
-int main_reheader(int argc, char *argv[]);
+int samtools_main_reheader(int argc, char *argv[]);
int main_cut_target(int argc, char *argv[]);
int main_phase(int argc, char *argv[]);
int main_cat(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int dict_main(int argc, char *argv[]);
int fqidx_main(int argc, char *argv[]);
+int amplicon_clip_main(int argc, char *argv[]);
+int main_ampliconstats(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
const char *samtools_version()
{
return SAMTOOLS_VERSION;
}
+// Returns a string literal of space-terminated key=value tokens chosen at compile time from macros that come out of the config.h file built by autoconf or Makefile.
+const char *samtools_feature_string(void) {
+ const char *fmt =
+ // PACKAGE_URL defined selects build=configure, otherwise build=Makefile.
+#ifdef PACKAGE_URL
+ "build=configure "
+#else
+ "build=Makefile "
+#endif
+ // HAVE_CURSES defined selects curses=yes, otherwise curses=no.
+#ifdef HAVE_CURSES
+ "curses=yes "
+#else
+ "curses=no "
+#endif
+ ;
+ // The adjacent literals above form one string; nothing is formatted at run time.
+ return fmt;
+}
+
+static void long_version(void) { /* Write the detailed version report to samtools_stdout:
+   * version and copyright banner, samtools build settings, HTSlib build settings,
+   * then each HTSlib hfile plugin followed by the URL schemes it handles.
+   * Returns early, printing nothing further, if a plugin or scheme listing call
+   * returns a negative value. */
+ fprintf(samtools_stdout, "samtools %s\n"
+ "Using htslib %s\n"
+ "Copyright (C) 2021 Genome Research Ltd.\n",
+ samtools_version(), hts_version());
+ // Samtools build settings; the SAMTOOLS_* names are presumably compile-time macros - confirm where they are defined.
+ fprintf(samtools_stdout, "\nSamtools compilation details:\n");
+ fprintf(samtools_stdout, " Features: %s\n", samtools_feature_string());
+ fprintf(samtools_stdout, " CC: %s\n", SAMTOOLS_CC);
+ fprintf(samtools_stdout, " CPPFLAGS: %s\n", SAMTOOLS_CPPFLAGS);
+ fprintf(samtools_stdout, " CFLAGS: %s\n", SAMTOOLS_CFLAGS);
+ fprintf(samtools_stdout, " LDFLAGS: %s\n", SAMTOOLS_LDFLAGS);
+ fprintf(samtools_stdout, " HTSDIR: %s\n", SAMTOOLS_HTSDIR);
+ fprintf(samtools_stdout, " LIBS: %s\n", SAMTOOLS_LIBS);
+ fprintf(samtools_stdout, " CURSES_LIB: %s\n", SAMTOOLS_CURSES_LIB);
+ // The corresponding HTSlib details are obtained by calling into HTSlib.
+ fprintf(samtools_stdout, "\nHTSlib compilation details:\n");
+ fprintf(samtools_stdout, " Features: %s\n", hts_feature_string());
+ fprintf(samtools_stdout, " CC: %s\n", hts_test_feature(HTS_FEATURE_CC));
+ fprintf(samtools_stdout, " CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS));
+ fprintf(samtools_stdout, " CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS));
+ fprintf(samtools_stdout, " LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS));
+
+ // Plugins and schemes: both result arrays below are fixed at 100 entries.
+ fprintf(samtools_stdout, "\nHTSlib URL scheme handlers present:\n");
+ const char *plugins[100];
+ int np = 100, i, j;
+ // NOTE(review): np looks like an in/out argument (capacity in, count out) - confirm against htslib/hfile.h.
+ if (hfile_list_plugins(plugins, &np) < 0)
+ return;
+
+ for (i = 0; i < np; i++) {
+ const char *sc_list[100];
+ int nschemes = 100; // NOTE(review): presumably in/out like np - confirm
+ if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0)
+ return;
+ // One output line per plugin: its name, then its schemes.
+ fprintf(samtools_stdout, " %s:\t", plugins[i]);
+ for (j = 0; j < nschemes; j++)
+ fprintf(samtools_stdout, " %s%c", sc_list[j], ",\n"[j+1==nschemes]); // comma after every scheme but the last, newline after the last
+ }
+}
+
static void usage(FILE *fp)
{
/* Please improve the grouping */
" targetcut cut fosmid regions (for fosmid pool only)\n"
" addreplacerg adds or replaces RG tags\n"
" markdup mark duplicates\n"
+" ampliconclip clip oligos from the end of reads\n"
"\n"
" -- File operations\n"
" collate shuffle and group alignments by name\n"
" quickcheck quickly check if SAM/BAM/CRAM file appears intact\n"
" fastq converts a BAM to a FASTQ\n"
" fasta converts a BAM to a FASTA\n"
+" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
" idxstats BAM index stats\n"
" phase phase heterozygotes\n"
" stats generate stats (former bamcheck)\n"
+" ampliconstats generate amplicon specific stats\n"
"\n"
" -- Viewing\n"
" flags explain BAM flags\n"
" tview text alignment viewer\n"
" view SAM<->BAM<->CRAM conversion\n"
" depad convert padded BAM to unpadded BAM\n"
+"\n"
+" -- Misc\n"
+" help [cmd] display this help message or help for [cmd]\n"
+" version detailed version information\n"
"\n");
-#ifdef _WIN32
- fprintf(fp,
-"Note: The Windows version of SAMtools is mainly designed for read-only\n"
-" operations, such as viewing the alignments and generating the pileup.\n"
-" Binary files generated by the Windows version may be buggy.\n\n");
-#endif
}
// This is a tricky one, but on Windows the filename wildcard expansion is done by
else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0 ||
strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1);
else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1);
- else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1);
+ else if (strcmp(argv[1], "reheader") == 0) ret = samtools_main_reheader(argc-1, argv+1);
else if (strcmp(argv[1], "cat") == 0) ret = main_cat(argc-1, argv+1);
else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1);
else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1);
return 1;
}
//else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
- else if (strcmp(argv[1], "--version") == 0) {
- fprintf(samtools_stdout,
-"samtools %s\n"
-"Using htslib %s\n"
-"Copyright (C) 2019 Genome Research Ltd.\n",
- samtools_version(), hts_version());
+ else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "version") == 0 || \
+ strcmp(argv[1], "--version") == 0) {
+ long_version();
}
else if (strcmp(argv[1], "--version-only") == 0) {
fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version());
/* bedcov.c -- bedcov subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
+#define DEFAULT_DEPTH 64000
+
typedef struct {
htsFile *fp;
sam_hdr_t *header;
hts_itr_t *iter;
int min_mapQ;
+ uint32_t flags; // read filtering flags
} aux_t;
static int read_bam(void *data, bam1_t *b)
{
ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
if ( ret<0 ) break;
- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( b->core.flag & aux->flags ) continue;
if ( (int)b->core.qual < aux->min_mapQ ) continue;
break;
}
kstream_t *ks;
hts_idx_t **idx;
aux_t **aux;
- int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0;
- int64_t *cnt;
+ int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
+ int64_t *cnt, *pcov = NULL;;
const bam_pileup1_t **plp;
int usage = 0, has_index_file = 0;
+ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
+ int tflags = 0, min_depth = -1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
case 'X': has_index_file = 1; break;
+ case 'g':
+ tflags = bam_str2flag(optarg);
+ if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+ print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+ return 1;
+ }
+ flags &= ~tflags;
+ break;
+ case 'G':
+ tflags = bam_str2flag(optarg);
+ if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+ print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+ return 1;
+ }
+ flags |= tflags;
+ break;
case 'j': skip_DN = 1; break;
+ case 'd': min_depth = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage = 1; break;
fprintf(stderr, "Options:\n");
fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(stderr, " -X use customized index files\n");
+ fprintf(stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
+ fprintf(stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
+ " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
+ fprintf(stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
+ " including this value will be displayed in a separate column\n");
sam_global_opt_help(stderr, "-.--.--.");
return 1;
}
argv[i+optind+1]);
return 2;
}
+ aux[i]->flags = flags;
}
- cnt = calloc(n, 8);
+ cnt = calloc(n, sizeof(*cnt));
+ if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
+ if (!cnt || (min_depth >= 0 && !pcov)) return 2;
fp = gzopen(argv[optind], "rb");
if (fp == NULL) {
plp = calloc(n, sizeof(bam_pileup1_t*));
while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
char *p, *q;
- int tid, beg, end, pos;
+ int tid, pos, num = 0;
+ int64_t beg = 0, end = 0;
bam_mplp_t mplp;
if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
be followed by a tab in that case). */
if (strncmp(str.s, "track ", 6) == 0) continue;
if (strncmp(str.s, "browser ", 8) == 0) continue;
- for (p = q = str.s; *p && *p != '\t'; ++p);
- if (*p != '\t') goto bed_error;
- *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
+ for (p = q = str.s; *p && !isspace(*p); ++p);
+ if (*p == 0) goto bed_error;
+ char c = *p;
+ *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
if (tid < 0) goto bed_error;
- for (q = p = p + 1; isdigit(*p); ++p);
- if (*p != '\t') goto bed_error;
- *p = 0; beg = atoi(q); *p = '\t';
- for (q = p = p + 1; isdigit(*p); ++p);
- if (*p == '\t' || *p == 0) {
- int c = *p;
- *p = 0; end = atoi(q); *p = c;
- } else goto bed_error;
+ num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
+ if (num < 2 || end < beg) goto bed_error;
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
}
+
mplp = bam_mplp_init(n, read_bam, (void**)aux);
- bam_mplp_set_maxcnt(mplp, 64000);
- memset(cnt, 0, 8 * n);
- while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+ if (min_depth > DEFAULT_DEPTH)
+ bam_mplp_set_maxcnt(mplp, min_depth);
+ else
+ bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+
+ memset(cnt, 0, sizeof(*cnt) * n);
+ if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+
+ while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
if (pos >= beg && pos < end) {
- for (i = 0, m = 0; i < n; ++i) {
- if (skip_DN)
+ for (i = 0; i < n; ++i) {
+ m = 0;
+ if (skip_DN || min_depth >= 0) {
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *pi = plp[i] + j;
if (pi->is_del || pi->is_refskip) ++m;
}
- cnt[i] += n_plp[i] - m;
+ }
+ int pd = n_plp[i] - m;
+ cnt[i] += pd;
+ if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
}
}
+
+ if (ret < 0) {
+ print_error("bedcov", "error reading from input file");
+ status = 2;
+ bam_mplp_destroy(mplp);
+ break;
+ }
+
for (i = 0; i < n; ++i) {
kputc('\t', &str);
kputl(cnt[i], &str);
}
+ if (min_depth >= 0) {
+ for (i = 0; i < n; ++i) {
+ kputc('\t', &str);
+ kputl(pcov[i], &str);
+ }
+ }
puts(str.s);
bam_mplp_destroy(mplp);
continue;
bed_error:
fprintf(stderr, "Errors in BED line '%s'\n", str.s);
+ status = 2;
}
free(n_plp); free(plp);
ks_destroy(ks);
gzclose(fp);
free(cnt);
+ free(pcov);
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
hts_idx_destroy(idx[i]);
free(aux); free(idx);
free(str.s);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
/* bedcov.c -- bedcov subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include "htslib/kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
+#define DEFAULT_DEPTH 64000
+
typedef struct {
htsFile *fp;
sam_hdr_t *header;
hts_itr_t *iter;
int min_mapQ;
+ uint32_t flags; // read filtering flags: read_bam() skips any read whose FLAG has one of these bits set
} aux_t;
static int read_bam(void *data, bam1_t *b)
{
ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
if ( ret<0 ) break;
- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+ if ( b->core.flag & aux->flags ) continue;
if ( (int)b->core.qual < aux->min_mapQ ) continue;
break;
}
kstream_t *ks;
hts_idx_t **idx;
aux_t **aux;
- int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0;
- int64_t *cnt;
+ int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
+ int64_t *cnt, *pcov = NULL;;
const bam_pileup1_t **plp;
int usage = 0, has_index_file = 0;
+ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
+ int tflags = 0, min_depth = -1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
case 'X': has_index_file = 1; break;
+ case 'g':
+ tflags = bam_str2flag(optarg);
+ if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+ print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+ return 1;
+ }
+ flags &= ~tflags;
+ break;
+ case 'G':
+ tflags = bam_str2flag(optarg);
+ if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+ print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+ return 1;
+ }
+ flags |= tflags;
+ break;
case 'j': skip_DN = 1; break;
+ case 'd': min_depth = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage = 1; break;
fprintf(samtools_stderr, "Options:\n");
fprintf(samtools_stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(samtools_stderr, " -X use customized index files\n");
+ fprintf(samtools_stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
+ fprintf(samtools_stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
+ " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
+ fprintf(samtools_stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
+ " including this value will be displayed in a separate column\n");
sam_global_opt_help(samtools_stderr, "-.--.--.");
return 1;
}
argv[i+optind+1]);
return 2;
}
+ aux[i]->flags = flags;
}
- cnt = calloc(n, 8);
+ cnt = calloc(n, sizeof(*cnt));
+ if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
+ if (!cnt || (min_depth >= 0 && !pcov)) return 2;
fp = gzopen(argv[optind], "rb");
if (fp == NULL) {
plp = calloc(n, sizeof(bam_pileup1_t*));
while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
char *p, *q;
- int tid, beg, end, pos;
+ int tid, pos, num = 0;
+ int64_t beg = 0, end = 0;
bam_mplp_t mplp;
if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
be followed by a tab in that case). */
if (strncmp(str.s, "track ", 6) == 0) continue;
if (strncmp(str.s, "browser ", 8) == 0) continue;
- for (p = q = str.s; *p && *p != '\t'; ++p);
- if (*p != '\t') goto bed_error;
- *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
+ for (p = q = str.s; *p && !isspace(*p); ++p);
+ if (*p == 0) goto bed_error;
+ char c = *p;
+ *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
if (tid < 0) goto bed_error;
- for (q = p = p + 1; isdigit(*p); ++p);
- if (*p != '\t') goto bed_error;
- *p = 0; beg = atoi(q); *p = '\t';
- for (q = p = p + 1; isdigit(*p); ++p);
- if (*p == '\t' || *p == 0) {
- int c = *p;
- *p = 0; end = atoi(q); *p = c;
- } else goto bed_error;
+ num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
+ if (num < 2 || end < beg) goto bed_error;
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
}
+
mplp = bam_mplp_init(n, read_bam, (void**)aux);
- bam_mplp_set_maxcnt(mplp, 64000);
- memset(cnt, 0, 8 * n);
- while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+ if (min_depth > DEFAULT_DEPTH)
+ bam_mplp_set_maxcnt(mplp, min_depth);
+ else
+ bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+
+ memset(cnt, 0, sizeof(*cnt) * n);
+ if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+
+ while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
if (pos >= beg && pos < end) {
- for (i = 0, m = 0; i < n; ++i) {
- if (skip_DN)
+ for (i = 0; i < n; ++i) {
+ m = 0;
+ if (skip_DN || min_depth >= 0) {
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *pi = plp[i] + j;
if (pi->is_del || pi->is_refskip) ++m;
}
- cnt[i] += n_plp[i] - m;
+ }
+ int pd = n_plp[i] - m;
+ cnt[i] += pd;
+ if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
}
}
+
+ if (ret < 0) {
+ print_error("bedcov", "error reading from input file");
+ status = 2;
+ bam_mplp_destroy(mplp);
+ break;
+ }
+
for (i = 0; i < n; ++i) {
kputc('\t', &str);
kputl(cnt[i], &str);
}
+ if (min_depth >= 0) {
+ for (i = 0; i < n; ++i) {
+ kputc('\t', &str);
+ kputl(pcov[i], &str);
+ }
+ }
samtools_puts(str.s);
bam_mplp_destroy(mplp);
continue;
bed_error:
fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s);
+ status = 2;
}
free(n_plp); free(plp);
ks_destroy(ks);
gzclose(fp);
free(cnt);
+ free(pcov);
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
hts_idx_destroy(idx[i]);
free(aux); free(idx);
free(str.s);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
return kh_key(h, i);
}
+/**
+ * Create a region list from the region hash table
+ * @param reg_hash The region hash table
+ * @param filter 0 - allow all regions, 1 - allow only selected regions
+ * @param n_reg Pointer to the returned region number
+ * @return The regions list as a hts_reglist_t
+ */
+
hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) {
reghash_t *h;
return kh_key(h, i);
}
+/**
+ * Create a region list from the region hash table
+ * @param reg_hash The region hash table
+ * @param filter 0 - allow all regions, 1 - allow only selected regions
+ * @param n_reg Pointer to the returned region number
+ * @return The regions list as a hts_reglist_t
+ */
+
hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) {
reghash_t *h;
/* coverage.c -- samtools coverage subcommand
Copyright (C) 2018,2019 Florian Breitwieser
- Portions copyright (C) 2019 Genome Research Ltd.
+ Portions copyright (C) 2019-2021 Genome Research Ltd.
Author: Florian P Breitwieser <florian.bw@gmail.com>
DEALINGS IN THE SOFTWARE. */
/* This program calculates coverage from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
* To compile this program separately, you may:
*
* gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
const char *VERSION = "0.1";
-typedef struct { // auxiliary data structure to hold a BAM file
- samFile *fp; // file handle
- sam_hdr_t *hdr; // file header
- hts_itr_t *iter; // iterator to a region - NULL for us by default
- int min_mapQ; // mapQ filter
- int min_len; // length filter
- unsigned int n_reads; // records the number of reads seen in file
- unsigned int n_selected_reads; // records the number of reads passing filter
- unsigned long summed_mapQ; // summed mapQ of all reads passing filter
- int fail_flags;
- int required_flags;
-} bam_aux_t;
-
typedef struct { // auxiliary data structure to hold stats on coverage
unsigned long long n_covered_bases;
unsigned long long summed_coverage;
unsigned long long summed_mapQ;
unsigned int n_reads;
unsigned int n_selected_reads;
- int32_t tid; // chromosome ID, defined by header
+ bool covered; // set when the pileup loop first reaches this reference; in tabular mode without a region, entries left unset are printed afterwards
hts_pos_t beg;
hts_pos_t end;
int64_t bin_width;
} stats_aux_t;
+typedef struct { // auxiliary data structure to hold a BAM file
+ samFile *fp; // file handle
+ sam_hdr_t *hdr; // file header
+ hts_itr_t *iter; // iterator to a region - NULL for us by default
+ int min_mapQ; // mapQ filter: reads with a lower mapping quality are skipped
+ int min_len; // length filter: reads whose CIGAR query length is shorter are skipped (0 disables the check)
+ int fail_flags; // reads with any of these flag bits set are skipped (0 disables the check)
+ int required_flags; // reads with none of these flag bits set are skipped (0 disables the check)
+ stats_aux_t *stats; // per-reference statistics indexed by tid; read_bam() updates the read counters in place
+} bam_aux_t;
+
#if __STDC_VERSION__ >= 199901L
#define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
// LOWER ONE EIGHTH BLOCK … FULL BLOCK
static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
// In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
-static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
#else
"\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
"\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
-static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
#endif
"Input options:\n"
" -b, --bam-list FILE list of input BAM filenames, one per line\n"
" -l, --min-read-len INT ignore reads shorter than INT bp [0]\n"
- " -q, --min-MQ INT base quality threshold [0]\n"
- " -Q, --min-BQ INT mapping quality threshold [0]\n"
+ " -q, --min-MQ INT mapping quality threshold [0]\n"
+ " -Q, --min-BQ INT base quality threshold [0]\n"
" --rf <int|str> required flags: skip reads with mask bits unset []\n"
" --ff <int|str> filter flags: skip reads with mask bits set \n"
" [UNMAP,SECONDARY,QCFAIL,DUP]\n"
+ " -d, --depth INT maximum allowed coverage depth [1000000].\n"
+ " If 0, depth is set to the maximum integer value,\n"
+ " effectively removing any depth limit.\n"
"Output options:\n"
" -m, --histogram show histogram instead of tabular output\n"
" -A, --ascii show only ASCII characters in histogram\n"
return buf;
}
-static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) {
- int i;
- stats->n_reads = 0;
- stats->n_selected_reads = 0;
- stats->summed_mapQ = 0;
- for (i = 0; i < n_bam_files && data[i]; ++i) {
- stats->n_reads += data[i]->n_reads;
- stats->n_selected_reads += data[i]->n_selected_reads;
- stats->summed_mapQ += data[i]->summed_mapQ;
- data[i]->n_reads = 0;
- data[i]->n_selected_reads = 0;
- data[i]->summed_mapQ = 0;
- }
-}
-
// read one alignment from one BAM file
static int read_bam(void *data, bam1_t *b) {
bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
+ int nref = sam_hdr_nref(aux->hdr); // upper bound used to guard the stats[] index below
int ret;
while (1) {
if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
- ++aux->n_reads;
+ if (b->core.tid >= 0 && b->core.tid < nref) // reads whose tid falls outside [0, nref) are not tallied
+ aux->stats[b->core.tid].n_reads++; // counted before any filter is applied
if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
if ( b->core.qual < aux->min_mapQ ) continue;
if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
- ++aux->n_selected_reads;
- aux->summed_mapQ += b->core.qual;
+ if (b->core.tid >= 0 && b->core.tid < nref) { // the read passed every filter above
+ aux->stats[b->core.tid].n_selected_reads++; // tallied against the read's own reference
+ aux->stats[b->core.tid].summed_mapQ += b->core.qual; // running sum of mapQ over the selected reads
+ }
break;
}
return ret;
}
-void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) {
- fputs(sam_hdr_tid2name(h, stats->tid), file_out);
- double region_len = (double) stats->end - stats->beg;
+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) { // stats is an array; tid selects the entry to print
+ fputs(sam_hdr_tid2name(h, tid), file_out); // first column: reference name
+ double region_len = (double) stats[tid].end - stats[tid].beg; // denominator for the percentage and mean-depth columns
fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
- stats->beg+1,
- stats->end,
- stats->n_selected_reads,
- stats->n_covered_bases,
- 100.0 * stats->n_covered_bases / region_len,
- stats->summed_coverage / region_len,
- stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0,
- stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0
+ stats[tid].beg+1, // start, printed as beg + 1
+ stats[tid].end, // end
+ stats[tid].n_selected_reads, // number of selected reads
+ stats[tid].n_covered_bases, // number of covered bases
+ 100.0 * stats[tid].n_covered_bases / region_len, // percentage of the region covered
+ stats[tid].summed_coverage / region_len, // mean depth over the region
+ stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0, // mean base quality, 0 when nothing is covered
+ stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0 // mean mapping quality, 0 when no read was selected
);
}
-void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist,
+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
const int hist_size, const bool full_utf) {
int i, col;
bool show_percentiles = false;
const int n_rows = 10;
const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
const int blockchar_len = full_utf? 8 : 2;
- /*
- if (stats->beg == 0) {
- stats->end = h->target_len[stats->tid];
- }
- */
- double region_len = stats->end - stats->beg;
+ double region_len = stats[tid].end - stats[tid].beg;
// Calculate histogram that contains percent covered
double hist_data[hist_size];
double max_val = 0.0;
for (i = 0; i < hist_size; ++i) {
- hist_data[i] = 100 * hist[i] / (double) stats->bin_width;
+ hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
if (hist_data[i] > max_val) max_val = hist_data[i];
}
char buf[30];
- fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf));
+ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
double row_bin_size = max_val / (double) n_rows;
for (i = n_rows-1; i >= 0; --i) {
} else {
fprintf(file_out, ">%7.2f%% ", current_bin);
}
- fprintf(file_out, VERTICAL_LINE);
+ fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
for (col = 0; col < hist_size; ++col) {
// get the difference in eights, or halfs when full UTF8 is not supported
int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
}
}
- fprintf(file_out, VERTICAL_LINE);
+ fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
fputc(' ', file_out);
switch (i) {
- case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break;
- case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break;
- case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break;
+ case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+ case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
+ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
case 6: fprintf(file_out, "Percent covered: %.4g%%",
- 100.0 * stats->n_covered_bases / region_len); break;
+ 100.0 * stats[tid].n_covered_bases / region_len); break;
case 5: fprintf(file_out, "Mean coverage: %.3gx",
- stats->summed_coverage / region_len); break;
+ stats[tid].summed_coverage / region_len); break;
case 4: fprintf(file_out, "Mean baseQ: %.3g",
- stats->summed_baseQ/(double) stats->summed_coverage); break;
+ stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
case 3: fprintf(file_out, "Mean mapQ: %.3g",
- stats->summed_mapQ/(double) stats->n_selected_reads); break;
+ stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
case 1: fprintf(file_out, "Histo bin width: %sbp",
- readable_bps(stats->bin_width, buf)); break;
+ readable_bps(stats[tid].bin_width, buf)); break;
case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break;
};
fputc('\n', file_out);
// print x axis. Could be made pretty for widths that are not divisible
// by 10 by variable spacing of the labels, instead of placing a label every 10 characters
char buf2[50];
- fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10));
+ fprintf(file_out, " %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
int rest;
for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
- fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10));
+ fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
}
int last_padding = hist_size%10;
- fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10));
+ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
fprintf(file_out, "\n");
}
int main_coverage(int argc, char *argv[]) {
int status = EXIT_SUCCESS;
- int ret, tid, pos, i, j;
+ int ret, tid = -1, old_tid = -1, pos, i, j;
- int max_depth = 0;
+ int max_depth = 1000000;
int opt_min_baseQ = 0;
int opt_min_mapQ = 0;
int opt_min_len = 0;
bool opt_print_header = true;
bool opt_print_tabular = true;
bool opt_print_histogram = false;
- bool *covered_tids = NULL;
bool opt_full_utf = true;
FILE *file_out = stdout;
{"incl-flags", required_argument, NULL, 1}, // require flag
{"excl-flags", required_argument, NULL, 2}, // filter flag
{"bam-list", required_argument, NULL, 'b'},
- {"min-read-len", required_argument, NULL, 'L'},
+ {"min-read-len", required_argument, NULL, 'l'},
{"min-MQ", required_argument, NULL, 'q'},
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"n-bins", required_argument, NULL, 'w'},
{"region", required_argument, NULL, 'r'},
{"help", no_argument, NULL, 'h'},
+ {"depth", required_argument, NULL, 'd'},
{ NULL, 0, NULL, 0 }
};
// parse the command line
int c;
opterr = 0;
- while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) {
+ while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
switch (c) {
case 1:
if ((required_flags = bam_str2flag(optarg)) < 0) {
fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
}; break;
case 'o': opt_output_file = optarg; opt_full_width = false; break;
- case 'L': opt_min_len = atoi(optarg); break;
- case 'q': opt_min_baseQ = atoi(optarg); break;
- case 'Q': opt_min_mapQ = atoi(optarg); break;
+ case 'l': opt_min_len = atoi(optarg); break;
+ case 'q': opt_min_mapQ = atoi(optarg); break;
+ case 'Q': opt_min_baseQ = atoi(optarg); break;
+ case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
opt_print_histogram = true; opt_print_tabular = false;
break;
if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
}
-#else
+#elif defined TIOCGWINSZ
struct winsize w;
if (ioctl(2, TIOCGWINSZ, &w) == 0)
columns = w.ws_col;
data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
if (!data) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
int rf;
data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
if (!data[i]) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
// Set CRAM options on file handle - returns 0 on success
if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
- print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
+ print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
status = EXIT_FAILURE;
goto coverage_end;
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
- print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
+ print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
status = EXIT_FAILURE;
goto coverage_end;
}
data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
hts_idx_destroy(idx); // the index is not needed any more; free the memory
if (data[i]->iter == NULL) {
- print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg);
+ print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
status = EXIT_FAILURE;
goto coverage_end;
}
h = data[0]->hdr; // easy access to the header of the 1st BAM
int n_targets = sam_hdr_nref(h);
- covered_tids = calloc(n_targets, sizeof(bool));
- stats = calloc(1, sizeof(stats_aux_t));
- if (!covered_tids || !stats) {
- print_error("coverage", "Failed to allocate memory");
+ stats = calloc(n_targets, sizeof(stats_aux_t));
+ if (!stats) {
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
int64_t n_bins = opt_n_bins;
if (opt_reg) {
- stats->tid = data[0]->iter->tid;
- stats->beg = data[0]->iter->beg; // and to the parsed region coordinates
- stats->end = data[0]->iter->end;
- if (stats->end == HTS_POS_MAX) {
- stats->end = sam_hdr_tid2len(h, stats->tid);
+ stats_aux_t *s = stats + data[0]->iter->tid;
+ s->beg = data[0]->iter->beg; // and to the parsed region coordinates
+ s->end = data[0]->iter->end;
+ if (s->end == HTS_POS_MAX) {
+ s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
}
- if (opt_n_bins > stats->end - stats->beg) {
- n_bins = stats->end - stats->beg;
+ if (opt_n_bins > s->end - s->beg) {
+ n_bins = s->end - s->beg;
}
- stats->bin_width = (stats->end-stats->beg) / n_bins;
- } else {
- stats->tid = -1;
+ s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
}
+ for (i=0; i<n_bam_files; i++)
+ data[i]->stats = stats;
+
int64_t current_bin = 0;
// the core multi-pileup loop
n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
if (!hist || !n_plp || !plp) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
- if (tid != stats->tid) { // Next target sequence
- if (stats->tid >= 0) { // It's not the first sequence, print results
- set_read_counts(data, stats, n_bam_files);
+ if (tid != old_tid) { // Next target sequence
+ if (old_tid >= 0) {
if (opt_print_histogram) {
- print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+ print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
fputc('\n', file_out);
} else if (opt_print_tabular) {
- print_tabular_line(file_out, h, stats);
+ print_tabular_line(file_out, h, stats, old_tid);
}
- // reset data
- memset(stats, 0, sizeof(stats_aux_t));
if (opt_print_histogram)
memset(hist, 0, n_bins*sizeof(uint32_t));
}
- stats->tid = tid;
- covered_tids[tid] = true;
+ stats[tid].covered = true;
if (!opt_reg)
- stats->end = sam_hdr_tid2len(h, tid);
+ stats[tid].end = sam_hdr_tid2len(h, tid);
if (opt_print_histogram) {
- n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins;
- stats->bin_width = (stats->end-stats->beg) / n_bins;
+ n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
+ stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
}
+
+ old_tid = tid;
}
- if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip
+ if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
if (tid >= n_targets) continue; // diff number of @SQ lines per file?
if (opt_print_histogram) {
- current_bin = (pos - stats->beg) / stats->bin_width;
+ current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
}
bool count_base = false;
else if (p->qpos < p->b->core.l_qseq &&
bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
else
- stats->summed_baseQ += bam_get_qual(p->b)[p->qpos];
+ stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
}
if (depth_at_pos > 0) {
count_base = true;
- stats->summed_coverage += depth_at_pos;
+ stats[tid].summed_coverage += depth_at_pos;
}
// hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage
//fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
}
if (count_base) {
- ++(stats->n_covered_bases);
+ stats[tid].n_covered_bases++;
if (opt_print_histogram && current_bin < n_bins)
++(hist[current_bin]); // Histogram based on breadth of coverage
}
}
- if (stats->tid != -1) {
- set_read_counts(data, stats, n_bam_files);
+ if (tid == -1 && opt_reg && *opt_reg != '*')
+ // Region specified but no data covering it.
+ tid = data[0]->iter->tid;
+
+ if (tid < n_targets && tid >=0) {
if (opt_print_histogram) {
- print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+ print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
} else if (opt_print_tabular) {
- print_tabular_line(file_out, h, stats);
+ print_tabular_line(file_out, h, stats, tid);
}
}
if (!opt_reg && opt_print_tabular) {
- memset(stats, 0, sizeof(stats_aux_t));
for (i = 0; i < n_targets; ++i) {
- if (!covered_tids[i]) {
- stats->tid = i;
- stats->end = sam_hdr_tid2len(h, i);
- print_tabular_line(file_out, h, stats);
+ if (!stats[i].covered) {
+ stats[i].end = sam_hdr_tid2len(h, i);
+ print_tabular_line(file_out, h, stats, i);
}
}
}
coverage_end:
if (n_plp) free(n_plp);
if (plp) free(plp);
- bam_mplp_destroy(mplp);
+ if (mplp) bam_mplp_destroy(mplp);
- if (covered_tids) free(covered_tids);
if (hist) free(hist);
if (stats) free(stats);
-
// Close files and free data structures
if (!(file_out == stdout || fclose(file_out) == 0)) {
if (status == EXIT_SUCCESS) {
/* coverage.c -- samtools coverage subcommand
Copyright (C) 2018,2019 Florian Breitwieser
- Portions copyright (C) 2019 Genome Research Ltd.
+ Portions copyright (C) 2019-2021 Genome Research Ltd.
Author: Florian P Breitwieser <florian.bw@gmail.com>
DEALINGS IN THE SOFTWARE. */
/* This program calculates coverage from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
* To compile this program separately, you may:
*
* gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
const char *VERSION = "0.1";
-typedef struct { // auxiliary data structure to hold a BAM file
- samFile *fp; // file handle
- sam_hdr_t *hdr; // file header
- hts_itr_t *iter; // iterator to a region - NULL for us by default
- int min_mapQ; // mapQ filter
- int min_len; // length filter
- unsigned int n_reads; // records the number of reads seen in file
- unsigned int n_selected_reads; // records the number of reads passing filter
- unsigned long summed_mapQ; // summed mapQ of all reads passing filter
- int fail_flags;
- int required_flags;
-} bam_aux_t;
-
typedef struct { // auxiliary data structure to hold stats on coverage
unsigned long long n_covered_bases;
unsigned long long summed_coverage;
unsigned long long summed_mapQ;
unsigned int n_reads;
unsigned int n_selected_reads;
- int32_t tid; // chromosome ID, defined by header
+ bool covered; // per-reference marker replacing the removed tid field; NOTE(review): where it is set and read is not visible in this copy - confirm
hts_pos_t beg;
hts_pos_t end;
int64_t bin_width;
} stats_aux_t;
+typedef struct { // auxiliary data structure to hold a BAM file
+ samFile *fp; // file handle
+ sam_hdr_t *hdr; // file header
+ hts_itr_t *iter; // iterator to a region - NULL for us by default
+ int min_mapQ; // mapQ filter: reads with a lower mapping quality are skipped
+ int min_len; // length filter: reads whose CIGAR query length is shorter are skipped (0 disables the check)
+ int fail_flags; // reads with any of these flag bits set are skipped (0 disables the check)
+ int required_flags; // reads with none of these flag bits set are skipped (0 disables the check)
+ stats_aux_t *stats; // per-reference statistics indexed by tid; read_bam() updates the read counters in place
+} bam_aux_t;
+
#if __STDC_VERSION__ >= 199901L
#define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
// LOWER ONE EIGHTH BLOCK … FULL BLOCK
static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
// In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
-static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
#else
"\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
"\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
-static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
#endif
"Input options:\n"
" -b, --bam-list FILE list of input BAM filenames, one per line\n"
" -l, --min-read-len INT ignore reads shorter than INT bp [0]\n"
- " -q, --min-MQ INT base quality threshold [0]\n"
- " -Q, --min-BQ INT mapping quality threshold [0]\n"
+ " -q, --min-MQ INT mapping quality threshold [0]\n"
+ " -Q, --min-BQ INT base quality threshold [0]\n"
" --rf <int|str> required flags: skip reads with mask bits unset []\n"
" --ff <int|str> filter flags: skip reads with mask bits set \n"
" [UNMAP,SECONDARY,QCFAIL,DUP]\n"
+ " -d, --depth INT maximum allowed coverage depth [1000000].\n"
+ " If 0, depth is set to the maximum integer value,\n"
+ " effectively removing any depth limit.\n"
"Output options:\n"
" -m, --histogram show histogram instead of tabular output\n"
" -A, --ascii show only ASCII characters in histogram\n"
return buf;
}
-static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) {
- int i;
- stats->n_reads = 0;
- stats->n_selected_reads = 0;
- stats->summed_mapQ = 0;
- for (i = 0; i < n_bam_files && data[i]; ++i) {
- stats->n_reads += data[i]->n_reads;
- stats->n_selected_reads += data[i]->n_selected_reads;
- stats->summed_mapQ += data[i]->summed_mapQ;
- data[i]->n_reads = 0;
- data[i]->n_selected_reads = 0;
- data[i]->summed_mapQ = 0;
- }
-}
-
// read one alignment from one BAM file
static int read_bam(void *data, bam1_t *b) {
bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
+ int nref = sam_hdr_nref(aux->hdr); // upper bound used to guard the stats[] index below
int ret;
while (1) {
if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
- ++aux->n_reads;
+ if (b->core.tid >= 0 && b->core.tid < nref) // reads whose tid falls outside [0, nref) are not tallied
+ aux->stats[b->core.tid].n_reads++; // counted before any filter is applied
if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
if ( b->core.qual < aux->min_mapQ ) continue;
if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
- ++aux->n_selected_reads;
- aux->summed_mapQ += b->core.qual;
+ if (b->core.tid >= 0 && b->core.tid < nref) { // the read passed every filter above
+ aux->stats[b->core.tid].n_selected_reads++; // tallied against the read's own reference
+ aux->stats[b->core.tid].summed_mapQ += b->core.qual; // running sum of mapQ over the selected reads
+ }
break;
}
return ret;
}
-void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) {
- fputs(sam_hdr_tid2name(h, stats->tid), file_out);
- double region_len = (double) stats->end - stats->beg;
+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) {
+ fputs(sam_hdr_tid2name(h, tid), file_out);
+ double region_len = (double) stats[tid].end - stats[tid].beg;
fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
- stats->beg+1,
- stats->end,
- stats->n_selected_reads,
- stats->n_covered_bases,
- 100.0 * stats->n_covered_bases / region_len,
- stats->summed_coverage / region_len,
- stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0,
- stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0
+ stats[tid].beg+1,
+ stats[tid].end,
+ stats[tid].n_selected_reads,
+ stats[tid].n_covered_bases,
+ 100.0 * stats[tid].n_covered_bases / region_len,
+ stats[tid].summed_coverage / region_len,
+ stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
+ stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
);
}
-void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist,
+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
const int hist_size, const bool full_utf) {
int i, col;
bool show_percentiles = false;
const int n_rows = 10;
const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
const int blockchar_len = full_utf? 8 : 2;
- /*
- if (stats->beg == 0) {
- stats->end = h->target_len[stats->tid];
- }
- */
- double region_len = stats->end - stats->beg;
+ double region_len = stats[tid].end - stats[tid].beg;
// Calculate histogram that contains percent covered
double hist_data[hist_size];
double max_val = 0.0;
for (i = 0; i < hist_size; ++i) {
- hist_data[i] = 100 * hist[i] / (double) stats->bin_width;
+ hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
if (hist_data[i] > max_val) max_val = hist_data[i];
}
char buf[30];
- fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf));
+ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
double row_bin_size = max_val / (double) n_rows;
for (i = n_rows-1; i >= 0; --i) {
} else {
fprintf(file_out, ">%7.2f%% ", current_bin);
}
- fprintf(file_out, VERTICAL_LINE);
+ fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
for (col = 0; col < hist_size; ++col) {
// get the difference in eights, or halfs when full UTF8 is not supported
int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
}
}
- fprintf(file_out, VERTICAL_LINE);
+ fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
fputc(' ', file_out);
switch (i) {
- case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break;
- case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break;
- case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break;
+ case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+ case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
+ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
case 6: fprintf(file_out, "Percent covered: %.4g%%",
- 100.0 * stats->n_covered_bases / region_len); break;
+ 100.0 * stats[tid].n_covered_bases / region_len); break;
case 5: fprintf(file_out, "Mean coverage: %.3gx",
- stats->summed_coverage / region_len); break;
+ stats[tid].summed_coverage / region_len); break;
case 4: fprintf(file_out, "Mean baseQ: %.3g",
- stats->summed_baseQ/(double) stats->summed_coverage); break;
+ stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
case 3: fprintf(file_out, "Mean mapQ: %.3g",
- stats->summed_mapQ/(double) stats->n_selected_reads); break;
+ stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
case 1: fprintf(file_out, "Histo bin width: %sbp",
- readable_bps(stats->bin_width, buf)); break;
+ readable_bps(stats[tid].bin_width, buf)); break;
case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break;
};
fputc('\n', file_out);
// print x axis. Could be made pretty for widths that are not divisible
// by 10 by variable spacing of the labels, instead of placing a label every 10 characters
char buf2[50];
- fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10));
+ fprintf(file_out, " %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
int rest;
for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
- fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10));
+ fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
}
int last_padding = hist_size%10;
- fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10));
+ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
fprintf(file_out, "\n");
}
int main_coverage(int argc, char *argv[]) {
int status = EXIT_SUCCESS;
- int ret, tid, pos, i, j;
+ int ret, tid = -1, old_tid = -1, pos, i, j;
- int max_depth = 0;
+ int max_depth = 1000000;
int opt_min_baseQ = 0;
int opt_min_mapQ = 0;
int opt_min_len = 0;
bool opt_print_header = true;
bool opt_print_tabular = true;
bool opt_print_histogram = false;
- bool *covered_tids = NULL;
bool opt_full_utf = true;
FILE *file_out = samtools_stdout;
{"incl-flags", required_argument, NULL, 1}, // require flag
{"excl-flags", required_argument, NULL, 2}, // filter flag
{"bam-list", required_argument, NULL, 'b'},
- {"min-read-len", required_argument, NULL, 'L'},
+ {"min-read-len", required_argument, NULL, 'l'},
{"min-MQ", required_argument, NULL, 'q'},
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"n-bins", required_argument, NULL, 'w'},
{"region", required_argument, NULL, 'r'},
{"help", no_argument, NULL, 'h'},
+ {"depth", required_argument, NULL, 'd'},
{ NULL, 0, NULL, 0 }
};
// parse the command line
int c;
opterr = 0;
- while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) {
+ while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
switch (c) {
case 1:
if ((required_flags = bam_str2flag(optarg)) < 0) {
fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
}; break;
case 'o': opt_output_file = optarg; opt_full_width = false; break;
- case 'L': opt_min_len = atoi(optarg); break;
- case 'q': opt_min_baseQ = atoi(optarg); break;
- case 'Q': opt_min_mapQ = atoi(optarg); break;
+ case 'l': opt_min_len = atoi(optarg); break;
+ case 'q': opt_min_mapQ = atoi(optarg); break;
+ case 'Q': opt_min_baseQ = atoi(optarg); break;
+ case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
opt_print_histogram = true; opt_print_tabular = false;
break;
if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
}
-#else
+#elif defined TIOCGWINSZ
struct winsize w;
if (ioctl(2, TIOCGWINSZ, &w) == 0)
columns = w.ws_col;
data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
if (!data) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
int rf;
data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
if (!data[i]) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
// Set CRAM options on file handle - returns 0 on success
if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
- print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
+ print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
status = EXIT_FAILURE;
goto coverage_end;
}
if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
- print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
+ print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
status = EXIT_FAILURE;
goto coverage_end;
}
data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
hts_idx_destroy(idx); // the index is not needed any more; free the memory
if (data[i]->iter == NULL) {
- print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg);
+ print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
status = EXIT_FAILURE;
goto coverage_end;
}
h = data[0]->hdr; // easy access to the header of the 1st BAM
int n_targets = sam_hdr_nref(h);
- covered_tids = calloc(n_targets, sizeof(bool));
- stats = calloc(1, sizeof(stats_aux_t));
- if (!covered_tids || !stats) {
- print_error("coverage", "Failed to allocate memory");
+ stats = calloc(n_targets, sizeof(stats_aux_t));
+ if (!stats) {
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
int64_t n_bins = opt_n_bins;
if (opt_reg) {
- stats->tid = data[0]->iter->tid;
- stats->beg = data[0]->iter->beg; // and to the parsed region coordinates
- stats->end = data[0]->iter->end;
- if (stats->end == HTS_POS_MAX) {
- stats->end = sam_hdr_tid2len(h, stats->tid);
+ stats_aux_t *s = stats + data[0]->iter->tid;
+ s->beg = data[0]->iter->beg; // and to the parsed region coordinates
+ s->end = data[0]->iter->end;
+ if (s->end == HTS_POS_MAX) {
+ s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
}
- if (opt_n_bins > stats->end - stats->beg) {
- n_bins = stats->end - stats->beg;
+ if (opt_n_bins > s->end - s->beg) {
+ n_bins = s->end - s->beg;
}
- stats->bin_width = (stats->end-stats->beg) / n_bins;
- } else {
- stats->tid = -1;
+ s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
}
+ for (i=0; i<n_bam_files; i++)
+ data[i]->stats = stats;
+
int64_t current_bin = 0;
// the core multi-pileup loop
n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
if (!hist || !n_plp || !plp) {
- print_error("coverage", "Failed to allocate memory");
+ print_error_errno("coverage", "Failed to allocate memory");
status = EXIT_FAILURE;
goto coverage_end;
}
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
- if (tid != stats->tid) { // Next target sequence
- if (stats->tid >= 0) { // It's not the first sequence, print results
- set_read_counts(data, stats, n_bam_files);
+ if (tid != old_tid) { // Next target sequence
+ if (old_tid >= 0) {
if (opt_print_histogram) {
- print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+ print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
fputc('\n', file_out);
} else if (opt_print_tabular) {
- print_tabular_line(file_out, h, stats);
+ print_tabular_line(file_out, h, stats, old_tid);
}
- // reset data
- memset(stats, 0, sizeof(stats_aux_t));
if (opt_print_histogram)
memset(hist, 0, n_bins*sizeof(uint32_t));
}
- stats->tid = tid;
- covered_tids[tid] = true;
+ stats[tid].covered = true;
if (!opt_reg)
- stats->end = sam_hdr_tid2len(h, tid);
+ stats[tid].end = sam_hdr_tid2len(h, tid);
if (opt_print_histogram) {
- n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins;
- stats->bin_width = (stats->end-stats->beg) / n_bins;
+ n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
+ stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
}
+
+ old_tid = tid;
}
- if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip
+ if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
if (tid >= n_targets) continue; // diff number of @SQ lines per file?
if (opt_print_histogram) {
- current_bin = (pos - stats->beg) / stats->bin_width;
+ current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
}
bool count_base = false;
else if (p->qpos < p->b->core.l_qseq &&
bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
else
- stats->summed_baseQ += bam_get_qual(p->b)[p->qpos];
+ stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
}
if (depth_at_pos > 0) {
count_base = true;
- stats->summed_coverage += depth_at_pos;
+ stats[tid].summed_coverage += depth_at_pos;
}
// hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage
//fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
}
if (count_base) {
- ++(stats->n_covered_bases);
+ stats[tid].n_covered_bases++;
if (opt_print_histogram && current_bin < n_bins)
++(hist[current_bin]); // Histogram based on breadth of coverage
}
}
- if (stats->tid != -1) {
- set_read_counts(data, stats, n_bam_files);
+ if (tid == -1 && opt_reg && *opt_reg != '*')
+ // Region specified but no data covering it.
+ tid = data[0]->iter->tid;
+
+ if (tid < n_targets && tid >=0) {
if (opt_print_histogram) {
- print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+ print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
} else if (opt_print_tabular) {
- print_tabular_line(file_out, h, stats);
+ print_tabular_line(file_out, h, stats, tid);
}
}
if (!opt_reg && opt_print_tabular) {
- memset(stats, 0, sizeof(stats_aux_t));
for (i = 0; i < n_targets; ++i) {
- if (!covered_tids[i]) {
- stats->tid = i;
- stats->end = sam_hdr_tid2len(h, i);
- print_tabular_line(file_out, h, stats);
+ if (!stats[i].covered) {
+ stats[i].end = sam_hdr_tid2len(h, i);
+ print_tabular_line(file_out, h, stats, i);
}
}
}
coverage_end:
if (n_plp) free(n_plp);
if (plp) free(plp);
- bam_mplp_destroy(mplp);
+ if (mplp) bam_mplp_destroy(mplp);
- if (covered_tids) free(covered_tids);
if (hist) free(hist);
if (stats) free(stats);
-
// Close files and free data structures
if (!(file_out == samtools_stdout || fclose(file_out) == 0)) {
if (status == EXIT_SUCCESS) {
if (n > g->max_bases) { // enlarge g->bases
g->max_bases = n;
kroundup32(g->max_bases);
- g->bases = realloc(g->bases, g->max_bases * 2);
+ g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
}
for (i = k = 0; i < n; ++i) {
const bam_pileup1_t *p = plp + i;
int main_cut_target(int argc, char *argv[])
{
- int c, tid, pos, n, lasttid = -1, usage = 0;
+ int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
hts_pos_t l, max_l;
const bam_pileup1_t *p;
bam_plp_t plp;
cns[pos] = gencns(&g, n, p);
}
process_cns(g.h, lasttid, l, cns);
+
+ if (n < 0) {
+ print_error("targetcut", "error reading from \"%s\"", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
free(cns);
sam_hdr_destroy(g.h);
bam_plp_destroy(plp);
errmod_destroy(g.em);
free(g.bases);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
if (n > g->max_bases) { // enlarge g->bases
g->max_bases = n;
kroundup32(g->max_bases);
- g->bases = realloc(g->bases, g->max_bases * 2);
+ g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
}
for (i = k = 0; i < n; ++i) {
const bam_pileup1_t *p = plp + i;
int main_cut_target(int argc, char *argv[])
{
- int c, tid, pos, n, lasttid = -1, usage = 0;
+ int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
hts_pos_t l, max_l;
const bam_pileup1_t *p;
bam_plp_t plp;
cns[pos] = gencns(&g, n, p);
}
process_cns(g.h, lasttid, l, cns);
+
+ if (n < 0) {
+ print_error("targetcut", "error reading from \"%s\"", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
free(cns);
sam_hdr_destroy(g.h);
bam_plp_destroy(plp);
errmod_destroy(g.em);
free(g.bases);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
/* dict.c -- create a sequence dictionary file.
- Copyright (C) 2015 Genome Research Ltd.
+ Copyright (C) 2015, 2020 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
#include <config.h>
#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
{
char *output_fname, *fname;
char *assembly, *species, *uri;
- int header;
+ int alias, header;
}
args_t;
hts_md5_final(digest, md5);
hts_md5_hex(hex, digest);
fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->alias) {
+ const char *name = seq->name.s;
+ if (strncmp(name, "chr", 3) == 0) {
+ name += 3;
+ fprintf(out, "\tAN:%s", name);
+ }
+ else
+ fprintf(out, "\tAN:chr%s", name);
+
+ if (strcmp(name, "M") == 0)
+ fprintf(out, ",chrMT,MT");
+ else if (strcmp(name, "MT") == 0)
+ fprintf(out, ",chrM,M");
+ }
if (args->uri)
fprintf(out, "\tUR:%s", args->uri);
else if (strcmp(fn, "-") != 0) {
fprintf(stderr, "About: Create a sequence dictionary file from a fasta file\n");
fprintf(stderr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
fprintf(stderr, "Options: -a, --assembly STR assembly\n");
+ fprintf(stderr, " -A, --alias, --alternative-name\n");
+ fprintf(stderr, " add AN tag by adding/removing 'chr'\n");
fprintf(stderr, " -H, --no-header do not print @HD line\n");
- fprintf(stderr, " -o, --output STR file to write out dict file [stdout]\n");
+ fprintf(stderr, " -o, --output FILE file to write out dict file [stdout]\n");
fprintf(stderr, " -s, --species STR species\n");
fprintf(stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
fprintf(stderr, "\n");
{
{"help", no_argument, NULL, 'h'},
{"no-header", no_argument, NULL, 'H'},
+ {"alias", no_argument, NULL, 'A'},
+ {"alternative-name", no_argument, NULL, 'A'},
{"assembly", required_argument, NULL, 'a'},
{"species", required_argument, NULL, 's'},
{"uri", required_argument, NULL, 'u'},
{NULL, 0, NULL, 0}
};
int c;
- while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+ while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
{
switch (c)
{
+ case 'A': args->alias = 1; break;
case 'a': args->assembly = optarg; break;
case 's': args->species = optarg; break;
case 'u': args->uri = optarg; break;
/* dict.c -- create a sequence dictionary file.
- Copyright (C) 2015 Genome Research Ltd.
+ Copyright (C) 2015, 2020 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
#include <config.h>
#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
{
char *output_fname, *fname;
char *assembly, *species, *uri;
- int header;
+ int alias, header;
}
args_t;
fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) {
fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn);
- exit(1);
+ samtools_exit(1);
}
FILE *out = samtools_stdout;
if (args->output_fname) {
out = fopen(args->output_fname, "w");
if (out == NULL) {
fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
- exit(1);
+ samtools_exit(1);
}
}
if (!(md5 = hts_md5_init()))
- exit(1);
+ samtools_exit(1);
seq = kseq_init(fp);
if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
hts_md5_final(digest, md5);
hts_md5_hex(hex, digest);
fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->alias) {
+ const char *name = seq->name.s;
+ if (strncmp(name, "chr", 3) == 0) {
+ name += 3;
+ fprintf(out, "\tAN:%s", name);
+ }
+ else
+ fprintf(out, "\tAN:chr%s", name);
+
+ if (strcmp(name, "M") == 0)
+ fprintf(out, ",chrMT,MT");
+ else if (strcmp(name, "MT") == 0)
+ fprintf(out, ",chrM,M");
+ }
if (args->uri)
fprintf(out, "\tUR:%s", args->uri);
else if (strcmp(fn, "-") != 0) {
fprintf(samtools_stderr, "About: Create a sequence dictionary file from a fasta file\n");
fprintf(samtools_stderr, "Usage: samtools dict [options] <file.fa|file.fa.gz>\n\n");
fprintf(samtools_stderr, "Options: -a, --assembly STR assembly\n");
+ fprintf(samtools_stderr, " -A, --alias, --alternative-name\n");
+ fprintf(samtools_stderr, " add AN tag by adding/removing 'chr'\n");
fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n");
- fprintf(samtools_stderr, " -o, --output STR file to write out dict file [samtools_stdout]\n");
+ fprintf(samtools_stderr, " -o, --output FILE file to write out dict file [samtools_stdout]\n");
fprintf(samtools_stderr, " -s, --species STR species\n");
fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
fprintf(samtools_stderr, "\n");
{
{"help", no_argument, NULL, 'h'},
{"no-header", no_argument, NULL, 'H'},
+ {"alias", no_argument, NULL, 'A'},
+ {"alternative-name", no_argument, NULL, 'A'},
{"assembly", required_argument, NULL, 'a'},
{"species", required_argument, NULL, 's'},
{"uri", required_argument, NULL, 'u'},
{NULL, 0, NULL, 0}
};
int c;
- while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+ while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
{
switch (c)
{
+ case 'A': args->alias = 1; break;
case 'a': args->assembly = optarg; break;
case 's': args->species = optarg; break;
case 'u': args->uri = optarg; break;
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
static int usage(FILE *fp, enum fai_format_options format, int exit_status)
{
- char *tool, *file_type;
+ char *tool, *file_type, *index_name;
if (format == FAI_FASTA) {
tool = "faidx <file.fa|file.fa.gz>";
file_type = "FASTA";
+ index_name = "file.fa";
} else {
tool = "fqidx <file.fq|file.fq.gz>";
file_type = "FASTQ";
+ index_name = "file.fq";
}
fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
" TYPE = rc for /rc on negative strand (default)\n"
" no for no strand indicator\n"
" sign for (+) / (-)\n"
- " custom,<pos>,<neg> for custom indicator\n",
- file_type, file_type);
+ " custom,<pos>,<neg> for custom indicator\n"
+ " --fai-idx FILE name of the index file (default %s.fai).\n"
+ " --gzi-idx FILE name of compressed file index (default %s.gz.gzi).\n",
+ file_type, file_type, index_name, index_name);
if (format == FAI_FASTA) {
char *pos_strand_name = ""; // Extension to add to name for +ve strand
char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand
char *strand_names = NULL; // Used for custom strand annotation
+ char *fai_name = NULL; // specified index name
+ char *gzi_name = NULL; // specified compressed index name
FILE* file_out = stdout;/* output stream */
static const struct option lopts[] = {
{ "fastq", no_argument, NULL, 'f' },
{ "reverse-complement", no_argument, NULL, 'i' },
{ "mark-strand", required_argument, NULL, 1000 },
+ { "fai-idx", required_argument, NULL, 1001 },
+ { "gzi-idx", required_argument, NULL, 1002 },
{ NULL, 0, NULL, 0 }
};
return usage(stderr, format, EXIT_FAILURE);
}
break;
+ case 1001: fai_name = optarg; break;
+ case 1002: gzi_name = optarg; break;
default: break;
}
}
if ( argc==optind )
return usage(stdout, format, EXIT_SUCCESS);
- if ( optind+1 == argc && !region_file)
- {
- if (fai_build(argv[optind]) != 0) {
- fprintf(stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]);
+ if (optind+1 == argc && !region_file) {
+ if (output_file && !fai_name)
+ fai_name = output_file;
+
+ if (fai_build3(argv[optind], fai_name, gzi_name) != 0) {
+ if (fai_name)
+ fprintf(stderr, "[faidx] Could not build fai index %s", fai_name);
+ else
+ fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+ if (gzi_name)
+ fprintf(stderr, " or compressed index %s\n", gzi_name);
+ else
+ fprintf(stderr, "\n");
+
return EXIT_FAILURE;
}
+
return 0;
}
- faidx_t *fai = fai_load_format(argv[optind], format);
+ faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+
+ if (!fai) {
+ if (fai_name)
+ fprintf(stderr, "[faidx] Could not load fai index %s", fai_name);
+ else
+ fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+ if (gzi_name)
+ fprintf(stderr, " or compressed index %s\n", gzi_name);
+ else
+ fprintf(stderr, "\n");
- if ( !fai ) {
- fprintf(stderr, "[faidx] Could not load fai index of %s\n", argv[optind]);
return EXIT_FAILURE;
}
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
static int usage(FILE *fp, enum fai_format_options format, int exit_status)
{
- char *tool, *file_type;
+ char *tool, *file_type, *index_name;
if (format == FAI_FASTA) {
tool = "faidx <file.fa|file.fa.gz>";
file_type = "FASTA";
+ index_name = "file.fa";
} else {
tool = "fqidx <file.fq|file.fq.gz>";
file_type = "FASTQ";
+ index_name = "file.fq";
}
fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
" TYPE = rc for /rc on negative strand (default)\n"
" no for no strand indicator\n"
" sign for (+) / (-)\n"
- " custom,<pos>,<neg> for custom indicator\n",
- file_type, file_type);
+ " custom,<pos>,<neg> for custom indicator\n"
+ " --fai-idx FILE name of the index file (default %s.fai).\n"
+ " --gzi-idx FILE name of compressed file index (default %s.gz.gzi).\n",
+ file_type, file_type, index_name, index_name);
if (format == FAI_FASTA) {
char *pos_strand_name = ""; // Extension to add to name for +ve strand
char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand
char *strand_names = NULL; // Used for custom strand annotation
+ char *fai_name = NULL; // specified index name
+ char *gzi_name = NULL; // specified compressed index name
FILE* file_out = samtools_stdout;/* output stream */
static const struct option lopts[] = {
{ "fastq", no_argument, NULL, 'f' },
{ "reverse-complement", no_argument, NULL, 'i' },
{ "mark-strand", required_argument, NULL, 1000 },
+ { "fai-idx", required_argument, NULL, 1001 },
+ { "gzi-idx", required_argument, NULL, 1002 },
{ NULL, 0, NULL, 0 }
};
return usage(samtools_stderr, format, EXIT_FAILURE);
}
break;
+ case 1001: fai_name = optarg; break;
+ case 1002: gzi_name = optarg; break;
default: break;
}
}
if ( argc==optind )
return usage(samtools_stdout, format, EXIT_SUCCESS);
- if ( optind+1 == argc && !region_file)
- {
- if (fai_build(argv[optind]) != 0) {
- fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]);
+ if (optind+1 == argc && !region_file) {
+ if (output_file && !fai_name)
+ fai_name = output_file;
+
+ if (fai_build3(argv[optind], fai_name, gzi_name) != 0) {
+ if (fai_name)
+ fprintf(samtools_stderr, "[faidx] Could not build fai index %s", fai_name);
+ else
+ fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+ if (gzi_name)
+ fprintf(samtools_stderr, " or compressed index %s\n", gzi_name);
+ else
+ fprintf(samtools_stderr, "\n");
+
return EXIT_FAILURE;
}
+
return 0;
}
- faidx_t *fai = fai_load_format(argv[optind], format);
+ faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+
+ if (!fai) {
+ if (fai_name)
+ fprintf(samtools_stderr, "[faidx] Could not load fai index %s", fai_name);
+ else
+ fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+ if (gzi_name)
+ fprintf(samtools_stderr, " or compressed index %s\n", gzi_name);
+ else
+ fprintf(samtools_stderr, "\n");
- if ( !fai ) {
- fprintf(samtools_stderr, "[faidx] Could not load fai index of %s\n", argv[optind]);
return EXIT_FAILURE;
}
+++ /dev/null
-[Files in this distribution outwith the cram/ subdirectory are distributed
-according to the terms of the following MIT/Expat license.]
-
-The MIT/Expat License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
-
-[Files within the cram/ subdirectory in this distribution are distributed
-according to the terms of the following Modified 3-Clause BSD license.]
-
-The Modified-BSD License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
- nor the names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-[The use of a range of years within a copyright notice in this distribution
-should be interpreted as being equivalent to a list of years including the
-first and last year specified and all consecutive years between them.
-
-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
-2011-2012" should be interpreted as being identical to a notice that reads
-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
-that reads "Copyright (C) 2005-2012" should be interpreted as being identical
-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
-2011, 2012".]
+++ /dev/null
-HTSlib is an implementation of a unified C library for accessing common file
-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
-data. It is the core library used by samtools and bcftools.
-
-See INSTALL for building and installation instructions.
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014-2016, 2019 Genome Research Ltd.
+ Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
-// The one and only function needed from sam.c.
-// Explicitly here to avoid including bam.h translation layer.
-extern char *samfaipath(const char *fn_ref);
-
-static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar) // Overwrite b's CIGAR with the n ops in cigar; returns 0, or -1 after printing an error
{
+ int diff = 0; // signed byte change in b->l_data; each CIGAR op is copied as 4 bytes
if (n != b->core.n_cigar) {
int o = b->core.l_qname + b->core.n_cigar * 4;
- if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
- b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
+ if (n > b->core.n_cigar) { // CIGAR grows: check the size limit and the buffer capacity first
+ diff = (n - b->core.n_cigar) * 4;
+ if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) { // refuse if the extra bytes would take l_data past INT_MAX
+ fprintf(stderr, "[depad] ERROR: BAM record too big\n");
+ return -1;
+ }
+ if (b->l_data + diff > b->m_data) { // buffer too small: enlarge it
+ b->m_data = b->l_data + diff;
+ kroundup32(b->m_data);
+ uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data); // go through tmp so b->data keeps the old pointer if realloc fails
+ if (!tmp) {
+ fprintf(stderr, "[depad] ERROR: Memory allocation failure.\n");
+ return -1;
+ }
+ b->data = tmp;
+ }
+ } else { // CIGAR shrinks (n < n_cigar here, since n != n_cigar): diff is negative
+ diff = -(int)((b->core.n_cigar - n) * 4);
}
memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
- memcpy(b->data + b->core.l_qname, cigar, n * 4);
- b->l_data += (n - b->core.n_cigar) * 4;
b->core.n_cigar = n;
- } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
+ }
+
+ memcpy(b->data + b->core.l_qname, cigar, n * 4); // write the new ops at the CIGAR position, right after the l_qname bytes
+ b->l_data += diff; // apply the size change computed above (0 when the op count was unchanged)
+
+ return 0;
}
#define write_cigar(_c, _n, _m, _v) do { \
kstring_t r, q;
int r_tid = -1;
uint32_t *cigar2 = 0;
- int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+ int ret = 0, *posmap = 0;
+ uint32_t n2 = 0, m2 = 0;
b = bam_init1();
if (!b) {
}
}
write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
- replace_cigar(b, n2, cigar2);
+ if (replace_cigar(b, n2, cigar2) < 0)
+ return -1;
posmap = update_posmap(posmap, r);
} else if (b->core.n_cigar > 0) {
int i, k, op;
for (i = k = 0; i < n2; ++i)
if (cigar2[i]) cigar2[k++] = cigar2[i];
n2 = k;
- replace_cigar(b, n2, cigar2);
+ if (replace_cigar(b, n2, cigar2) < 0)
+ return -1;
}
/* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
sam_hdr_t *h = 0, *h_fix = 0;
faidx_t *fai = 0;
int c, compress_level = -1, is_long_help = 0, no_pg = 0;
- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL;
+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL;
int ret=0;
char *arg_list = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
// Load FASTA reference (also needed for SAM -> BAM if missing header)
if (ga.reference) {
- fn_list = samfaipath(ga.reference);
- fai = fai_load(ga.reference);
+ fn_fai = fai_path(ga.reference);
+ fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE);
}
// open file handlers
if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
ret = 1;
goto depad_end;
}
- if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
+ if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) {
+ fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai);
ret = 1;
goto depad_end;
}
fprintf(stderr, "[depad] error on closing output file.\n");
ret = 1;
}
- free(fn_list); free(fn_out);
+ free(fn_fai); free(fn_out);
if (fn_out_idx)
free(fn_out_idx);
sam_global_args_free(&ga);
/* padding.c -- depad subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2014-2016, 2019 Genome Research Ltd.
+ Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
-// The one and only function needed from sam.c.
-// Explicitly here to avoid including bam.h translation layer.
-extern char *samfaipath(const char *fn_ref);
-
-static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar) // Overwrite b's CIGAR with the n ops in cigar; returns 0, or -1 after printing an error
{
+ int diff = 0; // signed byte change in b->l_data; each CIGAR op is copied as 4 bytes
if (n != b->core.n_cigar) {
int o = b->core.l_qname + b->core.n_cigar * 4;
- if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
- b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
- kroundup32(b->m_data);
- b->data = (uint8_t*)realloc(b->data, b->m_data);
+ if (n > b->core.n_cigar) { // CIGAR grows: check the size limit and the buffer capacity first
+ diff = (n - b->core.n_cigar) * 4;
+ if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) { // refuse if the extra bytes would take l_data past INT_MAX
+ fprintf(samtools_stderr, "[depad] ERROR: BAM record too big\n");
+ return -1;
+ }
+ if (b->l_data + diff > b->m_data) { // buffer too small: enlarge it
+ b->m_data = b->l_data + diff;
+ kroundup32(b->m_data);
+ uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data); // go through tmp so b->data keeps the old pointer if realloc fails
+ if (!tmp) {
+ fprintf(samtools_stderr, "[depad] ERROR: Memory allocation failure.\n");
+ return -1;
+ }
+ b->data = tmp;
+ }
+ } else { // CIGAR shrinks (n < n_cigar here, since n != n_cigar): diff is negative
+ diff = -(int)((b->core.n_cigar - n) * 4);
}
memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
- memcpy(b->data + b->core.l_qname, cigar, n * 4);
- b->l_data += (n - b->core.n_cigar) * 4;
b->core.n_cigar = n;
- } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
+ }
+
+ memcpy(b->data + b->core.l_qname, cigar, n * 4); // write the new ops at the CIGAR position, right after the l_qname bytes
+ b->l_data += diff; // apply the size change computed above (0 when the op count was unchanged)
+
+ return 0;
}
#define write_cigar(_c, _n, _m, _v) do { \
kstring_t r, q;
int r_tid = -1;
uint32_t *cigar2 = 0;
- int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+ int ret = 0, *posmap = 0;
+ uint32_t n2 = 0, m2 = 0;
b = bam_init1();
if (!b) {
}
}
write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
- replace_cigar(b, n2, cigar2);
+ if (replace_cigar(b, n2, cigar2) < 0)
+ return -1;
posmap = update_posmap(posmap, r);
} else if (b->core.n_cigar > 0) {
int i, k, op;
for (i = k = 0; i < n2; ++i)
if (cigar2[i]) cigar2[k++] = cigar2[i];
n2 = k;
- replace_cigar(b, n2, cigar2);
+ if (replace_cigar(b, n2, cigar2) < 0)
+ return -1;
}
/* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
sam_hdr_t *h = 0, *h_fix = 0;
faidx_t *fai = 0;
int c, compress_level = -1, is_long_help = 0, no_pg = 0;
- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL;
+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL;
int ret=0;
char *arg_list = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
// Load FASTA reference (also needed for SAM -> BAM if missing header)
if (ga.reference) {
- fn_list = samfaipath(ga.reference);
- fai = fai_load(ga.reference);
+ fn_fai = fai_path(ga.reference);
+ fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE);
}
// open file handlers
if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
ret = 1;
goto depad_end;
}
- if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
+ if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) {
+ fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai);
ret = 1;
goto depad_end;
}
fprintf(samtools_stderr, "[depad] error on closing output file.\n");
ret = 1;
}
- free(fn_list); free(fn_out);
+ free(fn_fai); free(fn_out);
if (fn_out_idx)
free(fn_out_idx);
sam_global_args_free(&ga);
int main_phase(int argc, char *argv[])
{
int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
+ int status = EXIT_SUCCESS;
const bam_pileup1_t *plp;
bam_plp_t iter;
nseq_t *seqs;
return 1;
}
}
+
+ if (n < 0) {
+ print_error("phase", "error reading from '%s'", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
sam_hdr_destroy(g.fp_hdr);
bam_plp_destroy(iter);
sam_close(g.fp);
}
free(g.arg_list);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
int main_phase(int argc, char *argv[])
{
int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
+ int status = EXIT_SUCCESS;
const bam_pileup1_t *plp;
bam_plp_t iter;
nseq_t *seqs;
return 1;
}
}
+
+ if (n < 0) {
+ print_error("phase", "error reading from '%s'", argv[optind]);
+ status = EXIT_FAILURE;
+ }
+
sam_hdr_destroy(g.fp_hdr);
bam_plp_destroy(iter);
sam_close(g.fp);
}
free(g.arg_list);
sam_global_args_free(&ga);
- return 0;
+ return status;
}
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2019 Genome Research Ltd.
+ Copyright (C) 2009-2021 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "htslib/faidx.h"
#include "htslib/khash.h"
#include "htslib/thread_pool.h"
+#include "htslib/hts_expr.h"
#include "samtools.h"
#include "sam_opts.h"
#include "bedidx.h"
-KHASH_SET_INIT_STR(rg)
-KHASH_SET_INIT_STR(tv)
+KHASH_SET_INIT_STR(str)
-typedef khash_t(rg) *rghash_t;
-typedef khash_t(tv) *tvhash_t;
+typedef khash_t(str) *strhash_t;
// This structure contains the settings for a samview run
typedef struct samview_settings {
- rghash_t rghash;
- tvhash_t tvhash;
+ strhash_t rghash;
+ strhash_t rnhash;
+ strhash_t tvhash;
int min_mapQ;
int flag_on;
int flag_off;
char** remove_aux;
int multi_region;
char* tag;
+ hts_filter_t *filter;
+ int remove_flag;
+ int add_flag;
} samview_settings_t;
// TODO Add declarations of these to a viable htslib or samtools header
extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
extern int bam_remove_B(bam1_t *b);
-extern char *samfaipath(const char *fn_ref);
// Returns 0 to indicate read should be output 1 otherwise
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
if (settings->rghash) {
uint8_t *s = bam_aux_get(b, "RG");
if (s) {
- khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1));
+ khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
if (k == kh_end(settings->rghash)) return 1;
}
}
- if (settings->tvhash && settings->tag) {
+ if (settings->tag) {
uint8_t *s = bam_aux_get(b, settings->tag);
if (s) {
- khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1));
- if (k == kh_end(settings->tvhash)) return 1;
+ if (settings->tvhash) {
+ char t[32], *val;
+ if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') {
+ int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s));
+ if (ret > 0) val = t;
+ else return 1;
+ } else if (*s == 'A') {
+ t[0] = *(s+1);
+ t[1] = 0;
+ val = t;
+ } else {
+ val = (char *)(s+1);
+ }
+ khint_t k = kh_get(str, settings->tvhash, val);
+ if (k == kh_end(settings->tvhash)) return 1;
+ }
} else {
return 1;
}
}
+ if (settings->rnhash) {
+ const char* rn = bam_get_qname(b);
+ if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+ return 1;
+ }
+ }
if (settings->library) {
const char *p = bam_get_library((sam_hdr_t*)h, b);
if (!p || strcmp(p, settings->library) != 0) return 1;
}
}
}
+
+ if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+ return 1;
+
return 0;
}
static int usage(FILE *fp, int exit_status, int is_long_help);
+/*
+ * Read whitespace-separated tokens from the file `fn` and insert a copy of
+ * each one into the string set `lookup`.  The scan format limits a token to
+ * 1023 characters; a longer run of non-whitespace is read as several tokens.
+ *
+ * Returns 0 on success.  Returns -1, after reporting the problem under the
+ * `subcmd` label, if the file cannot be opened, a read error occurs, or a
+ * key cannot be duplicated or inserted.
+ */
+static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn)
+{
+    FILE *fp;
+    char buf[1024];
+    int ret = 0;
+    fp = fopen(fn, "r");
+    if (fp == NULL) {
+        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
+        char *d = strdup(buf);
+        if (d != NULL) {
+            kh_put(str, lookup, d, &ret);
+            // The set only keeps d when a new key was added (ret > 0).
+            // Free it for a duplicate (0) and also when the insert itself
+            // failed (-1); otherwise the copy leaks on the failure path.
+            if (ret <= 0) free(d);
+        } else {
+            ret = -1;
+        }
+    }
+    if (ferror(fp)) ret = -1;
+    if (ret == -1) {
+        print_error_errno(subcmd, "failed to read \"%s\"", fn);
+    }
+    fclose(fp);
+    return (ret != -1) ? 0 : -1;
+}
+
static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
{
char *d = strdup(name);
if (d == NULL) goto err;
if (settings->rghash == NULL) {
- settings->rghash = kh_init(rg);
+ settings->rghash = kh_init(str);
if (settings->rghash == NULL) goto err;
}
- kh_put(rg, settings->rghash, d, &ret);
+ kh_put(str, settings->rghash, d, &ret);
if (ret == -1) goto err;
if (ret == 0) free(d); /* Duplicate */
return 0;
return -1;
}
-static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
- FILE *fp;
- char buf[1024];
- int ret = 0;
- if (settings->rghash == NULL) {
- settings->rghash = kh_init(rg);
- if (settings->rghash == NULL) {
+ if (settings->rnhash == NULL) {
+ settings->rnhash = kh_init(str);
+ if (settings->rnhash == NULL) {
perror(NULL);
return -1;
}
}
+ return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+}
- fp = fopen(fn, "r");
- if (fp == NULL) {
- print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
- return -1;
- }
-
- while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
- char *d = strdup(buf);
- if (d != NULL) {
- kh_put(rg, settings->rghash, d, &ret);
- if (ret == 0) free(d); /* Duplicate */
- } else {
- ret = -1;
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+{
+ if (settings->rghash == NULL) {
+ settings->rghash = kh_init(str);
+ if (settings->rghash == NULL) {
+ perror(NULL);
+ return -1;
}
}
- if (ferror(fp)) ret = -1;
- if (ret == -1) {
- print_error_errno(subcmd, "failed to read \"%s\"", fn);
- }
- fclose(fp);
- return (ret != -1) ? 0 : -1;
+ return populate_lookup_from_file(subcmd, settings->rghash, fn);
}
static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
if (d == NULL) goto err;
if (settings->tvhash == NULL) {
- settings->tvhash = kh_init(tv);
+ settings->tvhash = kh_init(str);
if (settings->tvhash == NULL) goto err;
}
- kh_put(tv, settings->tvhash, d, &ret);
+ kh_put(str, settings->tvhash, d, &ret);
if (ret == -1) goto err;
if (ret == 0) free(d); /* Duplicate */
return 0;
static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
- FILE *fp;
- char buf[1024];
- int ret = 0;
if (settings->tvhash == NULL) {
- settings->tvhash = kh_init(tv);
+ settings->tvhash = kh_init(str);
if (settings->tvhash == NULL) {
perror(NULL);
return -1;
}
}
-
- fp = fopen(fn, "r");
- if (fp == NULL) {
- print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
- return -1;
- }
-
- while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
- char *d = strdup(buf);
- if (d != NULL) {
- kh_put(tv, settings->tvhash, d, &ret);
- if (ret == 0) free(d); /* Duplicate */
- } else {
- ret = -1;
- }
- }
- if (ferror(fp)) ret = -1;
- if (ret == -1) {
- print_error_errno(subcmd, "failed to read \"%s\"", fn);
- }
- fclose(fp);
- return (ret != -1) ? 0 : -1;
+ return populate_lookup_from_file(subcmd, settings->tvhash, fn);
}
static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp)
return r;
}
+// Apply the add_flag / remove_flag masks from the view settings to a record's
+// FLAG field.  Bits are set first and cleared second, so a bit named in both
+// masks ends up cleared.
+static inline void change_flag(bam1_t *b, samview_settings_t *settings)
+{
+    // A zero mask is a no-op for both steps (x | 0 == x, x & ~0 == x), so
+    // the set and clear can be written as one unconditional expression.
+    b->core.flag = (b->core.flag | settings->add_flag) & ~settings->remove_flag;
+}
+
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c) ((c) + 128)
+
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0;
samFile *in = 0, *out = 0, *un_out=0;
FILE *fp_out = NULL;
sam_hdr_t *header = NULL;
- char out_mode[5], out_un_mode[5], *out_format = "";
- char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+ char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = "";
+ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0;
char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
.library = NULL,
.bed = NULL,
.multi_region = 0,
- .tag = NULL
+ .tag = NULL,
+ .filter = NULL,
+ .remove_flag = 0,
+ .add_flag = 0
};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
- {"no-PG", no_argument, NULL, 1},
+ {"add-flags", required_argument, NULL, LONGOPT('a')},
+ {"bam", no_argument, NULL, 'b'},
+ {"count", no_argument, NULL, 'c'},
+ {"cram", no_argument, NULL, 'C'},
+ {"customised-index", no_argument, NULL, 'X'},
+ {"customized-index", no_argument, NULL, 'X'},
+ {"excl-flags", required_argument, NULL, 'F'},
+ {"exclude-flags", required_argument, NULL, 'F'},
+ {"expr", required_argument, NULL, 'e'},
+ {"expression", required_argument, NULL, 'e'},
+ {"fai-reference", required_argument, NULL, 't'},
+ {"fast", no_argument, NULL, '1'},
+ {"header-only", no_argument, NULL, 'H'},
+ {"help", no_argument, NULL, LONGOPT('?')},
+ {"library", required_argument, NULL, 'l'},
+ {"min-mapq", required_argument, NULL, 'q'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-qlen", required_argument, NULL, 'm'},
+ {"no-header", no_argument, NULL, LONGOPT('H')},
+ {"no-PG", no_argument, NULL, LONGOPT('P')},
+ {"output", required_argument, NULL, 'o'},
+ {"output-unselected", required_argument, NULL, 'U'},
+ {"QNAME-file", required_argument, NULL, 'N'},
+ {"qname-file", required_argument, NULL, 'N'},
+ {"read-group", required_argument, NULL, 'r'},
+ {"read-group-file", required_argument, NULL, 'R'},
+ {"readgroup", required_argument, NULL, 'r'},
+ {"readgroup-file", required_argument, NULL, 'R'},
+ {"region-file", required_argument, NULL, LONGOPT('L')},
+ {"regions-file", required_argument, NULL, LONGOPT('L')},
+ {"remove-B", no_argument, NULL, 'B'},
+ {"remove-flags", required_argument, NULL, LONGOPT('r')},
+ {"remove-tag", required_argument, NULL, 'x'},
+ {"require-flags", required_argument, NULL, 'f'},
+ {"subsample", required_argument, NULL, LONGOPT('s')},
+ {"subsample-seed", required_argument, NULL, LONGOPT('S')},
+ {"tag", required_argument, NULL, 'd'},
+ {"tag-file", required_argument, NULL, 'D'},
+ {"target-file", required_argument, NULL, 'L'},
+ {"targets-file", required_argument, NULL, 'L'},
+ {"uncompressed", no_argument, NULL, 'u'},
+ {"unoutput", required_argument, NULL, 'U'},
+ {"use-index", no_argument, NULL, 'M'},
+ {"with-header", no_argument, NULL, 'h'},
{ NULL, 0, NULL, 0 }
};
opterr = 0;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
- if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
- // Convert likely user input 0,1,2,... to pseudo-random
- // values with more entropy and more bits set
- srand(settings.subsam_seed);
- settings.subsam_seed = rand();
- }
+ settings.subsam_seed = strtol(optarg, &q, 10);
if (q && *q == '.') {
settings.subsam_frac = strtod(q, &q);
if (*q) ret = 1;
goto view_end;
}
break;
+ case LONGOPT('s'):
+ settings.subsam_frac = strtod(optarg, &q);
+ if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) {
+ print_error("view", "Incorrect sampling argument \"%s\"", optarg);
+ goto view_end;
+ }
+ break;
+ case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
case 'm': settings.min_qlen = atoi(optarg); break;
case 'c': is_count = 1; break;
case 'S': break;
case 'b': out_format = "b"; break;
case 'C': out_format = "c"; break;
- case 't': fn_list = strdup(optarg); break;
+ case 't': fn_fai = strdup(optarg); break;
case 'h': is_header = 1; break;
case 'H': is_header_only = 1; break;
+ case LONGOPT('H'): is_header = is_header_only = 0; break;
case 'o': fn_out = strdup(optarg); break;
case 'U': fn_un_out = strdup(optarg); break;
case 'X': has_index_file = 1; break;
- case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
- case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
- case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
+ case 'f': settings.flag_on |= bam_str2flag(optarg); break;
+ case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
case 'l': settings.library = strdup(optarg); break;
+ case LONGOPT('L'):
+ settings.multi_region = 1;
+ // fall through
case 'L':
if ((settings.bed = bed_read(optarg)) == NULL) {
print_error_errno("view", "Could not read file \"%s\"", optarg);
goto view_end;
}
break;
+ case 'N':
+ if (add_read_names_file("view", &settings, optarg) != 0) {
+ ret = 1;
+ goto view_end;
+ }
+ break;
case 'd':
- if (strlen(optarg) < 4 || optarg[2] != ':') {
+ if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
memcpy(settings.tag, optarg, 2);
}
- if (add_tag_value_single("view", &settings, optarg+3) != 0) {
+ if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) {
+ print_error("view", "Could not add tag:value \"%s\"", optarg);
ret = 1;
goto view_end;
}
case 'D':
// Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
// path translation as described at:
- // http://www.mingw.org/wiki/Posix_path_conversion
+ // http://www.mingw.org/wiki/Posix_path_conversion
if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
ret = 1;
//case 'x': out_format = "x"; break;
//case 'X': out_format = "X"; break;
*/
+ case LONGOPT('?'):
+ return usage(stdout, EXIT_SUCCESS, 1);
case '?':
if (optopt == '?') { // '-?' appeared on command line
return usage(stdout, EXIT_SUCCESS, 1);
case 'x':
{
if (strlen(optarg) != 2) {
- fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
+ print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
return usage(stderr, EXIT_FAILURE, 0);
}
settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
}
break;
case 'M': settings.multi_region = 1; break;
- case 1: no_pg = 1; break;
+ case LONGOPT('P'): no_pg = 1; break;
+ case 'e':
+ if (!(settings.filter = hts_filter_init(optarg))) {
+ print_error("main_samview", "Couldn't initialise filter");
+ return 1;
+ }
+ break;
+ case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
+ case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
return usage(stderr, EXIT_FAILURE, 0);
break;
}
}
+ if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference);
if (compress_level >= 0 && !*out_format) out_format = "b";
if (is_header_only) is_header = 1;
// File format auto-detection first
// Overridden by manual -b, -C
if (*out_format)
out_mode[1] = out_un_mode[1] = *out_format;
- out_mode[2] = out_un_mode[2] = '\0';
- // out_(un_)mode now 1 or 2 bytes long, followed by nul.
+ // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul.
if (compress_level >= 0) {
char tmp[2];
tmp[0] = compress_level + '0'; tmp[1] = '\0';
print_error("view", "No input provided or missing option argument.");
return usage(stderr, EXIT_FAILURE, 0); // potential memory leak...
}
+ if (settings.subsam_seed != 0) {
+ // Convert likely user input 1,2,... to pseudo-random
+ // values with more entropy and more bits set
+ srand(settings.subsam_seed);
+ settings.subsam_seed = rand();
+ }
fn_in = (optind < argc)? argv[optind] : "-";
- // generate the fn_list if necessary
- if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
- // open file handlers
if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(in, fn_fai) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(out, fn_list) != 0) {
- fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(out, fn_fai) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(un_out, fn_list) != 0) {
- fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(un_out, fn_fai) != 0) {
+ fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
// fetch alignments
while ((result = sam_itr_multi_next(in, iter, b)) >= 0) {
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file
bam1_t *b = bam_init1();
int r;
+ errno = 0;
while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
if (r < -1) {
- fprintf(stderr, "[main_samview] truncated file.\n");
+ print_error_errno("view", "error reading file \"%s\"", fn_in);
ret = 1;
}
bam_destroy1(b);
// fetch alignments
while ((result = sam_itr_next(in, iter, b)) >= 0) {
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
if (fp_out) fclose(fp_out);
- free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
+ free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
if ( header ) sam_hdr_destroy(header);
if (settings.bed) bed_destroy(settings.bed);
khint_t k;
for (k = 0; k < kh_end(settings.rghash); ++k)
if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k));
- kh_destroy(rg, settings.rghash);
+ kh_destroy(str, settings.rghash);
+ }
+ if (settings.rnhash) {
+ khint_t k;
+ for (k = 0; k < kh_end(settings.rnhash); ++k)
+ if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k));
+ kh_destroy(str, settings.rnhash);
}
if (settings.tvhash) {
khint_t k;
for (k = 0; k < kh_end(settings.tvhash); ++k)
if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
- kh_destroy(tv, settings.tvhash);
+ kh_destroy(str, settings.tvhash);
}
if (settings.remove_aux_len) {
free(settings.remove_aux);
if (settings.tag) {
free(settings.tag);
}
+ if (settings.filter)
+ hts_filter_free(settings.filter);
if (p.pool)
hts_tpool_destroy(p.pool);
"\n"
"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
"\n"
-"Options:\n"
-// output options
-" -b output BAM\n"
-" -C output CRAM (requires -T)\n"
-" -1 use fast BAM compression (implies -b)\n"
-" -u uncompressed BAM output (implies -b)\n"
-" -h include header in SAM output\n"
-" -H print SAM header only (no alignments)\n"
-" -c print only the count of matching records\n"
-" -o FILE output file name [stdout]\n"
-" -U FILE output reads not selected by filters to FILE [null]\n"
-// extra input
-" -t FILE FILE listing reference names and lengths (see long help) [null]\n"
-" -X include customized index file\n"
-// read filters
-" -L FILE only include reads overlapping this BED FILE [null]\n"
-" -r STR only include reads in read group STR [null]\n"
-" -R FILE only include reads with read group listed in FILE [null]\n"
-" -d STR:STR\n"
-" only include reads with tag STR and associated value STR [null]\n"
-" -D STR:FILE\n"
-" only include reads with tag STR and associated values listed in\n"
-" FILE [null]\n"
-" -q INT only include reads with mapping quality >= INT [0]\n"
-" -l STR only include reads in library STR [null]\n"
-" -m INT only include reads with number of CIGAR operations consuming\n"
-" query sequence >= INT [0]\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
-" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
-" fraction of templates/read pairs to keep; INT part sets seed)\n"
-" -M use the multi-region iterator (increases the speed, removes\n"
-" duplicates and outputs the reads as they are ordered in the file)\n"
-// read processing
-" -x STR read tag to strip (repeatable) [null]\n"
-" -B collapse the backward CIGAR operation\n"
-// general options
-" -? print long help, including note about region specification\n"
-" -S ignored (input format is auto-detected)\n"
-" --no-PG do not add a PG line\n");
+"Output options:\n"
+" -b, --bam Output BAM\n"
+" -C, --cram Output CRAM (requires -T)\n"
+" -1, --fast Use fast BAM compression (implies --bam)\n"
+" -u, --uncompressed Uncompressed BAM output (implies --bam)\n"
+" -h, --with-header Include header in SAM output\n"
+" -H, --header-only Print SAM header only (no alignments)\n"
+" --no-header Print SAM alignment records only [default]\n"
+" -c, --count Print only the count of matching records\n"
+" -o, --output FILE Write output to FILE [standard output]\n"
+" -U, --unoutput FILE, --output-unselected FILE\n"
+" Output reads not selected by filters to FILE\n"
+"Input options:\n"
+" -t, --fai-reference FILE FILE listing reference names and lengths\n"
+" -M, --use-index Use index and multi-region iterator for regions\n"
+" --region[s]-file FILE Use index to include only reads overlapping FILE\n"
+" -X, --customized-index Expect extra index file argument after <in.bam>\n"
+"\n"
+"Filtering options (Only include in output reads that...):\n"
+" -L, --target[s]-file FILE ...overlap (BED) regions in FILE\n"
+" -r, --read-group STR ...are in read group STR\n"
+" -R, --read-group-file FILE ...are in a read group listed in FILE\n"
+" -N, --qname-file FILE ...whose read name is listed in FILE\n"
+" -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n"
+" -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n"
+" -q, --min-MQ INT ...have mapping quality >= INT\n"
+" -l, --library STR ...are in library STR\n"
+" -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR)\n"
+" -e, --expr STR ...match the filter expression STR\n"
+" -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x
+" -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0
+" -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option
+" --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n"
+" --subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n"
+" -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT\n"
+"\n"
+"Processing options:\n"
+" --add-flags FLAG Add FLAGs to reads\n"
+" --remove-flags FLAG Remove FLAGs from reads\n"
+" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n"
+" -B, --remove-B Collapse the backward CIGAR operation\n"
+"\n"
+"General options:\n"
+" -?, --help Print long help, including note about region specification\n"
+" -S Ignored (input format is auto-detected)\n"
+" --no-PG Do not add a PG line\n");
sam_global_opt_help(fp, "-.O.T@..");
fprintf(fp, "\n");
"\n"
"6. Option `-u' is preferred over `-b' when the output is piped to\n"
" another samtools command.\n"
+"\n"
+"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n"
+" command-line region arguments to be computed using the multi-region iterator\n"
+" and an index. This increases speed, omits duplicates, and outputs the reads\n"
+" as they are ordered in the input SAM/BAM/CRAM file.\n"
+"\n"
+"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n"
+" together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n"
+" so using both causes one of the specified BED files to be ignored.\n"
"\n");
return exit_status;
}
-
-int main_import(int argc, char *argv[])
-{
- int argc2, ret;
- char **argv2;
- if (argc != 4) {
- fprintf(stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
- return 1;
- }
- argc2 = 6;
- argv2 = calloc(6, sizeof(char*));
- argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2];
- ret = main_samview(argc2, argv2);
- free(argv2);
- return ret;
-}
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2019 Genome Research Ltd.
+ Copyright (C) 2009-2021 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "htslib/faidx.h"
#include "htslib/khash.h"
#include "htslib/thread_pool.h"
+#include "htslib/hts_expr.h"
#include "samtools.h"
#include "sam_opts.h"
#include "bedidx.h"
-KHASH_SET_INIT_STR(rg)
-KHASH_SET_INIT_STR(tv)
+KHASH_SET_INIT_STR(str)
-typedef khash_t(rg) *rghash_t;
-typedef khash_t(tv) *tvhash_t;
+typedef khash_t(str) *strhash_t;
// This structure contains the settings for a samview run
typedef struct samview_settings {
- rghash_t rghash;
- tvhash_t tvhash;
+ strhash_t rghash;
+ strhash_t rnhash;
+ strhash_t tvhash;
int min_mapQ;
int flag_on;
int flag_off;
char** remove_aux;
int multi_region;
char* tag;
+ hts_filter_t *filter;
+ int remove_flag;
+ int add_flag;
} samview_settings_t;
// TODO Add declarations of these to a viable htslib or samtools header
extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
extern int bam_remove_B(bam1_t *b);
-extern char *samfaipath(const char *fn_ref);
// Returns 0 to indicate read should be output 1 otherwise
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
if (settings->rghash) {
uint8_t *s = bam_aux_get(b, "RG");
if (s) {
- khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1));
+ khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
if (k == kh_end(settings->rghash)) return 1;
}
}
- if (settings->tvhash && settings->tag) {
+ if (settings->tag) {
uint8_t *s = bam_aux_get(b, settings->tag);
if (s) {
- khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1));
- if (k == kh_end(settings->tvhash)) return 1;
+ if (settings->tvhash) {
+ char t[32], *val;
+ if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') {
+ int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s));
+ if (ret > 0) val = t;
+ else return 1;
+ } else if (*s == 'A') {
+ t[0] = *(s+1);
+ t[1] = 0;
+ val = t;
+ } else {
+ val = (char *)(s+1);
+ }
+ khint_t k = kh_get(str, settings->tvhash, val);
+ if (k == kh_end(settings->tvhash)) return 1;
+ }
} else {
return 1;
}
}
+ if (settings->rnhash) {
+ const char* rn = bam_get_qname(b);
+ if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+ return 1;
+ }
+ }
if (settings->library) {
const char *p = bam_get_library((sam_hdr_t*)h, b);
if (!p || strcmp(p, settings->library) != 0) return 1;
}
}
}
+
+ if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+ return 1;
+
return 0;
}
static int usage(FILE *fp, int exit_status, int is_long_help);
+/*
+ * Load whitespace-separated tokens from the file `fn` into the string
+ * hash set `lookup`.
+ *
+ * Tokens are read with fscanf("%1023s"), so a token longer than 1023
+ * bytes is split across several entries.  Each stored key is a strdup()ed
+ * copy that stays in the hash; a copy that is not stored (token already
+ * present, or the insertion failed) is freed here.
+ *
+ * subcmd  subcommand name passed through to print_error_errno()
+ * lookup  hash set to fill; used without a NULL check, so the caller must
+ *         have initialised it
+ * fn      path of the file to read
+ *
+ * Returns 0 on success, or -1 if the file cannot be opened, a read error
+ * occurs, or strdup() / hash insertion fails.  An error message is printed
+ * in every failure case.
+ */
+static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn)
+{
+    FILE *fp;
+    char buf[1024];
+    int ret = 0;
+    fp = fopen(fn, "r");
+    if (fp == NULL) {
+        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
+        char *d = strdup(buf);
+        if (d != NULL) {
+            kh_put(str, lookup, d, &ret);
+            // ret == 0: key already present; ret == -1: insertion failed.
+            // In both cases the hash did not keep `d`, so release it here
+            // (previously the failure case leaked the copy).
+            if (ret <= 0) free(d);
+        } else {
+            ret = -1;
+        }
+    }
+    if (ferror(fp)) ret = -1;
+    if (ret == -1) {
+        print_error_errno(subcmd, "failed to read \"%s\"", fn);
+    }
+    fclose(fp);
+    return (ret != -1) ? 0 : -1;
+}
+
static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
{
char *d = strdup(name);
if (d == NULL) goto err;
if (settings->rghash == NULL) {
- settings->rghash = kh_init(rg);
+ settings->rghash = kh_init(str);
if (settings->rghash == NULL) goto err;
}
- kh_put(rg, settings->rghash, d, &ret);
+ kh_put(str, settings->rghash, d, &ret);
if (ret == -1) goto err;
if (ret == 0) free(d); /* Duplicate */
return 0;
return -1;
}
-static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
- FILE *fp;
- char buf[1024];
- int ret = 0;
- if (settings->rghash == NULL) {
- settings->rghash = kh_init(rg);
- if (settings->rghash == NULL) {
+ if (settings->rnhash == NULL) {
+ settings->rnhash = kh_init(str);
+ if (settings->rnhash == NULL) {
perror(NULL);
return -1;
}
}
+ return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+}
- fp = fopen(fn, "r");
- if (fp == NULL) {
- print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
- return -1;
- }
-
- while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
- char *d = strdup(buf);
- if (d != NULL) {
- kh_put(rg, settings->rghash, d, &ret);
- if (ret == 0) free(d); /* Duplicate */
- } else {
- ret = -1;
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+{
+ if (settings->rghash == NULL) {
+ settings->rghash = kh_init(str);
+ if (settings->rghash == NULL) {
+ perror(NULL);
+ return -1;
}
}
- if (ferror(fp)) ret = -1;
- if (ret == -1) {
- print_error_errno(subcmd, "failed to read \"%s\"", fn);
- }
- fclose(fp);
- return (ret != -1) ? 0 : -1;
+ return populate_lookup_from_file(subcmd, settings->rghash, fn);
}
static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
if (d == NULL) goto err;
if (settings->tvhash == NULL) {
- settings->tvhash = kh_init(tv);
+ settings->tvhash = kh_init(str);
if (settings->tvhash == NULL) goto err;
}
- kh_put(tv, settings->tvhash, d, &ret);
+ kh_put(str, settings->tvhash, d, &ret);
if (ret == -1) goto err;
if (ret == 0) free(d); /* Duplicate */
return 0;
static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn)
{
- FILE *fp;
- char buf[1024];
- int ret = 0;
if (settings->tvhash == NULL) {
- settings->tvhash = kh_init(tv);
+ settings->tvhash = kh_init(str);
if (settings->tvhash == NULL) {
perror(NULL);
return -1;
}
}
-
- fp = fopen(fn, "r");
- if (fp == NULL) {
- print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
- return -1;
- }
-
- while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
- char *d = strdup(buf);
- if (d != NULL) {
- kh_put(tv, settings->tvhash, d, &ret);
- if (ret == 0) free(d); /* Duplicate */
- } else {
- ret = -1;
- }
- }
- if (ferror(fp)) ret = -1;
- if (ret == -1) {
- print_error_errno(subcmd, "failed to read \"%s\"", fn);
- }
- fclose(fp);
- return (ret != -1) ? 0 : -1;
+ return populate_lookup_from_file(subcmd, settings->tvhash, fn);
}
static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp)
return r;
}
+// Apply the --add-flags / --remove-flags masks held in `settings` to the
+// FLAG field of record `b`.
+static inline void change_flag(bam1_t *b, samview_settings_t *settings)
+{
+    // A zero mask leaves the flags untouched for either operation, so both
+    // masks are applied unconditionally.  Bits are set first and cleared
+    // second, so a bit named in both masks ends up cleared.
+    b->core.flag |= settings->add_flag;
+    b->core.flag &= ~settings->remove_flag;
+}
+
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c) ((c) + 128)
+
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0;
samFile *in = 0, *out = 0, *un_out=0;
FILE *fp_out = NULL;
sam_hdr_t *header = NULL;
- char out_mode[5], out_un_mode[5], *out_format = "";
- char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+ char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = "";
+ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0;
char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
.library = NULL,
.bed = NULL,
.multi_region = 0,
- .tag = NULL
+ .tag = NULL,
+ .filter = NULL,
+ .remove_flag = 0,
+ .add_flag = 0
};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
- {"no-PG", no_argument, NULL, 1},
+ {"add-flags", required_argument, NULL, LONGOPT('a')},
+ {"bam", no_argument, NULL, 'b'},
+ {"count", no_argument, NULL, 'c'},
+ {"cram", no_argument, NULL, 'C'},
+ {"customised-index", no_argument, NULL, 'X'},
+ {"customized-index", no_argument, NULL, 'X'},
+ {"excl-flags", required_argument, NULL, 'F'},
+ {"exclude-flags", required_argument, NULL, 'F'},
+ {"expr", required_argument, NULL, 'e'},
+ {"expression", required_argument, NULL, 'e'},
+ {"fai-reference", required_argument, NULL, 't'},
+ {"fast", no_argument, NULL, '1'},
+ {"header-only", no_argument, NULL, 'H'},
+ {"help", no_argument, NULL, LONGOPT('?')},
+ {"library", required_argument, NULL, 'l'},
+ {"min-mapq", required_argument, NULL, 'q'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-qlen", required_argument, NULL, 'm'},
+ {"no-header", no_argument, NULL, LONGOPT('H')},
+ {"no-PG", no_argument, NULL, LONGOPT('P')},
+ {"output", required_argument, NULL, 'o'},
+ {"output-unselected", required_argument, NULL, 'U'},
+ {"QNAME-file", required_argument, NULL, 'N'},
+ {"qname-file", required_argument, NULL, 'N'},
+ {"read-group", required_argument, NULL, 'r'},
+ {"read-group-file", required_argument, NULL, 'R'},
+ {"readgroup", required_argument, NULL, 'r'},
+ {"readgroup-file", required_argument, NULL, 'R'},
+ {"region-file", required_argument, NULL, LONGOPT('L')},
+ {"regions-file", required_argument, NULL, LONGOPT('L')},
+ {"remove-B", no_argument, NULL, 'B'},
+ {"remove-flags", required_argument, NULL, LONGOPT('r')},
+ {"remove-tag", required_argument, NULL, 'x'},
+ {"require-flags", required_argument, NULL, 'f'},
+ {"subsample", required_argument, NULL, LONGOPT('s')},
+ {"subsample-seed", required_argument, NULL, LONGOPT('S')},
+ {"tag", required_argument, NULL, 'd'},
+ {"tag-file", required_argument, NULL, 'D'},
+ {"target-file", required_argument, NULL, 'L'},
+ {"targets-file", required_argument, NULL, 'L'},
+ {"uncompressed", no_argument, NULL, 'u'},
+ {"unoutput", required_argument, NULL, 'U'},
+ {"use-index", no_argument, NULL, 'M'},
+ {"with-header", no_argument, NULL, 'h'},
{ NULL, 0, NULL, 0 }
};
opterr = 0;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
- if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
- // Convert likely user input 0,1,2,... to pseudo-random
- // values with more entropy and more bits set
- srand(settings.subsam_seed);
- settings.subsam_seed = rand();
- }
+ settings.subsam_seed = strtol(optarg, &q, 10);
if (q && *q == '.') {
settings.subsam_frac = strtod(q, &q);
if (*q) ret = 1;
goto view_end;
}
break;
+ case LONGOPT('s'):
+ settings.subsam_frac = strtod(optarg, &q);
+ if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) {
+ print_error("view", "Incorrect sampling argument \"%s\"", optarg);
+ goto view_end;
+ }
+ break;
+ case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
case 'm': settings.min_qlen = atoi(optarg); break;
case 'c': is_count = 1; break;
case 'S': break;
case 'b': out_format = "b"; break;
case 'C': out_format = "c"; break;
- case 't': fn_list = strdup(optarg); break;
+ case 't': fn_fai = strdup(optarg); break;
case 'h': is_header = 1; break;
case 'H': is_header_only = 1; break;
+ case LONGOPT('H'): is_header = is_header_only = 0; break;
case 'o': fn_out = strdup(optarg); break;
case 'U': fn_un_out = strdup(optarg); break;
case 'X': has_index_file = 1; break;
- case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
- case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
- case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
+ case 'f': settings.flag_on |= bam_str2flag(optarg); break;
+ case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
case 'l': settings.library = strdup(optarg); break;
+ case LONGOPT('L'):
+ settings.multi_region = 1;
+ // fall through
case 'L':
if ((settings.bed = bed_read(optarg)) == NULL) {
print_error_errno("view", "Could not read file \"%s\"", optarg);
goto view_end;
}
break;
+ case 'N':
+ if (add_read_names_file("view", &settings, optarg) != 0) {
+ ret = 1;
+ goto view_end;
+ }
+ break;
case 'd':
- if (strlen(optarg) < 4 || optarg[2] != ':') {
+ if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
memcpy(settings.tag, optarg, 2);
}
- if (add_tag_value_single("view", &settings, optarg+3) != 0) {
+ if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) {
+ print_error("view", "Could not add tag:value \"%s\"", optarg);
ret = 1;
goto view_end;
}
case 'D':
// Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
// path translation as described at:
- // http://www.mingw.org/wiki/Posix_path_conversion
+ // http://www.mingw.org/wiki/Posix_path_conversion
if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
ret = 1;
//case 'x': out_format = "x"; break;
//case 'X': out_format = "X"; break;
*/
+ case LONGOPT('?'):
+ return usage(samtools_stdout, EXIT_SUCCESS, 1);
case '?':
if (optopt == '?') { // '-?' appeared on command line
return usage(samtools_stdout, EXIT_SUCCESS, 1);
case 'x':
{
if (strlen(optarg) != 2) {
- fprintf(samtools_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
+ print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
return usage(samtools_stderr, EXIT_FAILURE, 0);
}
settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
}
break;
case 'M': settings.multi_region = 1; break;
- case 1: no_pg = 1; break;
+ case LONGOPT('P'): no_pg = 1; break;
+ case 'e':
+ if (!(settings.filter = hts_filter_init(optarg))) {
+ print_error("main_samview", "Couldn't initialise filter");
+ return 1;
+ }
+ break;
+ case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
+ case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
return usage(samtools_stderr, EXIT_FAILURE, 0);
break;
}
}
+ if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference);
if (compress_level >= 0 && !*out_format) out_format = "b";
if (is_header_only) is_header = 1;
// File format auto-detection first
// Overridden by manual -b, -C
if (*out_format)
out_mode[1] = out_un_mode[1] = *out_format;
- out_mode[2] = out_un_mode[2] = '\0';
- // out_(un_)mode now 1 or 2 bytes long, followed by nul.
+ // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul.
if (compress_level >= 0) {
char tmp[2];
tmp[0] = compress_level + '0'; tmp[1] = '\0';
print_error("view", "No input provided or missing option argument.");
return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak...
}
+ if (settings.subsam_seed != 0) {
+ // Convert likely user input 1,2,... to pseudo-random
+ // values with more entropy and more bits set
+ srand(settings.subsam_seed);
+ settings.subsam_seed = rand();
+ }
fn_in = (optind < argc)? argv[optind] : "-";
- // generate the fn_list if necessary
- if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
- // open file handlers
if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(in, fn_list) != 0) {
- fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(in, fn_fai) != 0) {
+ fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(out, fn_list) != 0) {
- fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(out, fn_fai) != 0) {
+ fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
- if (fn_list) {
- if (hts_set_fai_filename(un_out, fn_list) != 0) {
- fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+ if (fn_fai) {
+ if (hts_set_fai_filename(un_out, fn_fai) != 0) {
+ fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
ret = 1;
goto view_end;
}
// fetch alignments
while ((result = sam_itr_multi_next(in, iter, b)) >= 0) {
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file
bam1_t *b = bam_init1();
int r;
+ errno = 0;
while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
if (r < -1) {
- fprintf(samtools_stderr, "[main_samview] truncated file.\n");
+ print_error_errno("view", "error reading file \"%s\"", fn_in);
ret = 1;
}
bam_destroy1(b);
// fetch alignments
while ((result = sam_itr_next(in, iter, b)) >= 0) {
if (!process_aln(header, b, &settings)) {
- if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+ if (!is_count) {
+ change_flag(b, &settings);
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+ }
count++;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
if (fp_out) fclose(fp_out);
- free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
+ free(fn_fai); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
if ( header ) sam_hdr_destroy(header);
if (settings.bed) bed_destroy(settings.bed);
khint_t k;
for (k = 0; k < kh_end(settings.rghash); ++k)
if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k));
- kh_destroy(rg, settings.rghash);
+ kh_destroy(str, settings.rghash);
+ }
+ if (settings.rnhash) {
+ khint_t k;
+ for (k = 0; k < kh_end(settings.rnhash); ++k)
+ if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k));
+ kh_destroy(str, settings.rnhash);
}
if (settings.tvhash) {
khint_t k;
for (k = 0; k < kh_end(settings.tvhash); ++k)
if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
- kh_destroy(tv, settings.tvhash);
+ kh_destroy(str, settings.tvhash);
}
if (settings.remove_aux_len) {
free(settings.remove_aux);
if (settings.tag) {
free(settings.tag);
}
+ if (settings.filter)
+ hts_filter_free(settings.filter);
if (p.pool)
hts_tpool_destroy(p.pool);
"\n"
"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
"\n"
-"Options:\n"
-// output options
-" -b output BAM\n"
-" -C output CRAM (requires -T)\n"
-" -1 use fast BAM compression (implies -b)\n"
-" -u uncompressed BAM output (implies -b)\n"
-" -h include header in SAM output\n"
-" -H print SAM header only (no alignments)\n"
-" -c print only the count of matching records\n"
-" -o FILE output file name [samtools_stdout]\n"
-" -U FILE output reads not selected by filters to FILE [null]\n"
-// extra input
-" -t FILE FILE listing reference names and lengths (see long help) [null]\n"
-" -X include customized index file\n"
-// read filters
-" -L FILE only include reads overlapping this BED FILE [null]\n"
-" -r STR only include reads in read group STR [null]\n"
-" -R FILE only include reads with read group listed in FILE [null]\n"
-" -d STR:STR\n"
-" only include reads with tag STR and associated value STR [null]\n"
-" -D STR:FILE\n"
-" only include reads with tag STR and associated values listed in\n"
-" FILE [null]\n"
-" -q INT only include reads with mapping quality >= INT [0]\n"
-" -l STR only include reads in library STR [null]\n"
-" -m INT only include reads with number of CIGAR operations consuming\n"
-" query sequence >= INT [0]\n"
-" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
-" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
-" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
-" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
-" fraction of templates/read pairs to keep; INT part sets seed)\n"
-" -M use the multi-region iterator (increases the speed, removes\n"
-" duplicates and outputs the reads as they are ordered in the file)\n"
-// read processing
-" -x STR read tag to strip (repeatable) [null]\n"
-" -B collapse the backward CIGAR operation\n"
-// general options
-" -? print long help, including note about region specification\n"
-" -S ignored (input format is auto-detected)\n"
-" --no-PG do not add a PG line\n");
+"Output options:\n"
+" -b, --bam Output BAM\n"
+" -C, --cram Output CRAM (requires -T)\n"
+" -1, --fast Use fast BAM compression (implies --bam)\n"
+" -u, --uncompressed Uncompressed BAM output (implies --bam)\n"
+" -h, --with-header Include header in SAM output\n"
+" -H, --header-only Print SAM header only (no alignments)\n"
+" --no-header Print SAM alignment records only [default]\n"
+" -c, --count Print only the count of matching records\n"
+" -o, --output FILE Write output to FILE [standard output]\n"
+" -U, --unoutput FILE, --output-unselected FILE\n"
+" Output reads not selected by filters to FILE\n"
+"Input options:\n"
+" -t, --fai-reference FILE FILE listing reference names and lengths\n"
+" -M, --use-index Use index and multi-region iterator for regions\n"
+" --region[s]-file FILE Use index to include only reads overlapping FILE\n"
+" -X, --customized-index Expect extra index file argument after <in.bam>\n"
+"\n"
+"Filtering options (Only include in output reads that...):\n"
+" -L, --target[s]-file FILE ...overlap (BED) regions in FILE\n"
+" -r, --read-group STR ...are in read group STR\n"
+" -R, --read-group-file FILE ...are in a read group listed in FILE\n"
+" -N, --qname-file FILE ...whose read name is listed in FILE\n"
+" -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n"
+" -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n"
+" -q, --min-MQ INT ...have mapping quality >= INT\n"
+" -l, --library STR ...are in library STR\n"
+" -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR)\n"
+" -e, --expr STR ...match the filter expression STR\n"
+" -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x
+" -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0
+" -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option
+" --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n"
+" --subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n"
+" -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT\n"
+"\n"
+"Processing options:\n"
+" --add-flags FLAG Add FLAGs to reads\n"
+" --remove-flags FLAG Remove FLAGs from reads\n"
+" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n"
+" -B, --remove-B Collapse the backward CIGAR operation\n"
+"\n"
+"General options:\n"
+" -?, --help Print long help, including note about region specification\n"
+" -S Ignored (input format is auto-detected)\n"
+" --no-PG Do not add a PG line\n");
sam_global_opt_help(fp, "-.O.T@..");
fprintf(fp, "\n");
"\n"
"6. Option `-u' is preferred over `-b' when the output is piped to\n"
" another samtools command.\n"
+"\n"
+"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n"
+" command-line region arguments to be computed using the multi-region iterator\n"
+" and an index. This increases speed, omits duplicates, and outputs the reads\n"
+" as they are ordered in the input SAM/BAM/CRAM file.\n"
+"\n"
+"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n"
+" together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n"
+" so using both causes one of the specified BED files to be ignored.\n"
"\n");
return exit_status;
}
-
-int main_import(int argc, char *argv[])
-{
- int argc2, ret;
- char **argv2;
- if (argc != 4) {
- fprintf(samtools_stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
- return 1;
- }
- argc2 = 6;
- argv2 = calloc(6, sizeof(char*));
- argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2];
- ret = main_samview(argc2, argv2);
- free(argv2);
- return ret;
-}
#include <ctype.h>
#include <assert.h>
#include <unistd.h>
+#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
return putc('\n', samtools_stdout);
}
+
+// Jump target and exit status shared by samtools_dispatch() and
+// samtools_exit().
+// NOTE(review): there is a single shared buffer, so a nested or concurrent
+// samtools_dispatch() call would overwrite it -- confirm callers serialise.
+static jmp_buf samtools_jmpbuf;
+static int samtools_status = 0;
+
+// Run samtools_main() with a recovery point in place.  Returns whatever
+// samtools_main() returns or, if samtools_exit() is called while it runs,
+// the status that was passed to samtools_exit().
+int samtools_dispatch(int argc, char *argv[])
+{
+ if (setjmp(samtools_jmpbuf) == 0)
+ return samtools_main(argc, argv);
+ else
+ // Reached via longjmp() from samtools_exit()
+ return samtools_status;
+}
+
+// Record `status` and longjmp() back into samtools_dispatch(), which then
+// returns that status.  Control never returns to the caller.
+// NOTE(review): only valid while samtools_dispatch() is active; otherwise
+// samtools_jmpbuf has not been set up -- confirm all call sites.
+void samtools_exit(int status)
+{
+ samtools_status = status;
+ longjmp(samtools_jmpbuf, 1);
+}
+
+
void samtools_set_optind(int val)
{
// setting this in cython via
#include <stdio.h>
+// Fallback for compilers that do not provide __has_attribute: make it
+// evaluate to 0 so the #if below still preprocesses.
+#ifndef __has_attribute
+#define __has_attribute(attribute) 0
+#endif
+// PYSAM_NORETURN expands to __attribute__((__noreturn__)) when the compiler
+// reports support for the attribute or identifies as GCC 3 or later, and to
+// nothing otherwise.  An existing definition of PYSAM_NORETURN is kept.
+#ifndef PYSAM_NORETURN
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN
+#endif
+#endif
+
extern FILE * samtools_stderr;
extern FILE * samtools_stdout;
int samtools_dispatch(int argc, char *argv[]);
+void PYSAM_NORETURN samtools_exit(int status);
+
void samtools_set_optind(int);
extern int samtools_main(int argc, char *argv[]);
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Author: Sam Nicholls <sam@samnicholls.net>
// Arrays for the histogram data
uint64_t *quals_1st, *quals_2nd;
uint64_t *gc_1st, *gc_2nd;
- acgtno_count_t *acgtno_cycles_1st;
- acgtno_count_t *acgtno_cycles_2nd;
+ acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd;
+ acgtno_count_t *acgtno_revcomp;
uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd;
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
uint64_t nbases_mapped_cigar;
uint64_t nbases_trimmed; // bwa trimmed bases
uint64_t nmismatches;
- uint64_t nreads_QCfailed, nreads_secondary;
+ uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary;
struct {
uint32_t names, reads, quals;
} checksum;
uint32_t nchunks;
uint32_t pair_count; // Number of active pairs in the pairing hash table
- uint32_t target_count; // Number of bases covered by the target file
+ uint64_t target_count; // Number of bases covered by the target file
uint32_t last_pair_tid;
uint32_t last_read_flush;
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+ stats->acgtno_revcomp = realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t));
+ if ( !stats->acgtno_revcomp )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+ memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+
stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
if ( !stats->read_lengths )
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
switch (bam_seqi(seq, i)) {
case 1:
acgtno_cycles[ read_cycle ].a++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++;
break;
case 2:
acgtno_cycles[ read_cycle ].c++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++;
gc_count++;
break;
case 4:
acgtno_cycles[ read_cycle ].g++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++;
gc_count++;
break;
case 8:
+ reverse ? stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++;
acgtno_cycles[ read_cycle ].t++;
break;
case 15:
void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs)
{
+ if ( !is_in_regions(bam_line,stats) )
+ return;
if ( stats->rg_hash )
{
const uint8_t *rg = bam_aux_get(bam_line, "RG");
stats->nreads_filtered++;
return;
}
- if ( !is_in_regions(bam_line,stats) )
- return;
if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
return;
return;
}
+ if ( bam_line->core.flag & BAM_FSUPPLEMENTARY )
+ {
+ stats->nreads_supplementary++;
+ }
+
// If line has no sequence cannot continue
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
// These stats should only be calculated for the original reads ignoring supplementary artificial reads
// otherwise we'll accidentally double count
- if ( IS_ORIGINAL(bam_line) )
- {
+ if ( IS_ORIGINAL(bam_line) ) {
stats->read_lengths[read_len]++;
if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++;
if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++;
count_indels(stats, bam_line);
- if ( IS_PAIRED_AND_MAPPED(bam_line) )
+ if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) )
{
// The insert size is tricky, because for long inserts the libraries are
// prepared differently and the pairs point in other direction. BWA does
fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below)
+ fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below)
fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));
fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+ fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary);
fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st);
fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd);
fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0);
if ( stats->target_count ) {
- fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count);
+ fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count);
for (icov=stats->info->cov_threshold+1; icov<stats->ncov; icov++)
cov_sum += stats->cov[icov];
fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count);
100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum,
100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum,
100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum);
-
+ }
+ fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]);
+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+ if ( ! acgt_sum ) continue;
+ fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,
+ 100.*(acgtno_count->a)/acgt_sum,
+ 100.*(acgtno_count->c)/acgt_sum,
+ 100.*(acgtno_count->g)/acgt_sum,
+ 100.*(acgtno_count->t)/acgt_sum);
}
uint64_t tA=0, tC=0, tG=0, tT=0, tN=0;
}
}
-static void init_regions(stats_t *stats, const char *file)
+static void init_regions(stats_t *stats, const char *file, stats_info_t* info)
{
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
}
reg->npos = ++new_p;
}
- for (p = 0; p < reg->npos; p++)
- stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+ for (p = 0; p < reg->npos; p++) {
+ if (reg->pos[p].end < HTS_POS_MAX) {
+ stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+ } else {
+ uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r);
+ if (hdr_end)
+ stats->target_count += (hdr_end - reg->pos[p].beg + 1);
+ }
+ }
}
if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t))))
return 1;
}
-int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
+int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) {
if ( !stats || !iter)
return 1;
for (j = 0; j < stats->regions[tid].npos; j++) {
stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1;
stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end;
-
- stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+ if (stats->regions[tid].pos[j].end < HTS_POS_MAX) {
+ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+ } else {
+ uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid);
+ if (hdr_end)
+ stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1);
+ }
}
}
free(stats->mpc_buf);
free(stats->acgtno_cycles_1st);
free(stats->acgtno_cycles_2nd);
+ free(stats->acgtno_revcomp);
free(stats->read_lengths);
free(stats->read_lengths_1st);
free(stats->read_lengths_2nd);
if (!stats->acgtno_cycles_1st) goto nomem;
stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t));
if (!stats->acgtno_cycles_2nd) goto nomem;
+ stats->acgtno_revcomp = calloc(stats->nbases,sizeof(acgtno_count_t));
+ if (!stats->acgtno_revcomp) goto nomem;
stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
if (!stats->read_lengths) goto nomem;
stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t));
goto nomem;
realloc_rseq_buffer(stats);
if ( targets )
- init_regions(stats, targets);
+ init_regions(stats, targets, info);
return;
nomem:
error("Out of memory");
if (iter) {
if (!targets) {
all_stats->nchunks = argc-optind;
- if (replicate_regions(all_stats, iter))
+ if (replicate_regions(all_stats, iter, info))
fprintf(stderr, "Replications of the regions failed\n");
}
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2019 Genome Research Ltd.
+ Copyright (C) 2012-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Author: Sam Nicholls <sam@samnicholls.net>
// Arrays for the histogram data
uint64_t *quals_1st, *quals_2nd;
uint64_t *gc_1st, *gc_2nd;
- acgtno_count_t *acgtno_cycles_1st;
- acgtno_count_t *acgtno_cycles_2nd;
+ acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd;
+ acgtno_count_t *acgtno_revcomp;
uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd;
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
uint64_t nbases_mapped_cigar;
uint64_t nbases_trimmed; // bwa trimmed bases
uint64_t nmismatches;
- uint64_t nreads_QCfailed, nreads_secondary;
+ uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary;
struct {
uint32_t names, reads, quals;
} checksum;
uint32_t nchunks;
uint32_t pair_count; // Number of active pairs in the pairing hash table
- uint32_t target_count; // Number of bases covered by the target file
+ uint64_t target_count; // Number of bases covered by the target file
uint32_t last_pair_tid;
uint32_t last_read_flush;
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+ stats->acgtno_revcomp = realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t));
+ if ( !stats->acgtno_revcomp )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+ memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+
stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
if ( !stats->read_lengths )
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
switch (bam_seqi(seq, i)) {
case 1:
acgtno_cycles[ read_cycle ].a++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++;
break;
case 2:
acgtno_cycles[ read_cycle ].c++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++;
gc_count++;
break;
case 4:
acgtno_cycles[ read_cycle ].g++;
+ reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++;
gc_count++;
break;
case 8:
+ reverse ? stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++;
acgtno_cycles[ read_cycle ].t++;
break;
case 15:
void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs)
{
+ if ( !is_in_regions(bam_line,stats) )
+ return;
if ( stats->rg_hash )
{
const uint8_t *rg = bam_aux_get(bam_line, "RG");
stats->nreads_filtered++;
return;
}
- if ( !is_in_regions(bam_line,stats) )
- return;
if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
return;
return;
}
+ if ( bam_line->core.flag & BAM_FSUPPLEMENTARY )
+ {
+ stats->nreads_supplementary++;
+ }
+
// If line has no sequence cannot continue
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
// These stats should only be calculated for the original reads ignoring supplementary artificial reads
// otherwise we'll accidentally double count
- if ( IS_ORIGINAL(bam_line) )
- {
+ if ( IS_ORIGINAL(bam_line) ) {
stats->read_lengths[read_len]++;
if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++;
if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++;
count_indels(stats, bam_line);
- if ( IS_PAIRED_AND_MAPPED(bam_line) )
+ if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) )
{
// The insert size is tricky, because for long inserts the libraries are
// prepared differently and the pairs point in other direction. BWA does
fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below)
+ fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below)
fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));
fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+ fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary);
fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st);
fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd);
fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0);
if ( stats->target_count ) {
- fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count);
+ fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count);
for (icov=stats->info->cov_threshold+1; icov<stats->ncov; icov++)
cov_sum += stats->cov[icov];
fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count);
100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum,
100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum,
100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum);
-
+ }
+ fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n");
+ for (ibase=0; ibase<stats->max_len; ibase++)
+ {
+ acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]);
+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+ if ( ! acgt_sum ) continue;
+ fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,
+ 100.*(acgtno_count->a)/acgt_sum,
+ 100.*(acgtno_count->c)/acgt_sum,
+ 100.*(acgtno_count->g)/acgt_sum,
+ 100.*(acgtno_count->t)/acgt_sum);
}
uint64_t tA=0, tC=0, tG=0, tT=0, tN=0;
}
}
-static void init_regions(stats_t *stats, const char *file)
+static void init_regions(stats_t *stats, const char *file, stats_info_t* info)
{
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
}
reg->npos = ++new_p;
}
- for (p = 0; p < reg->npos; p++)
- stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+ for (p = 0; p < reg->npos; p++) {
+ if (reg->pos[p].end < HTS_POS_MAX) {
+ stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+ } else {
+ uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r);
+ if (hdr_end)
+ stats->target_count += (hdr_end - reg->pos[p].beg + 1);
+ }
+ }
}
if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t))))
return 1;
}
-int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
+int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) {
if ( !stats || !iter)
return 1;
for (j = 0; j < stats->regions[tid].npos; j++) {
stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1;
stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end;
-
- stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+ if (stats->regions[tid].pos[j].end < HTS_POS_MAX) {
+ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+ } else {
+ uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid);
+ if (hdr_end)
+ stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1);
+ }
}
}
vfprintf(samtools_stderr, format, ap);
va_end(ap);
}
- exit(1);
+ samtools_exit(1);
}
void cleanup_stats_info(stats_info_t* info){
free(stats->mpc_buf);
free(stats->acgtno_cycles_1st);
free(stats->acgtno_cycles_2nd);
+ free(stats->acgtno_revcomp);
free(stats->read_lengths);
free(stats->read_lengths_1st);
free(stats->read_lengths_2nd);
if (!stats->acgtno_cycles_1st) goto nomem;
stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t));
if (!stats->acgtno_cycles_2nd) goto nomem;
+ stats->acgtno_revcomp = calloc(stats->nbases,sizeof(acgtno_count_t));
+ if (!stats->acgtno_revcomp) goto nomem;
stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
if (!stats->read_lengths) goto nomem;
stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t));
goto nomem;
realloc_rseq_buffer(stats);
if ( targets )
- init_regions(stats, targets);
+ init_regions(stats, targets, info);
return;
nomem:
error("Out of memory");
if (iter) {
if (!targets) {
all_stats->nchunks = argc-optind;
- if (replicate_regions(all_stats, iter))
+ if (replicate_regions(all_stats, iter, info))
fprintf(samtools_stderr, "Replications of the regions failed\n");
}
a->max = max(at, a->max);
} else {
fprintf(samtools_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t");
- exit(11);
+ samtools_exit(11);
}
} else {
return;
#include <lz4.h>
#include "htslib/sam.h"
-#ifdef _cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.10
+VERSION=1.13
# If we have a git clone, then check against the current tag
if [ -e .git ]
import sys
import sysconfig
from contextlib import contextmanager
-from setuptools import setup
+from distutils import log
+from setuptools import setup, Command
+from setuptools.command.sdist import sdist
+
from cy_build import CyExtension as Extension, cy_build_ext as build_ext
try:
import cython
return make_print_config
+# This function emulates the way distutils combines settings from sysconfig,
+# environment variables, and the extension being built. It returns a dictionary
+# representing the usual set of variables, suitable for writing to a generated
+# file or for running configure (provided the returned LIBS is ignored).
+def build_config_dict(ext):
+ """Return a dict of CC, CPPFLAGS, CFLAGS, LDFLAGS and LIBS strings for extension `ext`.
+
+ Values are assembled from sysconfig, environment variables and the
+ attributes of `ext` (include_dirs, define_macros, undef_macros,
+ extra_compile_args, library_dirs, extra_link_args, libraries).
+ """
+ # Environment variable `var` as a one-element list, or [] when it is unset.
+ def env(var):
+ return [os.environ[var]] if var in os.environ else []
+
+ # sysconfig variable `var` as a one-element list, or [] when it is None.
+ def sc(var):
+ value = sysconfig.get_config_var(var)
+ return [value] if value is not None else []
+
+ # Prefix every value with `option`; a result containing a space is
+ # wrapped in single quotes.
+ def optionise(option, valuelist):
+ def quote(s): return "'"+s+"'" if " " in s else s
+ return list(quote(option+v) for v in valuelist)
+
+ # Turn (name, value) pairs into "name" (value is None) or "name=value".
+ def kvtuples(pairlist):
+ def appendoptvalue(t): return t[0] if t[1] is None else t[0]+"="+t[1]
+ return map(appendoptvalue, pairlist)
+
+ # For CC, select the first of these that is set
+ cc = (env('CC') + sc('CC') + ['gcc'])[0]
+
+ # distutils ignores sysconfig for CPPFLAGS
+ cppflags = " ".join(env('CPPFLAGS') + optionise('-I', ext.include_dirs) +
+ optionise('-D', kvtuples(ext.define_macros)) +
+ optionise('-U', ext.undef_macros))
+
+ cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + ext.extra_compile_args)
+
+ # distutils actually includes $CPPFLAGS here too, but that's weird and
+ # unnecessary for us as we know the output LDFLAGS will be used correctly
+ # NOTE(review): $CFLAGS from the environment is appended to LDFLAGS below;
+ # presumably this mirrors distutils' link command -- confirm it is intended.
+ ldflags = " ".join(sc('LDFLAGS') + env('LDFLAGS') + env('CFLAGS') +
+ optionise('-L', ext.library_dirs) +
+ ext.extra_link_args)
+
+ # ext.libraries is computed (incorporating $LIBS etc) during configure
+ libs = " ".join(optionise('-l', ext.libraries))
+
+ return { 'CC': cc, 'CPPFLAGS': cppflags, 'CFLAGS': cflags,
+ 'LDFLAGS': ldflags, 'LIBS': libs }
+
+
+def write_configvars_header(filename, ext, prefix):
+    """Write a C header defining the build configuration as string macros.
+
+    One `#define <prefix>_<VAR> "<value>"` line is written to `filename`
+    for each entry returned by build_config_dict(ext). When `prefix` is not
+    'HTS', HTSDIR and CURSES_LIB entries are added with the placeholder
+    value '(unused)'.
+
+    Values are escaped for use inside a C string literal, so flags that
+    contain double quotes or backslashes (for example -DNAME="x" or a
+    backslash-separated path) still produce a well-formed header.
+    """
+    def c_escape(text):
+        # Escape backslashes first so that the backslashes introduced for
+        # the double quotes are not doubled again.
+        return text.replace('\\', '\\\\').replace('"', '\\"')
+
+    config = build_config_dict(ext)
+    if prefix != 'HTS':
+        config['HTSDIR'] = '(unused)'
+        config['CURSES_LIB'] = '(unused)'
+
+    log.info("creating %s for '%s' extension", filename, ext.name)
+    with open(filename, "w") as outf:
+        for var, value in config.items():
+            outf.write('#define {}_{} "{}"\n'.format(prefix, var, c_escape(value)))
+
+
@contextmanager
def set_compiler_envvars():
tmp_vars = []
return version.__version__
+# Override sdist command to ensure Cythonized *.c files are included.
+class cythonize_sdist(sdist):
+ """sdist command that runs Cython over the extension modules before packaging."""
+
+ # Executed once, when the class body is evaluated: register --owner and
+ # --group on the base sdist command if it does not already declare them.
+ # Remove when setuptools (as installed on GH runners) has these options
+ if not any(opt[0] == 'owner=' for opt in sdist.user_options):
+ sdist.user_options.append(('owner=', 'u', 'Specify owner inside tar'))
+ if not any(opt[0] == 'group=' for opt in sdist.user_options):
+ sdist.user_options.append(('group=', 'g', 'Specify group inside tar'))
+
+ def run(self):
+ """Cythonize the distribution's ext_modules, then run the standard sdist."""
+ # Imported here rather than at module level, so Cython is only
+ # required when this command actually runs.
+ from Cython.Build import cythonize
+ cythonize(self.distribution.ext_modules)
+ super().run()
+
+
+class clean_ext(Command):
+ """Command that deletes generated Cython C files and generated config headers."""
+
+ description = "clean up Cython temporary files"
+ user_options = []
+
+ # The command takes no options, so there is nothing to initialise.
+ def initialize_options(self):
+ pass
+
+ # Likewise nothing to validate or post-process.
+ def finalize_options(self):
+ pass
+
+ def run(self):
+ """Remove pysam/libc*.c and */*config*.h under htslib, samtools and bcftools."""
+ # C sources matching pysam/libc*.c
+ objs = glob.glob(os.path.join("pysam", "libc*.c"))
+ if objs:
+ log.info("removing 'pysam/libc*.c' (%s Cython objects)", len(objs))
+ for obj in objs:
+ os.remove(obj)
+
+ # Headers matching *config*.h in each of the three bundled source trees
+ headers = (glob.glob(os.path.join("htslib", "*config*.h")) +
+ glob.glob(os.path.join("samtools", "*config*.h")) +
+ glob.glob(os.path.join("bcftools", "*config*.h")))
+ if headers:
+ log.info("removing '*/*config*.h' (%s generated headers)", len(headers))
+ for header in headers:
+ os.remove(header)
+
+
# How to link against HTSLIB
# shared: build shared chtslib from builtin htslib code.
# external: use shared libhts.so compiled outside of
config_headers = ["samtools/config.h",
"bcftools/config.h"]
-cmdclass = {'build_ext': build_ext}
-
# If cython is available, the pysam will be built using cython from
# the .pyx files. If no cython is available, the C-files included in the
# distribution will be used.
"from the repository"
.format(fn))
-# exclude sources that contain a main function
-EXCLUDE = {
- "samtools": (
- ),
- "bcftools": (
- "test", "plugins", "peakfit.c",
- "peakfit.h",
- # needs to renamed, name conflict with samtools reheader
- "reheader.c",
- "polysomy.c"),
- "htslib": (
- 'htslib/tabix.c',
- 'htslib/bgzip.c',
- 'htslib/htsfile.c'),
-}
-
print ("# pysam: htslib mode is {}".format(HTSLIB_MODE))
print ("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format(
HTSLIB_CONFIGURE_OPTIONS))
# The list below uses the union of include_dirs and library_dirs for
# reasons of simplicity.
+def prebuild_libchtslib(ext, force):
+ """Write htslib/config_vars.h for `ext`; `force` is not used here."""
+ # Nothing is written unless HTSLIB_MODE is 'shared' or 'separate'.
+ if HTSLIB_MODE not in ['shared', 'separate']: return
+ write_configvars_header("htslib/config_vars.h", ext, "HTS")
+
+def prebuild_libcsamtools(ext, force):
+ """Write samtools/samtools_config_vars.h for `ext`; `force` is not used here."""
+ write_configvars_header("samtools/samtools_config_vars.h", ext, "SAMTOOLS")
+
modules = [
dict(name="pysam.libchtslib",
+ prebuild_func=prebuild_libchtslib,
sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files,
libraries=external_htslib_libraries),
dict(name="pysam.libcsamtools",
+ prebuild_func=prebuild_libcsamtools,
sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) +
[os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files,
libraries=external_htslib_libraries + internal_htslib_libraries),
'packages': package_list,
'requires': ['cython (>=0.29.12)'],
'ext_modules': [Extension(**opts) for opts in modules],
- 'cmdclass': cmdclass,
+ 'cmdclass': {'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
'package_dir': package_dirs,
'package_data': {'': ['*.pxd', '*.h'], },
# do not pack in order to permit linking to csamtools.so
'zip_safe': False,
- 'use_2to3': True,
}
if __name__ == '__main__':
import copy
import array
-from TestUtils import checkFieldEqual, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3
+from TestUtils import checkFieldEqual, make_data_files, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3
if IS_PYTHON3:
else:
maketrans = string.maketrans
+
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
+
+
class ReadTest(unittest.TestCase):
def build_read(self):
a = pysam.AlignedSegment()
s = str(a)
self.assertEqual(
- "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
+ "None\t0\t*\t0\t0\tNone\t*\t0\t0\tNone\tNone\t[]",
s)
def testSettingTagInEmptyRead(self):
def test_query_length_is_limited(self):
a = self.build_read()
a.query_name = "A" * 1
- a.query_name = "A" * 251
+ a.query_name = "A" * 254
self.assertRaises(
ValueError,
setattr,
a,
"query_name",
- "A" * 252)
+ "A" * 255)
def test_header_accessible(self):
a = self.build_read()
from collections import OrderedDict as odict
import pysam
import pysam.samtools
-from TestUtils import get_temp_filename, BAM_DATADIR
+from TestUtils import get_temp_filename, make_data_files, BAM_DATADIR
if sys.version_info.major >= 3:
from io import StringIO
from StringIO import StringIO
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
+
+
class TestHeaderConstruction(unittest.TestCase):
"""testing header construction."""
import os
import pysam
import unittest
-from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
+from TestUtils import make_data_files, BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
import PileupTestUtils
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
+
+
class TestPileupReadSelection(unittest.TestCase):
'''test pileup functionality.'''
import pysam.samtools
from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
check_samtools_view_equal, checkFieldEqual, force_str, \
- get_temp_filename, BAM_DATADIR
+ get_temp_filename, make_data_files, BAM_DATADIR
+
+
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
##################################################
read = load_bam()
self.assertEqual(read.reference_name, "chr1")
- # TOOD
+ # TODO
# def testReadingFromSamFileWithoutHeader(self):
# '''read from samfile without header.
# '''
self.assertEqual(s.header.to_dict(), {'SQ': [{'LN': 1000, 'SN': 'chr1'}]})
def test_bam_without_seq_in_header(self):
- s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header.bam"))
+ s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header.bam"))
self.assertTrue("SQ" in s.header.to_dict())
self.assertTrue("@SQ" in str(s.header))
def test_bam_without_seq_with_null_bytes_in_header(self):
- s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam"))
+ s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header_null_bytes.bam"))
self.assertTrue("SQ" in s.header.to_dict())
self.assertTrue("@SQ" in str(s.header))
return len([a for a in x])
self.assertRaises(IOError, iterall, s)
+ # Ignore closing errors, as s is now in an error state
+ try:
+ s.close()
+ except IOError:
+ pass
+
+
+class TestCorruptBAM(unittest.TestCase):
+ """See pull request 1035."""
+
+ def testCorruptBamIterator(self):
+ """Iterating over ex2_corrupt.bam must raise IOError."""
+ # ex2_corrupt.bam is produced by the test-data Makefile: ex2.bam with
+ # an extra record appended whose block_size is too small.
+ s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex2_corrupt.bam"))
+
+ # Consume the whole iterator so that the corrupt record is reached.
+ def iterall(x):
+ return len([a for a in x])
+
+ self.assertRaises(IOError, iterall, s)
+
COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
self.check_write(read)
-class TestHeader1000Genomes(unittest.TestCase):
-
- '''see issue 110'''
- bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" # noqa
- bambase = "HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" # noqa
-
- def testRead(self):
-
- if not check_url(self.bamfile):
- return
-
- f = pysam.AlignmentFile(self.bamfile, "rb")
- data = f.header.copy()
- self.assertTrue(data)
-
- def tearDown(self):
- if os.path.exists(self.bambase + ".bai"):
- os.unlink(self.bambase + ".bai")
-
-
class TestLargeCigar(unittest.TestCase):
def setUp(self):
# mode = "w"
if __name__ == "__main__":
- # build data files
- print("building data files")
- subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
print("starting tests")
unittest.main()
print("completed tests")
import errno
import unittest
from pysam import AlignmentFile
-from TestUtils import BAM_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR
IS_PYTHON2 = sys.version_info[0] == 2
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
+
+
def alignmentfile_writer_thread(infile, outfile):
def _writer_thread(infile, outfile):
"""read from infile and write to outfile"""
import gzip
import contextlib
import inspect
+import subprocess
import tempfile
import pysam
os.unlink(f)
+def make_data_files(directory):
+ """Run `make -C directory` unless `directory`/all.stamp already exists.
+
+ Raises RuntimeError, with make's captured output in the message, when
+ make exits with a failure status.
+ """
+ what = None
+ try:
+ # all.stamp is the marker for an already completed build
+ if not os.path.exists(os.path.join(directory, "all.stamp")):
+ # stderr is merged into the captured output so it appears in the message
+ subprocess.check_output(["make", "-C", directory], stderr=subprocess.STDOUT)
+ except subprocess.CalledProcessError as e:
+ what = "Making test data in '%s' failed:\n%s" % (directory, force_str(e.output))
+
+ # NOTE(review): raised outside the except block, presumably so that the
+ # RuntimeError is not chained to the CalledProcessError -- confirm.
+ if what is not None:
+ raise RuntimeError(what)
+
+
def load_and_convert(filename, encode=True):
'''load data from filename and convert all fields to string.
import pysam
import shutil
import gzip
-import subprocess
try:
from pathlib import Path
except ImportError:
Path = None
-from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context
+
+
+def setUpModule():
+ """Module-level fixture: make sure the test data in CBCF_DATADIR has been built."""
+ make_data_files(CBCF_DATADIR)
def read_header(filename):
return data
+def read_index_header(filename):
+ """Return the first four decompressed bytes of the gzip-readable file `filename`."""
+ with gzip.open(filename) as infile:
+ # The callers compare this value against b"TBI\1" or b"CSI\1".
+ magic = infile.read(4)
+ return magic
+
+
class TestMissingGenotypes(unittest.TestCase):
filename = "missing_genotypes.vcf"
shutil.copyfile(self.vcf_filename, fn)
pysam.tabix_index(fn, preset="vcf", force=True)
self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
+ self.assertEqual(read_index_header(fn + ".gz.tbi"), b"TBI\1")
self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))
with pysam.VariantFile(fn + ".gz") as inf:
pysam.tabix_index(fn, preset="vcf", force=True, csi=True)
self.assertTrue(os.path.exists(fn + ".gz" + ".csi"))
+ self.assertEqual(read_index_header(fn + ".gz.csi"), b"CSI\1")
self.assertFalse(os.path.exists(fn + ".gz" + ".tbi"))
with pysam.VariantFile(fn + ".gz") as inf:
shutil.copyfile(self.bcf_filename + ".csi", fn + ".csi")
self.assertTrue(os.path.exists(fn + ".csi"))
+ self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
self.assertFalse(os.path.exists(fn + ".tbi"))
with pysam.VariantFile(fn) as inf:
pysam.tabix_index(fn, preset="bcf", force=True, csi=False)
self.assertTrue(os.path.exists(fn + ".csi"))
+ self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
self.assertFalse(os.path.exists(fn + ".tbi"))
with pysam.VariantFile(fn) as inf:
pysam.tabix_index(fn, preset="vcf", force=True, csi=True)
self.assertTrue(os.path.exists(fn + ".csi"))
+ self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
self.assertFalse(os.path.exists(fn + ".tbi"))
with pysam.VariantFile(fn) as inf:
if __name__ == "__main__":
- # build data files
- print("building data files")
- subprocess.call("make -C %s" % CBCF_DATADIR, shell=True)
print("starting tests")
unittest.main()
print("completed tests")
except ImportError:
Path = None
-from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context
+
+
+def setUpModule():
+ """Module-level fixture: make sure the test data in CBCF_DATADIR has been built."""
+ make_data_files(CBCF_DATADIR)
@pytest.fixture
VCFGZ=$(VCF:%.vcf=%.vcf.gz)
BCF=$(VCF:%.vcf=%.bcf)
-all: $(VCFGZ) $(BCF)
+all: all.stamp
+
+all.stamp: $(VCFGZ) $(BCF)
+ touch $@
%.vcf.gz: %.vcf
bgzip < $< > $@
touch $@
clean:
- rm -f *.gz *.tbi *.csi *.bcf
-
+ -rm -f all.stamp *.gz *.tbi *.csi *.bcf
import os
import unittest
import pysam
-from TestUtils import BAM_DATADIR, TABIX_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR
+
+
+def setUpModule():
+ """Module-level fixture: make sure the BAM and tabix test data have been built."""
+ make_data_files(BAM_DATADIR)
+ make_data_files(TABIX_DATADIR)
+
try:
os.unlink('tests/_compile_test.c')
import copy
import shutil
-from TestUtils import check_url, BAM_DATADIR, get_temp_filename
+from TestUtils import check_url, make_data_files, BAM_DATADIR, get_temp_filename
+
+
+def setUpModule():
+ """Module-level fixture: make sure the test data in BAM_DATADIR has been built."""
+ make_data_files(BAM_DATADIR)
class TestFastaFile(unittest.TestCase):
BAI=$(BAM:%.bam=%.bam.bai)
CRAM=ex1.cram ex2.cram ex3.cram
CRAI=$(CRAM:%.cram=%.cram.crai)
-NO_PG:=$(findstring --no-PG,$(shell samtools view))
+NO_PG:=$(findstring --no-PG,$(shell samtools view '-?'))
# ex2.bam - bam file without index
-all: ex1.pileup.gz \
+all: all.stamp
+
+all.stamp: ex1.pileup.gz \
ex1.sam ex1.bam \
ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \
with_md.sam.gz with_md.bam with_md.bam.bai \
example_bai.bam \
rg_with_tab.bam \
ex2_truncated.bam \
+ ex2_corrupt.bam \
empty.bam empty.bam.bai \
explicit_index.bam explicit_index.cram \
faidx_empty_seq.fq.gz \
- ex1.fa.gz ex1.fa.gz.csi \
+ ex1.fa.gz ex1.fa.gz.fai ex1.fa.gz.gzi \
ex1_csi.bam \
example_reverse_complement.bam \
example_dash_in_chr.bam
+ touch $@
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
# samtools view $(NO_PG) -bo $@ -t ex1.fa.fai $<
uncompressed.bam: ex2.sam
- samtools view $(NO_PG) -buS $< > $@
+ samtools view $(NO_PG) -bu -o $@ $<
%.bam: %.sam
- samtools view $(NO_PG) -bS $< > $@
+ samtools view $(NO_PG) -bo $@ $<
%.cram: %.sam
- samtools view $(NO_PG) -bC -T ex1.fa $< > $@
+ samtools view $(NO_PG) -Co $@ -T ex1.fa $<
%.cram.crai: %.cram
samtools index $<
%.sam: %.sam.gz
gunzip < $< > $@
-ex1.fa.fai:ex1.fa
- samtools faidx ex1.fa
+%.fa.fai: %.fa
+ samtools faidx $<
+
+%.fa.gz.fai %.fa.gz.gzi: %.fa.gz
+ samtools faidx $<
ex1.bam:ex1.sam.gz ex1.fa.fai
samtools view $(NO_PG) -bo ex1.bam -t ex1.fa.fai ex1.sam.gz
ex2_truncated.bam: ex2.bam
head -c 124000 ex2.bam > ex2_truncated.bam
+# Append a corrupt read with block_size < sizeof(bam_core_t fields)
+ex2_corrupt.bam: ex2.bam
+ (bgzip -d < $<; printf '\37\0\0\0\1\0\0\0') | bgzip > $@
+
ex1_csi.bam: ex1.bam
cp ex1.bam ex1_csi.bam
samtools index -c ex1_csi.bam
empty.bam: ex2.sam
- grep "^@" $< | samtools view $(NO_PG) -Sb - > $@
+ grep "^@" $< | samtools view $(NO_PG) -bo $@ -
example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam
touch tmp.list
cp ex1.cram $@
clean:
- rm -fr *.bam *.bai *.fai *.pileup* *.cram \
- *~ calDepth *.dSYM pysam_*.sam \
- ex2.sam ex2.sam.gz ex1.sam \
+ rm -fr [a-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \
+ all.stamp *~ calDepth *.dSYM pysam_*.sam \
+ ex2.sam ex2.sam.gz ex1.sam ex1.fa.gz \
with_md.sam.gz \
*.fq.gz
%.fa.gz: %.fa
bgzip < $< > $@
-
-%.fa.gz.csi: %.fa.gz
- samtools faidx $<
Backwards incompatible changes:
================================
-1. Empty cigarstring now returns None (intstead of '')
+1. Empty cigarstring now returns None (instead of '')
2. Empty cigar now returns None (instead of [])
import pysam.bcftools
from TestUtils import checkBinaryEqual, check_lines_equal, \
check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \
- BAM_DATADIR
+ make_data_files, BAM_DATADIR
IS_PYTHON3 = sys.version_info[0] >= 3
+def setUpModule():
+ make_data_files(BAM_DATADIR)
+
+
def run_command(cmd):
'''run a samtools command'''
try:
# Samtools-htslib-API: bam_get_library() not yet implemented
# causes downstream problems
# TODO: The following cause subsequent commands to fail
- # unknow option
+ # unknown option
# "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
# "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
"reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam",
def testStatements(self):
for statement in self.statements:
command = self.get_command(statement, map_to_internal=False)
- # bam2fq differs between version 1.5 and 1.6 - reenable if
+ # bam2fq differs between version 1.5 and 1.6 - re-enable if
# bioconda samtools will be available.
- if command in ("bedcov", "stats", "dict", "bam2fq"):
+ # flagstat differs between versions <=1.12 and >=1.13
+ if command in ("bedcov", "stats", "dict", "bam2fq", "flagstat"):
continue
if (command == "calmd" and
# # "filter -s A ex1.vcf.gz > %(out)s_ex1.filter",
# # exit
# # "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck",
-# # segfauld, used to work wit bcftools 1.3
+# # segfault, used to work with bcftools 1.3
# # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
# "stats ex1.vcf.gz > %(out)s_ex1.stats",
# ]
if __name__ == "__main__":
- # build data files
- print("building data files")
- subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
print("starting tests")
unittest.main()
print("completed tests")
--- /dev/null
+all: all.stamp
+
+all.stamp:
+ touch $@
+
+clean:
+ -rm -f all.stamp
import gzip
import pysam
import unittest
-import subprocess
import glob
import re
from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
- load_and_convert, TABIX_DATADIR, get_temp_filename
+ load_and_convert, make_data_files, TABIX_DATADIR, get_temp_filename
IS_PYTHON3 = sys.version_info[0] >= 3
+def setUpModule():
+ make_data_files(TABIX_DATADIR)
+
+
def myzip_open(infile, mode="r"):
'''open compressed file and decode.'''
if __name__ == "__main__":
- subprocess.call("make -C %s" % TABIX_DATADIR, shell=True)
unittest.main()
import re
import copy
import gzip
-from TestUtils import load_and_convert, TABIX_DATADIR
+from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR
+
+
+def setUpModule():
+ make_data_files(TABIX_DATADIR)
class TestParser(unittest.TestCase):
import pysam
import os
import pytest
-from TestUtils import BAM_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR
+
+
+def setUpModule():
+ make_data_files(BAM_DATADIR)
def test_idxstats_parse_split_lines():