New upstream version 0.17.0+ds

author Nilesh Patra <nilesh@debian.org>

Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)

committer Nilesh Patra <nilesh@debian.org>

Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)
author Nilesh Patra <nilesh@debian.org>
Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)
committer Nilesh Patra <nilesh@debian.org>
Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml

new file mode 100644 (file)

index 0000000..4075f1c
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,152 @@
+name: CI
+
+# on: [push, pull_request]
+on: [pull_request]
+
+jobs:
+  # Build pysam in place from the checkout and run the test suite against the
+  # build directory, for every OS/Python combination left after `exclude`.
+  direct:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu, macos]
+        # Versions are quoted so that YAML keeps them as strings; an unquoted
+        # 3.10 would be read as the float 3.1.
+        python-version: ['2.7', '3.6', '3.7', '3.8', '3.9']
+        exclude:
+          # Run only the latest 2.x and 3.x on macOS
+          - os: macos
+            python-version: '3.6'
+          - os: macos
+            python-version: '3.7'
+          - os: macos
+            python-version: '3.8'
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install prerequisite Python libraries
+        run:  pip install cython pytest pytest-pep8
+
+      - name: Install build prerequisites
+        if:   runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+      - name: Build (directly from checkout)
+        run:  python setup.py build
+
+      - name: Install test prerequisites
+        run: |
+          case $RUNNER_OS in
+          Linux)
+              sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix
+              ;;
+          macOS)
+              brew install -q samtools bcftools
+              ;;
+          esac
+
+      - name: Run tests
+        # The package is not installed in this job, so point Python at the
+        # freshly built build/lib.* directory instead.
+        run: |
+          export PYTHONPATH=$(echo $GITHUB_WORKSPACE/build/lib.*)
+          export REF_PATH=':'
+          pytest
+
+
+  # Create a source distribution, install pysam from that tarball and run the
+  # tests against the installed package; the Linux tarball is kept as an
+  # artifact.
+  sdist:
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu, macos]
+        # Quoted so that YAML keeps the version as a string, not a float.
+        python-version: ['3.9']
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install prerequisite Python libraries
+        run:  pip install cython pytest pytest-pep8
+
+      - name: Install build prerequisites
+        if:   runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+      - name: Create source distribution
+        run:  python setup.py sdist --owner=root --group=root
+
+      - name: Build (via sdist tarball)
+        run:  pip install --verbose --no-deps --no-binary=':all:' pysam-*.tar.gz
+        working-directory: dist
+
+      - name: Install test prerequisites
+        run: |
+          case $RUNNER_OS in
+          Linux)
+              sudo apt-get install -q --no-install-recommends --no-install-suggests samtools bcftools tabix
+              ;;
+          macOS)
+              brew install -q samtools bcftools
+              ;;
+          esac
+
+      - name: Run tests
+        run:  REF_PATH=':' pytest
+
+      - name: Upload sdist tarball
+        if:   runner.os == 'Linux'
+        uses: actions/upload-artifact@v2
+        with:
+          name: sdist
+          path: dist/pysam-*.tar.gz
+          retention-days: 14
+
+
+  # Build and test pysam inside a Conda environment, with libcurl support
+  # disabled in the bundled htslib via HTSLIB_CONFIGURE_OPTIONS.
+  conda:
+    timeout-minutes: 20
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu]
+        # Quoted so that YAML keeps the version as a string, not a float.
+        python-version: ['3.7']
+    defaults:
+      run:
+        shell: bash -l {0}  # needed for conda activation
+    env:
+      HTSLIB_CONFIGURE_OPTIONS: "--disable-libcurl"
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          channel-priority: strict
+          activate-environment: testenv
+          auto-activate-base: false
+          use-only-tar-bz2: true
+
+      - name: Set up Conda and Python ${{ matrix.python-version }}
+        run: |
+          conda config --add channels bioconda --add channels conda-forge
+          conda install python=${{ matrix.python-version }} cython
+
+      - name: Build (directly from checkout)
+        run:  python setup.py install
+
+      - name: Install test prerequisites via Conda
+        run:  conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest
+
+      - name: Run tests
+        run:  REF_PATH=':' pytest
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml

new file mode 100644 (file)

index 0000000..bbc954f
--- /dev/null
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,115 @@
+name: Publish pysam wheels to PyPI and TestPyPI
+
+on:
+  push:
+    branches:
+      - v[0-9]+.[0-9]+.x
+    tags:
+      - v*
+  release:
+    types:
+      - published
+
+jobs:
+  # Build binary wheels with cibuildwheel on each runner image and upload
+  # them to the workflow's default artifact.
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, macos-10.15]  # windows-2019,
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      # This job's matrix has no python-version entry, so the step name states
+      # the host interpreter version directly; the wheels' target versions are
+      # chosen by CIBW_BUILD below.
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+
+      - name: Install prerequisite Python libraries
+        run:  |
+          python -m pip install --upgrade pip
+          pip install cython pytest pytest-pep8
+
+      - name: Build wheels for linux
+        if:   runner.os == 'Linux'
+        uses: pypa/cibuildwheel@v2.1.2
+        env:
+          CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+          # Runs inside the manylinux container, hence yum.
+          CIBW_BEFORE_BUILD: yum install -y libcurl-devel zlib-devel bzip2-devel xz-devel && pip install cython
+          CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
+          CIBW_MANYLINUX_I686_IMAGE: manylinux1
+
+      - name: Build wheels for macos
+        if:   runner.os != 'Linux'
+        uses: pypa/cibuildwheel@v2.1.2
+        env:
+          CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+          CIBW_BEFORE_BUILD: pip install cython
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+
+  # Create the source distribution and upload it to the workflow's default
+  # artifact, next to the wheels.
+  build_sdist:
+
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      matrix:
+        os: [ubuntu, macos]
+        # Quoted so that YAML keeps the version as a string, not a float.
+        python-version: ['3.9']
+
+    steps:
+      - name: Checkout pysam
+        uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install prerequisite Python libraries
+        run:  pip install cython pytest pytest-pep8
+
+      - name: Install build prerequisites
+        if:   runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -q --no-install-recommends --no-install-suggests libcurl4-openssl-dev
+
+      - name: Create source distribution
+        run:  python setup.py sdist
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: dist/*.tar.gz
+
+  # Collect the wheels and the sdist built above and publish them: tag pushes
+  # go to TestPyPI, published GitHub releases go to PyPI.
+  upload_pypi:
+
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Get Artifacts
+        uses: actions/download-artifact@v2
+        with:
+          # The upload steps above set no artifact name, so this is the name
+          # they are expected to have been stored under.
+          name: artifact
+          path: dist
+
+      # The publish action is referenced through its release/v1 branch rather
+      # than the mutable, sunset master branch.
+      - name: Publish distribution to Test PyPI
+        if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          repository_url: https://test.pypi.org/legacy/
+
+      - name: Publish distribution to PyPI
+        if: github.event_name == 'release' && github.event.action == 'published'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
+
diff --git a/.gitignore b/.gitignore

index b07a532d0958bdbeb9909e9a7aabc2cffc5dbeed..6ec2d26cf6243876425b25809b56bb9927ebf311 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -17,11 +17,14 @@ tests/cbcf_data
  tests/tabix_data
  
  samtools/config.h
+samtools/samtools_config_vars.h
  bcftools/config.h
  htslib/config.status
  htslib/config.h
  htslib/config.log
  htslib/config.mk
+htslib/config_vars.h
+htslib/htscodecs.mk
  htslib/htslib.pc.tmp
  htslib/htslib-uninstalled.pc
  pysam/config.py
diff --git a/.travis.disabled.yml b/.travis.disabled.yml

new file mode 100644 (file)

index 0000000..5b7bcc8
--- /dev/null
+++ b/.travis.disabled.yml
@@ -0,0 +1,114 @@
+os:
+  - linux
+  - osx
+
+language: c
+
+stages:
+  - test
+  - name: deploy
+    if: tag IS present
+
+env:
+  matrix:
+    - CONDA_PY=2.7
+    - CONDA_PY=3.6
+    - CONDA_PY=3.7
+    - CONDA_PY=3.8
+  global:
+    - PYSAM_LINKING_TEST=1
+    - TWINE_USERNAME=grepall
+    - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
+
+# Install and script steps shared by the cibuildwheel deploy jobs; merged
+# into each of them with `<<: *cibw_common`.
+_cibw_common: &cibw_common
+  addons: {}
+  install:
+    # The requirement is quoted so that the shell does not treat `>=1.1.0`
+    # as an output redirection to a file named `=1.1.0`.
+    - python3 -m pip install 'cibuildwheel>=1.1.0' twine
+  script:
+    - set -e
+    - cibuildwheel --output-dir dist
+    - twine check dist/*
+    - twine upload --skip-existing dist/*
+
+_cibw_linux: &cibw_linux
+  stage: deploy
+  os: linux
+  language: python
+  python: '3.5'
+  services:
+    - docker
+  <<: *cibw_common
+
+_cibw_linux_aarch64: &cibw_linux_aarch64
+  stage: deploy
+  os: linux
+  arch: arm64
+  language: python
+  python: '3.9'
+  services:
+    - docker
+  <<: *cibw_common
+
+matrix:
+  include:
+    - stage: deploy
+      os: linux
+      language: python
+      python: '3.5'
+      addons:
+        apt:
+          packages:
+            - gcc
+            - g++
+            - libcurl4-openssl-dev  # for libcurl support in sdist
+            - libssl-dev  # for s3 support in sdist
+      install:
+        - python3 -m pip install Cython twine
+      script:
+        - set -e
+        - python3 setup.py build_ext --inplace
+        - python3 setup.py sdist
+        - twine check dist/*
+        - twine upload --skip-existing dist/*
+    - <<: *cibw_linux
+      env:
+        - CIBW_BUILD="*_x86_64"
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+    - <<: *cibw_linux
+      env:
+        - CIBW_BUILD="*_i686"
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+    - <<: *cibw_linux_aarch64
+      env:
+        - CIBW_BUILD="*_aarch64"
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+    - stage: deploy
+      os: osx
+      language: generic
+      env:
+        - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+        - CIBW_TEST_COMMAND='python -c "import pysam"'
+      <<: *cibw_common
+
+addons:
+  apt:
+    packages:
+    - gcc
+    - g++
+
+script:
+  - ./devtools/run_tests_travis.sh
+
+notifications:
+  email:
+    - andreas.heger@gmail.com
diff --git a/.travis.yml b/.travis.yml

deleted file mode 100644 (file)

index 47ce194..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,97 +0,0 @@
-os:
-  - linux
-  - osx
-
-language: c
-
-stages:
-  - test
-  - name: deploy
-    if: tag IS present
-
-env:
-  matrix:
-    - CONDA_PY=2.7
-    - CONDA_PY=3.6
-    - CONDA_PY=3.7
-    - CONDA_PY=3.8
-  global:
-    - PYSAM_LINKING_TEST=1
-    - TWINE_USERNAME=grepall
-    - secure: bTbky3Un19NAl62lix8bMLmBv9IGNhFkRXlZH+B253nYub7jwQwPQKum3ct9ea+XHJT5//uM0B8WAF6eyugpNkPQ7+S7SEH5BJuCt30nv6qvGhSO2AffZKeHEDnfW2kqGrivn87TqeomlSBlO742CD/V0wOIUwkTT9tutd+E7FU=
-
-_cibw_common: &cibw_common
-  addons: {}
-  install:
-    - python3 -m pip install cibuildwheel>=1.1.0 twine
-  script:
-    - set -e
-    - cibuildwheel --output-dir dist
-    - twine check dist/*
-    - twine upload --skip-existing dist/*
-
-_cibw_linux: &cibw_linux
-  stage: deploy
-  os: linux
-  language: python
-  python: '3.5'
-  services:
-    - docker
-  <<: *cibw_common
-
-matrix:
-  include:
-    - stage: deploy
-      os: linux
-      language: python
-      python: '3.5'
-      addons:
-        apt:
-          packages:
-            - gcc
-            - g++
-            - libcurl4-openssl-dev  # for libcurl support in sdist
-            - libssl-dev  # for s3 support in sdist
-      install:
-        - python3 -m pip install Cython twine
-      script:
-        - set -e
-        - python3 setup.py build_ext --inplace
-        - python3 setup.py sdist
-        - twine check dist/*
-        - twine upload --skip-existing dist/*
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_x86_64"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - <<: *cibw_linux
-      env:
-        - CIBW_BUILD="*_i686"
-        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_REPAIR_WHEEL_COMMAND_LINUX='auditwheel repair -L . -w {dest_dir} {wheel}'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-    - stage: deploy
-      os: osx
-      language: generic
-      env:
-        - CIBW_BEFORE_BUILD="python -m pip install -r requirements.txt"
-        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
-        - CIBW_TEST_COMMAND='python -c "import pysam"'
-      <<: *cibw_common
-
-addons:
-  apt:
-    packages:
-    - gcc
-    - g++
-
-script:
-  - ./devtools/run_tests_travis.sh
-
-notifications:
-  email:
-    - andreas.heger@gmail.com
diff --git a/AUTHORS b/AUTHORS

index 4b005369a36b843548d0a86bc80ad7ee33112b88..4e9c5eb5f897a0684a49fe0ba1927068ef9720d7 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,8 +1,17 @@
+Many people have contributed to pysam. The list of github contributors
+is the best place to get a full list of authors and their contributions.
+The list and summary below is out-of-date and represents the earlier
+stages of the project.
+
  List of contributors:
  
  Andreas Heger, Tildon Grant Belgard, Florian Finkernagel, Leo
  Goodstadt, Martin Goodson all contributed code to pysam.
  
+John Marshall has been looking after pysam and its community for
+several years, as well as making many code contributions and improving
+the engineering of pysam.
+
  Kevin B. Jacobs implemented a Cython wrapper for the VCF/BCF
  reader/writer in htslib.
  
diff --git a/INSTALL b/INSTALL

index 963612515786496fd8386a539bad9f165bbff236..5016dcc75ba4af2cf4b07e3521460ce0ecf8f336 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -47,7 +47,7 @@ features. If these fail, for example due to missing library
  dependencies (`libcurl`, `libcrypto`), it will fall back to
  conservative defaults.
  
-Options can be passed to the configure script explicitely by
+Options can be passed to the configure script explicitly by
  setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
  For example::
  
diff --git a/MANIFEST.in b/MANIFEST.in

index aaacb22534893edb4e5e9ecdfbe3c5840356cb4f..25e9a1a2c844d920ea0b47d97d6472891c88f7fc 100644 (file)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -16,45 +16,39 @@ include pysam/libc*.pyx
  include pysam/libc*.c
  include pysam/*.c
  include pysam/*.h
+exclude pysam/config.py
+
+include win32/*.[ch]
  
  # exclude tests from pypi tar-ball - they
  # require additional data
  prune tests/
  
  # samtools
-include samtools/configure
-include samtools/config.mk.in
-include samtools/config.h.in
-include samtools/*.h
-include samtools/*.c
-exclude samtools/config.h
-include samtools/*/*.h
+include samtools/LICENSE samtools/README samtools/lz4/LICENSE
+recursive-include samtools *.[ch]
+include samtools/version.sh
+exclude samtools/*config*.h
  
  # bcftools
-include bcftools/*.h
-include bcftools/*.c
-exclude bcftools/config.h
+include bcftools/LICENSE bcftools/README
+include bcftools/*.[ch]
+include bcftools/version.sh
+exclude bcftools/*config*.h
  
  # htslib
-include htslib/*.c
-include htslib/*.h
-include htslib/INSTALL
-include htslib/NEWS
-exclude htslib/config.h
-include htslib/Makefile
-include htslib/htslib_vars.mk
-include htslib/configure
-include htslib/config.mk.in
-include htslib/config.h.in
-include htslib/htslib.pc.in
-include htslib/htslib/*.h
-include htslib/cram/*.c
-include htslib/cram/*.h
-include htslib/os/*.c
-include htslib/os/*.h
+include htslib/LICENSE htslib/README
+recursive-include htslib *.[ch]
+exclude htslib/*config*.h
+
+include htslib/configure.ac htslib/m4/*.m4 htslib/*.in
+include htslib/configure htslib/version.sh
+include htslib/Makefile htslib/*.mk
+exclude htslib/config.mk htslib/htscodecs.mk
+
  include cy_build.py
-include pysam.py
  include requirements.txt
  
  # documentation
-include doc/*
+include doc/*.py doc/*.rst
+include doc/Makefile doc/make.bat
diff --git a/NEWS b/NEWS

index 49ce4857c47038225156f64d3e132508880847f4..75d9249c6dbcfa3e4ddbaeba4a18f3677083d199 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -209,7 +209,7 @@ Release 0.11.2
  ==============
  
  This release wraps htslib/samtools/bcfools versions 1.4.1 in response
-to a security fix in these libraries. Additionaly the following
+to a security fix in these libraries. Additionally the following
  issues have been fixed:
  
  * [#452] add GFF3 support for tabix parsers
@@ -330,7 +330,7 @@ Overview
  --------
  
  The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enchancements and bugfixes. See below for a detailed list.
+enhancements and bugfixes. See below for a detailed list.
  
  `Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
  comes with additional capabilities for remote file access which depend
@@ -373,7 +373,7 @@ Detailed release notes
       and code bloat.
     * run configure for the builtin htslib library in order to detect
       optional libraries such as libcurl. Configure behaviour can be
-     controlled by setting the environmet variable
+     controlled by setting the environment variable
       HTSLIB_CONFIGURE_OPTIONS.
  * get_reference_sequence() now returns the reference sequence and not
    something looking like it. This bug had effects on
@@ -576,7 +576,7 @@ Other changes:
  
  Backwards incompatible changes
  
-* Empty cigarstring now returns None (intstead of '')
+* Empty cigarstring now returns None (instead of '')
  * Empty cigar now returns None (instead of [])
  * When using the extension classes in cython modules, AlignedRead
    needs to be substituted with AlignedSegment.
diff --git a/README.rst b/README.rst

index 4efa827033e794ff375b12df6999f18989c85f5b..368984a42e4167fa05112439bd97ca9969bafe10 100644 (file)
--- a/README.rst
+++ b/README.rst
@@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured
  compilation options. Especially for OS X this will potentially save a
  lot of trouble.
  
-The current version of pysam wraps 3rd-party code from htslib-1.10.2, samtools-1.10, and bcftools-1.10.2.
+The current version of pysam wraps 3rd-party code from htslib-1.13, samtools-1.13, and bcftools-1.13.
  
  Pysam is available through `pypi
  <https://pypi.python.org/pypi/pysam>`_. To install, type::
diff --git a/bcftools/HMM.c b/bcftools/HMM.c

index 70ad8d62639b9dc55cf98fe56822c6a74f7e15a9..c2d302f4b26f79a26e87f974408e3d11addc3409 100644 (file)
--- a/bcftools/HMM.c
+++ b/bcftools/HMM.c
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2014-2015 Genome Research Ltd.
+   Copyright (c) 2014-2017 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c

index 2280c0db7830dfe72504c7fc329af029585f707b..d039367a9d6634b73c604563dde9848469010356 100644 (file)
--- a/bcftools/HMM.c.pysam.c
+++ b/bcftools/HMM.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* The MIT License
  
-   Copyright (c) 2014-2015 Genome Research Ltd.
+   Copyright (c) 2014-2017 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/HMM.h b/bcftools/HMM.h

index 70c9cb885c9506da97089e7479648a7a91e74fe7..3a6cab30a30bb583a741ff68c4785fe96b1117bf 100644 (file)
--- a/bcftools/HMM.h
+++ b/bcftools/HMM.h
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2014-2015 Genome Research Ltd.
+   Copyright (c) 2014-2016 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/LICENSE b/bcftools/LICENSE

index 75aeb6c0c379a9835dfd4113ea6bdb5d0e85710d..f223b096a9d6c4435a75130d560701b5a3bbb7da 100644 (file)
--- a/bcftools/LICENSE
+++ b/bcftools/LICENSE
@@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license.
  
  The MIT/Expat License
  
-Copyright (C) 2012-2014 Genome Research Ltd.
+Copyright (C) 2012-2021 Genome Research Ltd.
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -746,3 +746,28 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
+
+-----------------------------------------------------------------------------
+
+LICENSE for utlist.h
+
+Copyright (c) 2007-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/bcftools/README b/bcftools/README

index 5cb1bbd9623f81f283a1210fed2ba5cd64cf8d0a..fff0cb73e0438858945e7a968a602c442a336247 100644 (file)
--- a/bcftools/README
+++ b/bcftools/README
@@ -3,3 +3,25 @@ SAMtools) and manipulating VCF and BCF files.  The program is intended
  to replace the Perl-based tools from vcftools.
  
  See INSTALL for building and installation instructions.
+
+Please cite this paper when using BCFtools for your publications:
+
+Twelve years of SAMtools and BCFtools
+Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li
+GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008
+
+@article{10.1093/gigascience/giab008,
+    author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
+    title = "{Twelve years of SAMtools and BCFtools}",
+    journal = {GigaScience},
+    volume = {10},
+    number = {2},
+    year = {2021},
+    month = {02},
+    abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods. The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines. Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed >1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
+    issn = {2047-217X},
+    doi = {10.1093/gigascience/giab008},
+    url = {https://doi.org/10.1093/gigascience/giab008},
+    note = {giab008},
+    eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
+}
diff --git a/bcftools/abuf.c b/bcftools/abuf.c

new file mode 100644 (file)

index 0000000..5e45e9e
--- /dev/null
+++ b/bcftools/abuf.c
@@ -0,0 +1,713 @@
+/* The MIT License
+
+   Copyright (c) 2021 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <assert.h>
+#include <strings.h>
+#include <htslib/vcf.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "abuf.h"
+#include "rbuf.h"
+
+typedef enum   // merge rule selector; not referenced in the visible part of this file
+{
+    M_FIRST, M_SUM     // NOTE(review): presumably "take the first value" vs "sum the values" - confirm against the code that uses them
+}
+merge_rule_t;
+
+typedef struct     // one primitive REF>ALT change produced by _atomize_allele()
+{
+    kstring_t ref, alt;     // REF and ALT sequence of the atom, built up base by base with kputc()
+    int ial;        // the index of the original ALT allele, 1-based
+    int beg, end;   // 0-based inclusive offsets to ref,alt
+}
+atom_t;
+
+typedef struct     // working state for splitting one input record into output rows
+{
+    bcf1_t *rec;        // the record being split, set by _split_table_init()
+    int nori, nout;     // number of ALTs in the input, and VCF rows on output
+    uint8_t *tbl;       // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
+    uint8_t *overlaps;  // is the star allele needed for this variant?
+    atom_t **atoms;     // atoms[i] is the atom that opened output row i, see _split_table_new()
+    int matoms, mtbl, moverlaps;    // allocated sizes of atoms, tbl and overlaps, maintained by hts_expand()
+    char *info_tag;     // INFO tag name stored by abuf_set(INFO_TAG); the pointer is copied, not the string
+}
+split_t;
+
+struct _abuf_t     // the atomization buffer behind the opaque abuf_t handle
+{
+    abuf_opt_t mode;        // operating mode given to abuf_init(); only SPLIT gets past the check there
+    split_t split;          // state of the record currently being split
+    atom_t *atoms;          // atoms collected by _atomize_allele()
+    int natoms, matoms;     // number of atoms in use, and allocated size of the atoms array
+    const bcf_hdr_t *hdr;   // header passed to abuf_init()
+    bcf_hdr_t *out_hdr;     // output header; same as hdr unless replaced via abuf_set(BCF_HDR)
+    bcf1_t **vcf;       // dimensions stored in rbuf
+    rbuf_t rbuf;            // initialised with size 0 in abuf_init(); rbuf.m bounds the vcf array in abuf_destroy()
+
+    kstring_t tmps;             // scratch space, freed in abuf_destroy(); use not visible in this part of the file
+    void *tmp, *tmp2;           // scratch space, freed in abuf_destroy(); use not visible in this part of the file
+    int32_t *gt, *tmpi;         // NOTE(review): presumably genotype and integer work arrays - confirm against the code that fills them
+    int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2;    // NOTE(review): presumably used (n*) and allocated (m*) sizes of the arrays above - confirm
+    int star_allele;            // set to 1 in abuf_init(), changeable via abuf_set(STAR_ALLELE)
+};
+
+/*
+    Allocate and initialize an atomization buffer.
+
+    hdr  .. header of the input records; also used as the output header
+            until it is replaced with abuf_set(buf,BCF_HDR,..)
+    mode .. only SPLIT is implemented, any other value is reported via error()
+
+    Returns the new buffer, to be released with abuf_destroy().  Allocation
+    failure is reported via error() rather than dereferencing NULL.
+*/
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
+{
+    if ( mode!=SPLIT ) error("todo\n");
+    abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
+    if ( !buf ) error("Error: failed to allocate memory for abuf_t\n");    // error() is expected not to return
+    buf->hdr  = hdr;
+    buf->out_hdr = (bcf_hdr_t*) hdr;    // const is cast away; the header is only modified if abuf_set(INFO_TAG) is called without abuf_set(BCF_HDR) first
+    buf->mode = mode;
+    buf->star_allele = 1;
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+/*
+    Release a buffer created by abuf_init() together with everything it owns:
+    the atoms and their strings, the split tables, the buffered VCF records
+    and the scratch arrays.  The headers and the INFO tag name are not freed
+    here.  Passing NULL is a no-op, as with free().
+*/
+void abuf_destroy(abuf_t *buf)
+{
+    if ( !buf ) return;
+
+    int i;
+    // walk all allocated slots (matoms), not just the ones in use (natoms),
+    // so that strings held by currently unused slots are released too
+    for (i=0; i<buf->matoms; i++)
+    {
+        free(buf->atoms[i].ref.s);
+        free(buf->atoms[i].alt.s);
+    }
+    free(buf->atoms);
+    free(buf->split.atoms);     // array of pointers into buf->atoms, the atoms themselves were freed above
+    free(buf->split.overlaps);
+    free(buf->split.tbl);
+    for (i=0; i<buf->rbuf.m; i++)
+        if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]);
+    free(buf->vcf);
+    free(buf->gt);
+    free(buf->tmpi);
+    free(buf->tmp);
+    free(buf->tmp2);
+    free(buf->tmps.s);
+    free(buf);
+}
+
+/*
+    Set an option on the buffer.  The value is always passed by address and
+    is dereferenced according to the key; keys not listed below are ignored.
+
+    BCF_HDR     .. bcf_hdr_t**, the header to use for output
+    INFO_TAG    .. char**, name of the INFO tag describing the original
+                   variant; its definition is added to the output header that
+                   is current at the time of the call, so set BCF_HDR first
+    STAR_ALLELE .. int*, stored in buf->star_allele
+*/
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
+{
+    if ( key==BCF_HDR )
+        buf->out_hdr = *((bcf_hdr_t**)value);
+    else if ( key==INFO_TAG )
+    {
+        char *tag = *((char**)value);
+        buf->split.info_tag = tag;      // the pointer is kept, the string is not copied
+        bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",tag);
+    }
+    else if ( key==STAR_ALLELE )
+        buf->star_allele = *((int*)value);
+}
+
+/*
+    Split alleles into primitives, e.g.
+        CC>TT  becomes  C>T,C>T
+        GCGT>GTGA  becomes C>T,T>A
+
+    There is no sequence alignment, just trimming and hungry matching
+    from left side.
+*/
+static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)     // append the atoms of ALT allele number ial (index into rec->d.allele) to buf->atoms
+{
+    // Trim identical sequence from right
+    char *ref = rec->d.allele[0];
+    char *alt = rec->d.allele[ial];
+    int rlen = strlen(ref);
+    int alen = strlen(alt);
+    while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--;     // stops as soon as either allele is down to a single base
+    int Mlen = rlen > alen ? rlen : alen;      // length of the longer trimmed allele
+
+    atom_t *atom = NULL;       // the atom most recently created for this allele; indel bases are appended to it
+    int i;
+    for (i=0; i<Mlen; i++)
+    {
+        char refb = i<rlen ? ref[i] : '-';     // '-' stands for a position past the end of the trimmed allele
+        char altb = i<alen ? alt[i] : '-';
+        if ( refb!=altb )
+        {
+            if ( refb=='-' || altb=='-' )      // one allele has run out: this base extends the current atom as an indel
+            {
+                assert(atom);      // the extension needs an atom opened at an earlier position
+                if ( altb!='-' ) kputc(altb, &atom->alt);
+                if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; }     // only REF bases move the end offset
+            }
+            else       // both alleles have a base here and they differ: open a new single-base atom
+            {
+                buf->natoms++;
+                hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+                atom = &buf->atoms[buf->natoms-1];
+                atom->ref.l = 0;       // start from empty strings; NOTE(review): slots look to be reused across records - confirm where natoms is reset
+                atom->alt.l = 0;
+                kputc(refb, &atom->ref);
+                kputc(altb, &atom->alt);
+                atom->beg = atom->end = i;
+                atom->ial = ial;
+            }
+            continue;
+        }
+        if ( i+1>=rlen || i+1>=alen )   // is the next base a deletion?
+        {
+            buf->natoms++;     // matching base that is the last one of at least one allele: open an atom here so that following indel bases have something to extend
+            hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+            atom = &buf->atoms[buf->natoms-1];
+            atom->ref.l = 0;
+            atom->alt.l = 0;
+            kputc(refb, &atom->ref);
+            kputc(altb, &atom->alt);
+            atom->beg = atom->end = i;
+            atom->ial = ial;
+        }
+    }
+}
+/*
+    Three-way comparison of two atoms by start offset, then by REF and ALT
+    sequence (case-insensitive). Returns 0 only when both atoms have the same
+    start, REF and ALT; any non-zero value means they differ.
+*/
+static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg != b->beg ) return a->beg < b->beg ? -1 : 1;
+    int diff = strcasecmp(a->ref.s,b->ref.s);
+    return diff ? diff : strcasecmp(a->alt.s,b->alt.s);
+}
+/*
+    qsort() comparator for atoms. Identical atoms originating from different source ALTs are ordered by
+    the ALT index so that the sort result is the same on all platforms. This matters for reproducibility
+    of tests: even though such atoms are consistent, the VCF annotations are currently taken from the
+    one that comes first.
+*/
+static int _cmp_atoms(const void *aptr, const void *bptr)
+{
+    const atom_t *lhs = (const atom_t*) aptr;
+    const atom_t *rhs = (const atom_t*) bptr;
+    int order = _atoms_inconsistent(lhs,rhs);
+    if ( order ) return order;
+    return (lhs->ial > rhs->ial) - (lhs->ial < rhs->ial);
+}
+/*
+    Prepare the split table for a new input record: remember the record, reset the
+    number of output rows and make sure the buffers can hold up to natoms rows.
+    Only the overlap flags are cleared here; table rows are zeroed as they are added.
+*/
+static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
+{
+    split_t *sp = &buf->split;
+    sp->rec  = rec;
+    sp->nout = 0;
+    sp->nori = rec->n_allele - 1;   // number of ALT alleles in the input record
+    hts_expand(uint8_t,sp->nori*natoms,sp->mtbl,sp->tbl);
+    hts_expand(atom_t*,natoms,sp->matoms,sp->atoms);
+    hts_expand(uint8_t,natoms,sp->moverlaps,sp->overlaps);
+    memset(sp->overlaps,0,natoms*sizeof(*sp->overlaps));
+}
+/*
+    Add a new output row for a unique atom: the row is zeroed and the column of
+    the atom's source ALT is set to 1.
+*/
+static void _split_table_new(abuf_t *buf, atom_t *atom)
+{
+    split_t *sp = &buf->split;
+    int irow = sp->nout++;
+    sp->atoms[irow] = atom;
+
+    uint8_t *row = sp->tbl + irow*sp->nori;
+    if ( sp->nori > 0 ) memset(row,0,sp->nori*sizeof(*row));
+    row[atom->ial-1] = 1;
+}
+/*
+    Record that `atom` overlaps the output row iout: the column of the atom's source
+    ALT becomes 1 if the atom is identical to the row's atom and 2 if it differs.
+    The row is also flagged as overlapped.
+*/
+static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
+{
+    uint8_t *row = buf->split.tbl + iout*buf->split.nori;
+    int differs  = _atoms_inconsistent(atom,buf->split.atoms[iout]);
+    if ( differs ) row[atom->ial-1] = 2;
+    else row[atom->ial-1] = 1;
+    buf->split.overlaps[iout] = 1;
+}
+#if 0
+// Debugging aids, compiled out by default.
+
+// Print the split table to stderr, one line per output row: the 1-based position,
+// REF and ALT of the atom followed by the table value of each original ALT.
+static void _split_table_print(abuf_t *buf)
+{
+    int i,j;
+    for (i=0; i<buf->split.nout; i++)
+    {
+        atom_t *atom = buf->split.atoms[i];
+        uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+        fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
+        for (j=0; j<buf->split.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]);
+        fprintf(stderr,"\n");
+    }
+}
+// Print all atoms, including duplicates from different source ALTs, to stderr.
+static void _split_table_print_atoms(abuf_t *buf)
+{
+    int i;
+    for (i=0; i<buf->natoms; i++)
+    {
+        atom_t *atom = &buf->atoms[i];
+        // %p requires a void* argument
+        fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,(void*)atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
+    }
+}
+#endif
+// Returns non-zero if the output row iout should carry the star allele: the
+// feature must be enabled and the row must have been flagged as overlapped.
+static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
+{
+    return buf->star_allele ? buf->split.overlaps[iout] : 0;
+}
+// Translate an allele index of the input record (0=REF, 1..nori=ALTs) into the
+// split table value stored for output row irow. REF always maps to 0.
+static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
+{
+    if ( ial==0 ) return 0;
+    const uint8_t *row = buf->split.tbl + irow*buf->split.nori;
+    return row[ial-1];
+}
+/*
+    Append one output record per split table row to the ring buffer and fill in
+    the first VCF columns: CHROM, POS, ID, REF, ALT, QUAL and FILTER.
+*/
+static void _split_table_set_chrom_qual(abuf_t *buf)
+{
+    bcf1_t *src = buf->split.rec;
+    int irow;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        // Take the next slot of the ring buffer, reusing the record left there if any
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        int islot = rbuf_append(&buf->rbuf);
+        if ( !buf->vcf[islot] ) buf->vcf[islot] = bcf_init1();
+        bcf1_t *dst = buf->vcf[islot];
+        bcf_clear1(dst);
+
+        // CHROM, POS (shifted by the atom's offset within REF) and ID
+        atom_t *atom = buf->split.atoms[irow];
+        dst->rid = src->rid;
+        dst->pos = src->pos + atom->beg;
+        bcf_update_id(buf->out_hdr, dst, src->d.id);
+
+        // REF, ALT and, when the row was flagged as overlapped, the star allele
+        const char *alleles[3] = { atom->ref.s, atom->alt.s, "*" };
+        int nalleles = _has_star_allele(buf,irow) ? 3 : 2;
+        bcf_update_alleles(buf->out_hdr, dst, alleles, nalleles);
+
+        // QUAL
+        if ( !bcf_float_is_missing(src->qual) )
+            dst->qual = src->qual;
+        else
+            bcf_float_set_missing(dst->qual);
+
+        // FILTER
+        bcf_update_filter(buf->out_hdr, dst, src->d.flt, src->d.n_flt);
+    }
+}
+/*
+    Transfer one INFO annotation of the input record to all output records.
+
+      buf   .. the buffer holding the split table and the output records
+      info  .. the INFO field of the input record to transfer
+      mode  .. merging rule; M_SUM makes integer Number=R values of ALT alleles
+               that map to the same output atom be added up
+
+    Number=fixed and Number=. tags are copied verbatim; Number=A and Number=R tags
+    are subset to the output allele(s), with a missing value appended for the star
+    allele. Unsupported combinations are silently dropped (see the todo's below).
+*/
+static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
+{
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
+    if ( len==BCF_VL_G ) return;                                                // todo: Number=G INFO tags
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;     // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
+
+    // buf->mtmp is kept in bytes, while the htslib getter is given the capacity in
+    // number of items for numeric types (4 bytes each), hence the conversions.
+    // NOTE(review): for non-numeric types the possibly increased capacity is not
+    // written back to buf->mtmp; this looks harmless but wasteful - confirm
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp;
+    int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4;
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+
+    // The scratch buffer for the subset values is kept at least as big as the input buffer
+    if ( buf->mtmp2 < buf->mtmp )
+    {
+        buf->tmp2  = realloc(buf->tmp2, buf->mtmp);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp);
+        buf->mtmp2 = buf->mtmp;
+    }
+
+    // A 4-byte "missing" value of the right type (int32 or float), appended for the star allele
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    int iout,i;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int star_allele = _has_star_allele(buf,iout);
+        int ret = 0;
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            // One value per ALT: take the value of the atom's source ALT
+            int iori = buf->split.atoms[iout]->ial - 1;
+            assert( iori<nval );
+            memcpy(buf->tmp2,buf->tmp+4*iori,4);
+            if ( star_allele )
+                memcpy(buf->tmp2+4,missing_ptr,4);
+            ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            memcpy(buf->tmp2,buf->tmp,4);   // REF contributes to all records
+            int iori = buf->split.atoms[iout]->ial;
+            assert( iori<nval && iori<=buf->split.nori );
+            memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+            if ( type==BCF_HT_INT && mode==M_SUM ) 
+            {
+                // Add the values of the subsequent ALTs that match this atom. The
+                // table columns are 0-based, so column i belongs to value i+1 and
+                // the loop starts just past the atom's own source ALT.
+                uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                for (i=iori; i<buf->split.nori; i++)
+                {
+                    if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
+                }
+            }
+            if ( star_allele )
+                memcpy(buf->tmp2+8,missing_ptr,4);
+            ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+    }
+}
+/*
+    Annotate each output record with the INFO tag buf->split.info_tag describing
+    the original record, in the format CHR|POS|REF|ALT|USED_ALT_IDX. The last
+    field is the comma-separated list of 1-based indexes of the original ALTs
+    that match the output record (split table value 1).
+*/
+static void _split_table_set_history(abuf_t *buf)
+{
+    int i,j;
+    bcf1_t *rec = buf->split.rec;
+
+    // The part shared by all output records: "CHR|POS|REF|ALT1,ALT2,...|"
+    buf->tmps.l = 0;
+    ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
+    for (i=1; i<rec->n_allele; i++)
+    {
+        kputs(rec->d.allele[i],&buf->tmps);
+        kputc(i+1<rec->n_allele ? ',' : '|',&buf->tmps);    // the last ALT closes the field
+    }
+    int len = buf->tmps.l;
+
+    for (i=0; i<buf->split.nout; i++)
+    {
+        // Rewind to the end of the shared prefix and append the indexes used by this row
+        buf->tmps.l = len;
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
+        uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+        for (j=0; j<buf->split.nori; j++)
+        {
+            if ( ptr[j]!=1 ) continue;
+            kputw(j+1,&buf->tmps);
+            kputc(',',&buf->tmps);
+        }
+        buf->tmps.s[--buf->tmps.l] = 0;     // strip the trailing comma
+        if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
+            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+    }
+}
+/*
+    Translate FORMAT/GT of the input record for each output record. Allele
+    indexes are remapped through the split table; an allele that overlaps the
+    output atom but differs from it (table value 2) becomes the star allele
+    when the output row has one and a missing allele otherwise. Phasing is
+    preserved.
+*/
+static void _split_table_set_gt(abuf_t *buf)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    bcf1_t *rec = buf->split.rec;
+    buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
+    if ( buf->ngt<=0 ) return;
+    hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
+
+    int max_ploidy = buf->ngt/nsmpl;
+    int irow,ismpl,k;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,irow)];
+        int star_allele = _has_star_allele(buf,irow);
+        for (ismpl=0; ismpl<nsmpl; ismpl++)
+        {
+            int32_t *src = buf->gt   + ismpl*max_ploidy;
+            int32_t *dst = buf->tmpi + ismpl*max_ploidy;
+            for (k=0; k<max_ploidy; k++)
+            {
+                // Missing alleles and vector-end padding are copied as they are
+                if ( src[k]==bcf_int32_vector_end || bcf_gt_is_missing(src[k]) )
+                {
+                    dst[k] = src[k];
+                    continue;
+                }
+                int iori = bcf_gt_allele(src[k]);
+                if ( iori<0 || iori>=rec->n_allele )
+                    error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
+
+                int inew = _split_table_get_ial(buf,irow,iori);
+                if ( inew==2 && !star_allele )
+                    dst[k] = bcf_gt_missing;
+                else if ( bcf_gt_is_phased(src[k]) )
+                    dst[k] = bcf_gt_phased(inew);
+                else
+                    dst[k] = bcf_gt_unphased(inew);
+            }
+        }
+        bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt);
+    }
+}
+static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
+    if ( tag[0]=='G' && tag[1]=='T' && !tag[2] )        // FORMAT/GT
+    {
+        _split_table_set_gt(buf);
+        return;
+    }
+
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;     // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
+
+    const int num_size = 4;
+    assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp;  // number of items
+    int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size;                 // number of bytes
+
+    if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return;      // not haploid nor diploid
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+
+    // Increase buffer size to accommodate star allele
+    int nval1 = nval / nsmpl;
+    mtmp = buf->mtmp;
+    if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
+    else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+
+    if ( buf->mtmp2 < mtmp )
+    {
+        buf->tmp2  = realloc(buf->tmp2, mtmp);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
+        buf->mtmp2 = mtmp;
+    }
+
+    int iout, i, j;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        int star_allele = _has_star_allele(buf,iout);
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int ret = 0; 
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            int iori = buf->split.atoms[iout]->ial - 1;
+            assert( iori<nval );
+            for (i=0; i<nsmpl; i++)
+            {
+                void *src = buf->tmp  + nval1*num_size*i;
+                void *dst = buf->tmp2 + num_size*i*(star_allele+1);
+                memcpy(dst,src+iori*num_size,num_size);
+                if ( star_allele )
+                    memcpy(dst+num_size,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            assert( iori<=nval );
+            for (i=0; i<nsmpl; i++)
+            {
+                void *src = buf->tmp  + nval1*num_size*i;
+                void *dst = buf->tmp2 + num_size*i*(star_allele+2);
+                memcpy(dst,src,num_size);
+                memcpy(dst+num_size,src+iori*num_size,num_size);
+
+                if ( type==BCF_HT_INT && mode==M_SUM )
+                {
+                    uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                    for (j=iori; j<buf->split.nori; j++)
+                        if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
+                }
+                if ( star_allele )
+                    memcpy(dst+num_size*2,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
+        }
+        else if ( len==BCF_VL_G )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            int i01  = bcf_alleles2gt(0,iori);
+            int i11  = bcf_alleles2gt(iori,iori);
+            assert( iori<nval );
+            #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    type_t *src = (type_t*)buf->tmp + i*nval1; \
+                    type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
+                    int n=0; /* determine ploidy of this genotype */ \
+                    while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
+                    src = (type_t*)buf->tmp + i*nval1; \
+                    memcpy(dst++,src,sizeof(type)); \
+                    int nmiss = 0, nend = 0; \
+                    if ( n==rec->n_allele ) /* haploid */ \
+                    { \
+                        memcpy(dst++,src+iori,sizeof(type)); \
+                        if ( star_allele ) { nmiss = 1; nend = 3; } \
+                        else nend = 1; \
+                    } \
+                    else if ( n==nval1 ) \
+                    { \
+                        memcpy(dst++,src+i01,sizeof(type)); \
+                        memcpy(dst++,src+i11,sizeof(type)); \
+                        if ( star_allele ) nmiss = 3; \
+                    } \
+                    else if ( n==1 && is_missing ) \
+                    { \
+                        if ( star_allele ) nend = 5; \
+                        else nend = 2; \
+                    } \
+                    else  \
+                        error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
+                                bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
+                    for (j=0; j<nmiss; j++) { set_missing; dst++; } \
+                    for (j=0; j<nend; j++) { set_vector_end; dst++; } \
+                } \
+            }
+            switch (type)
+            {
+                case BCF_HT_INT:  BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
+                case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
+                default: error("Unexpected case: %d\n", type);
+            }
+            #undef BRANCH
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+    }
+}
+/*
+    Returns 1 if the sequence consists solely of the characters A,C,G,T,N in
+    either case (an empty string qualifies too), 0 otherwise.
+*/
+static inline int _is_acgtn(char *seq)
+{
+    while ( *seq )
+    {
+        // toupper() is only defined for values representable as unsigned char (and EOF),
+        // so a plain char, which can be negative, must be cast first
+        int c = toupper((unsigned char)*seq);
+        if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
+        seq++;
+    }
+    return 1;
+}
+/*
+    The atomization works as follows:
+    - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
+      alignment is performed, just greedy trimming of the end, then from left. This operation returns
+      a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
+      to the original REF allele
+    - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
+      with a single ALT.
+    - For each new VCF record determine how to translate the original allele index (iori) to this new
+      record:
+        - 1: the original allele matches the atom
+        - 0: the original allele does not overlap this atom or the overlapping part matches the REF
+             allele
+        - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
+      The mapping is encoded in a table with columns corresponding to the original ALTs and rows
+      to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
+      atoms and 2's for overlapping mismatching atoms.
+
+    Note that different ALT alleles can result in the same atom (the same output line) and this code
+    does not know how to reconcile possibly conflicting VCF annotations. This could be improved
+    and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
+    be summed for the same atomized output allele. However, this level of complexity is not addressed
+    in this initial draft. Higher priority for now is to provide the inverse "join" operation.
+
+    Update 2021-04-09:
+        Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
+        Note that the code will fail on missing values (todo) and it needs to be generalized and
+        made customizable.
+*/
+/*
+    Atomize one input record and append the resulting records to the buffer.
+    Records with no ALT allele, or with an ALT containing characters other than
+    A,C,G,T,N, are not split: a copy of the record is appended instead.
+*/
+void _abuf_split(abuf_t *buf, bcf1_t *rec)
+{
+    int i,j;
+
+    // Decide if the record can be atomized at all
+    int pass_through = rec->n_allele < 2;
+    for (i=1; !pass_through && i<rec->n_allele; i++)
+        if ( !_is_acgtn(rec->d.allele[i]) ) pass_through = 1;
+    if ( pass_through )
+    {
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        j = rbuf_append(&buf->rbuf);
+        if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]);
+        buf->vcf[j] = bcf_dup(rec);
+        return;
+    }
+
+    // Collect atoms from all ALTs and sort them so that identical atoms become adjacent
+    buf->natoms = 0;
+    for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i);
+    qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms);
+    _split_table_init(buf,rec,buf->natoms);
+    for (i=0; i<buf->natoms; i++)
+    {
+        if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue;
+        _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
+    }
+    for (i=0; i<buf->natoms; i++)
+    {
+        // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles
+        atom_t *atom = &buf->atoms[i];
+        for (j=0; j<buf->split.nout; j++)
+        {
+            atom_t *out = buf->split.atoms[j];
+            if ( atom == out ) continue;            // table already set to 1
+            if ( atom->beg > out->end ) continue;   // cannot overlap this output atom
+            if ( atom->end < out->beg ) break;      // this atom is ahead of all subsequent output records
+            _split_table_overlap(buf, j, atom);
+        }
+    }
+    assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
+
+    // Create the output records, transferring all annotations:
+    // CHROM-QUAL
+    _split_table_set_chrom_qual(buf);
+
+    // INFO
+    for (i=0; i<rec->n_info; i++)
+    {
+        // this implementation of merging rules is temporary: to be generalized and made customizable through the API
+        merge_rule_t mode = M_FIRST;
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.info[i].key);
+        if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
+
+        _split_table_set_info(buf, &rec->d.info[i], mode);
+    }
+
+    // Set INFO tag showing the original record
+    if ( buf->split.info_tag )
+        _split_table_set_history(buf);
+
+    // FORMAT
+    for (i=0; i<rec->n_fmt; i++)
+    {
+        // this implementation of merging rules is temporary: to be generalized and made customizable through the API
+        merge_rule_t mode = M_FIRST;
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,rec->d.fmt[i].id);
+        if ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) mode = M_SUM;
+
+        _split_table_set_format(buf, &rec->d.fmt[i], mode);
+    }
+}
+
+/*
+    Push a record into the buffer. The record is fully unpacked and, in the
+    SPLIT mode, atomized; in any other mode nothing else is done.
+*/
+void abuf_push(abuf_t *buf, bcf1_t *rec)
+{
+    bcf_unpack(rec, BCF_UN_ALL);
+    if ( buf->mode!=SPLIT ) return;
+    _abuf_split(buf,rec);
+}
+
+/*
+    Remove the oldest record from the buffer and return it, or return NULL when
+    the buffer is empty. The buffer slot keeps pointing to the returned record
+    and is reused by later pushes, so the caller must not keep the record
+    across calls to abuf_push().
+
+    flush_all currently makes no difference, one record is returned per call
+    either way.
+*/
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
+{
+    (void) flush_all;
+
+    if ( buf->rbuf.n==0 ) return NULL;
+
+    int i = rbuf_shift(&buf->rbuf);
+    return buf->vcf[i];
+}
+
diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c

new file mode 100644 (file)

index 0000000..811ef10
--- /dev/null
+++ b/bcftools/abuf.c.pysam.c
@@ -0,0 +1,715 @@
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+   Copyright (c) 2021 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <assert.h>
+#include <strings.h>
+#include <htslib/vcf.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "abuf.h"
+#include "rbuf.h"
+
+// How to combine annotation values of several original ALT alleles that end up
+// in the same output record
+typedef enum
+{
+    M_FIRST, M_SUM
+}
+merge_rule_t;
+
+// A primitive variant (atom) obtained by decomposing a single ALT allele, see _atomize_allele()
+typedef struct
+{
+    kstring_t ref, alt;     // REF and ALT sequence of the atom
+    int ial;        // the index of the original ALT allele, 1-based
+    int beg, end;   // 0-based inclusive offsets to ref,alt
+}
+atom_t;
+
+// The state of the SPLIT operation for the record that is being processed
+typedef struct
+{
+    bcf1_t *rec;        // the input record
+    int nori, nout;     // number of ALTs in the input, and VCF rows on output
+    uint8_t *tbl;       // nori columns, nout rows; indicates allele contribution to output rows, see "The atomization works as follows" below
+    uint8_t *overlaps;  // is the star allele needed for this variant?
+    atom_t **atoms;     // the atom of each output row, pointers into the array of all atoms
+    int matoms, mtbl, moverlaps;    // allocated sizes of atoms, tbl and overlaps
+    char *info_tag;     // the INFO tag to annotate with the original variant, set via abuf_set(); NULL if not requested
+}
+split_t;
+
+// The buffer: configuration, the output records and reusable scratch memory
+struct _abuf_t
+{
+    abuf_opt_t mode;            // the operation mode, currently only SPLIT
+    split_t split;
+    atom_t *atoms;              // all atoms of the current record, including duplicates from different ALTs
+    int natoms, matoms;         // number of atoms used and allocated
+    const bcf_hdr_t *hdr;       // the header of the input records
+    bcf_hdr_t *out_hdr;         // the header used for the output records, the same as hdr unless set via abuf_set()
+    bcf1_t **vcf;       // dimensions stored in rbuf
+    rbuf_t rbuf;
+
+    kstring_t tmps;             // temporary string
+    void *tmp, *tmp2;           // temporary buffers for annotation values
+    int32_t *gt, *tmpi;         // temporary buffers for genotypes
+    int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2;
+    int star_allele;            // use the star allele in the output? Enabled by default in abuf_init()
+};
+
+/*
+    Create a new buffer for records that use the header `hdr`. Only the SPLIT
+    mode is implemented. The output header defaults to `hdr` and the star
+    allele is enabled; both can be changed with abuf_set(). The buffer must be
+    freed with abuf_destroy().
+*/
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode)
+{
+    if ( mode!=SPLIT ) error("todo\n");
+    abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t));
+    // NOTE(review): relies on error() not returning, as elsewhere in this file - confirm
+    if ( !buf ) error("Failed to alloc %d bytes\n", (int)sizeof(abuf_t));
+    buf->hdr  = hdr;
+    buf->out_hdr = (bcf_hdr_t*) hdr;
+    buf->mode = mode;
+    buf->star_allele = 1;
+    rbuf_init(&buf->rbuf, 0);
+    return buf;
+}
+
+/*
+    Free the buffer and everything it owns, including the records still held in
+    the ring buffer. Passing NULL is a no-op. The headers and the INFO tag
+    string are not freed here.
+*/
+void abuf_destroy(abuf_t *buf)
+{
+    if ( !buf ) return;
+
+    int i;
+    // All allocated slots are visited, not only those in use: slots are
+    // recycled without freeing their strings
+    for (i=0; i<buf->matoms; i++)
+    {
+        free(buf->atoms[i].ref.s);
+        free(buf->atoms[i].alt.s);
+    }
+    free(buf->atoms);
+    free(buf->split.atoms);
+    free(buf->split.overlaps);
+    free(buf->split.tbl);
+    for (i=0; i<buf->rbuf.m; i++)
+        if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]);
+    free(buf->vcf);
+    free(buf->gt);
+    free(buf->tmpi);
+    free(buf->tmp);
+    free(buf->tmp2);
+    free(buf->tmps.s);
+    free(buf);
+}
+
+/*
+    Set an option. `value` points to a variable holding the new value:
+        BCF_HDR     .. bcf_hdr_t*, the header to use for the output records
+        INFO_TAG    .. char*, the INFO tag to annotate with the original variant. The
+                       pointer is stored as it is, the string is not copied, and the tag
+                       definition is added to the current output header
+        STAR_ALLELE .. int, use the star allele in the output?
+    Other keys are ignored.
+*/
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value)
+{
+    if ( key==BCF_HDR )
+        buf->out_hdr = *((bcf_hdr_t**)value);
+    else if ( key==INFO_TAG )
+    {
+        char *tag = *((char**)value);
+        buf->split.info_tag = tag;
+        bcf_hdr_printf(buf->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",tag);
+    }
+    else if ( key==STAR_ALLELE )
+        buf->star_allele = *((int*)value);
+}
+
+/*
+    Split an allele into primitives, e.g.
+        CC>TT  becomes  C>T,C>T
+        GCGT>GTGA  becomes C>T,T>A
+
+    There is no sequence alignment: identical sequence is trimmed from the
+    right and the remainder is matched greedily from the left. Every atom
+    found is appended to buf->atoms, tagged with the source ALT index `ial`
+    and with 0-based offsets relative to the start of REF.
+*/
+static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial)
+{
+    const char *ref = rec->d.allele[0];
+    const char *alt = rec->d.allele[ial];
+
+    // Trim identical sequence from the right, always leaving at least one base in each allele
+    int rlen = strlen(ref);
+    int alen = strlen(alt);
+    while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) { rlen--; alen--; }
+    int max_len = alen < rlen ? rlen : alen;
+
+    atom_t *cur = NULL;     // the atom started most recently
+    int pos;
+    for (pos=0; pos<max_len; pos++)
+    {
+        // '-' stands for a position past the end of the (trimmed) allele
+        char rbase = pos<rlen ? ref[pos] : '-';
+        char abase = pos<alen ? alt[pos] : '-';
+
+        if ( rbase!=abase && (rbase=='-' || abase=='-') )
+        {
+            // One of the alleles has run out: grow the current atom into an indel
+            assert(cur);
+            if ( abase!='-' ) kputc(abase, &cur->alt);
+            if ( rbase!='-' ) { kputc(rbase, &cur->ref); cur->end++; }
+            continue;
+        }
+
+        // A matching base starts an atom only if the shorter allele ends right
+        // after it, i.e. if it is the anchor base of an indel
+        if ( rbase==abase && pos+1<rlen && pos+1<alen ) continue;
+
+        // Start a new atom, either a mismatch or an indel anchor. The kstrings
+        // of a recycled slot are reset rather than freed.
+        buf->natoms++;
+        hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+        cur = &buf->atoms[buf->natoms-1];
+        cur->ref.l = 0;
+        cur->alt.l = 0;
+        kputc(rbase, &cur->ref);
+        kputc(abase, &cur->alt);
+        cur->beg = cur->end = pos;
+        cur->ial = ial;
+    }
+}
+/*
+    Three-way comparison of two atoms by start offset, then by REF and ALT
+    sequence (case-insensitive). Returns 0 only when both atoms have the same
+    start, REF and ALT; any non-zero value means they differ.
+*/
+static int _atoms_inconsistent(const atom_t *a, const atom_t *b)
+{
+    if ( a->beg != b->beg ) return a->beg < b->beg ? -1 : 1;
+    int diff = strcasecmp(a->ref.s,b->ref.s);
+    return diff ? diff : strcasecmp(a->alt.s,b->alt.s);
+}
+/*
+    qsort() comparator for atoms. Identical atoms originating from different source ALTs are ordered by
+    the ALT index so that the sort result is the same on all platforms. This matters for reproducibility
+    of tests: even though such atoms are consistent, the VCF annotations are currently taken from the
+    one that comes first.
+*/
+static int _cmp_atoms(const void *aptr, const void *bptr)
+{
+    const atom_t *lhs = (const atom_t*) aptr;
+    const atom_t *rhs = (const atom_t*) bptr;
+    int order = _atoms_inconsistent(lhs,rhs);
+    if ( order ) return order;
+    return (lhs->ial > rhs->ial) - (lhs->ial < rhs->ial);
+}
+/*
+    Prepare the split table for a new input record: remember the record, reset the
+    number of output rows and make sure the buffers can hold up to natoms rows.
+    Only the overlap flags are cleared here; table rows are zeroed as they are added.
+*/
+static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms)
+{
+    split_t *sp = &buf->split;
+    sp->rec  = rec;
+    sp->nout = 0;
+    sp->nori = rec->n_allele - 1;   // number of ALT alleles in the input record
+    hts_expand(uint8_t,sp->nori*natoms,sp->mtbl,sp->tbl);
+    hts_expand(atom_t*,natoms,sp->matoms,sp->atoms);
+    hts_expand(uint8_t,natoms,sp->moverlaps,sp->overlaps);
+    memset(sp->overlaps,0,natoms*sizeof(*sp->overlaps));
+}
+/*
+    Add a new output row for a unique atom: the row is zeroed and the column of
+    the atom's source ALT is set to 1.
+*/
+static void _split_table_new(abuf_t *buf, atom_t *atom)
+{
+    split_t *sp = &buf->split;
+    int irow = sp->nout++;
+    sp->atoms[irow] = atom;
+
+    uint8_t *row = sp->tbl + irow*sp->nori;
+    if ( sp->nori > 0 ) memset(row,0,sp->nori*sizeof(*row));
+    row[atom->ial-1] = 1;
+}
+/*
+    Record that `atom` overlaps the output row iout: the column of the atom's source
+    ALT becomes 1 if the atom is identical to the row's atom and 2 if it differs.
+    The row is also flagged as overlapped.
+*/
+static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom)
+{
+    uint8_t *row = buf->split.tbl + iout*buf->split.nori;
+    int differs  = _atoms_inconsistent(atom,buf->split.atoms[iout]);
+    if ( differs ) row[atom->ial-1] = 2;
+    else row[atom->ial-1] = 1;
+    buf->split.overlaps[iout] = 1;
+}
+#if 0
+// Debugging aids, compiled out by default.
+
+// Print the split table to bcftools_stderr, one line per output row: the 1-based
+// position, REF and ALT of the atom followed by the table value of each original ALT.
+static void _split_table_print(abuf_t *buf)
+{
+    int i,j;
+    for (i=0; i<buf->split.nout; i++)
+    {
+        atom_t *atom = buf->split.atoms[i];
+        uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+        fprintf(bcftools_stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s);
+        for (j=0; j<buf->split.nori; j++) fprintf(bcftools_stderr,"\t%d",(int)ptr[j]);
+        fprintf(bcftools_stderr,"\n");
+    }
+}
+// Print all atoms, including duplicates from different source ALTs, to bcftools_stderr.
+static void _split_table_print_atoms(abuf_t *buf)
+{
+    int i;
+    for (i=0; i<buf->natoms; i++)
+    {
+        atom_t *atom = &buf->atoms[i];
+        // %p requires a void* argument
+        fprintf(bcftools_stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,(void*)atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end);
+    }
+}
+#endif
+// Returns non-zero if the output row iout should carry the star allele: the
+// feature must be enabled and the row must have been flagged as overlapped.
+static inline uint8_t _has_star_allele(abuf_t *buf, int iout)
+{
+    return buf->star_allele ? buf->split.overlaps[iout] : 0;
+}
+// Translate an allele index of the input record (0=REF, 1..nori=ALTs) into the
+// split table value stored for output row irow. REF always maps to 0.
+static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial)
+{
+    if ( ial==0 ) return 0;
+    const uint8_t *row = buf->split.tbl + irow*buf->split.nori;
+    return row[ial-1];
+}
+/*
+    Append one output record per split table row to the ring buffer and fill in
+    the first VCF columns: CHROM, POS, ID, REF, ALT, QUAL and FILTER.
+*/
+static void _split_table_set_chrom_qual(abuf_t *buf)
+{
+    bcf1_t *src = buf->split.rec;
+    int irow;
+    for (irow=0; irow<buf->split.nout; irow++)
+    {
+        // Take the next slot of the ring buffer, reusing the record left there if any
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        int islot = rbuf_append(&buf->rbuf);
+        if ( !buf->vcf[islot] ) buf->vcf[islot] = bcf_init1();
+        bcf1_t *dst = buf->vcf[islot];
+        bcf_clear1(dst);
+
+        // CHROM, POS (shifted by the atom's offset within REF) and ID
+        atom_t *atom = buf->split.atoms[irow];
+        dst->rid = src->rid;
+        dst->pos = src->pos + atom->beg;
+        bcf_update_id(buf->out_hdr, dst, src->d.id);
+
+        // REF, ALT and, when the row was flagged as overlapped, the star allele
+        const char *alleles[3] = { atom->ref.s, atom->alt.s, "*" };
+        int nalleles = _has_star_allele(buf,irow) ? 3 : 2;
+        bcf_update_alleles(buf->out_hdr, dst, alleles, nalleles);
+
+        // QUAL
+        if ( !bcf_float_is_missing(src->qual) )
+            dst->qual = src->qual;
+        else
+            bcf_float_set_missing(dst->qual);
+
+        // FILTER
+        bcf_update_filter(buf->out_hdr, dst, src->d.flt, src->d.n_flt);
+    }
+}
+// Transfer one INFO annotation from the original record (buf->split.rec) to every atomized
+// output record:
+//  - Number=fixed/variable: the values are copied unchanged
+//  - Number=A: the value of the original ALT allele the atom was derived from
+//  - Number=R: the REF value and the value of the atom's original ALT allele. For integer tags
+//    and mode==M_SUM, values of the subsequent original ALT alleles marked 1 in the translation
+//    table (alleles matching the same atom) are added to the ALT value
+// When the output record carries the star allele, a missing value is appended for it.
+// Not supported and silently skipped: Number=G tags, Number=A,R,G strings, 64-bit integers.
+static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
+{
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
+    if ( len==BCF_VL_G ) return;                                                // todo: Number=G INFO tags
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;     // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
+
+    // buf->mtmp is kept in bytes, while the htslib getter counts 4-byte items for numeric types
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp;
+    int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4;
+
+    // The values could not be retrieved (negative return value, for example when the tag was
+    // marked for removal): skip the tag rather than passing a negative count and stale buffer
+    // contents to bcf_update_info()
+    if ( nval<0 ) return;
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+
+    // tmp2 receives at most three 4-byte values (REF, ALT, star allele); make sure they always
+    // fit, even if tmp was allocated for fewer values
+    int mtmp2 = buf->mtmp < 3*4 ? 3*4 : buf->mtmp;
+    if ( buf->mtmp2 < mtmp2 )
+    {
+        buf->tmp2  = realloc(buf->tmp2, mtmp2);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp2);
+        buf->mtmp2 = mtmp2;
+    }
+
+    // 4-byte pattern of the missing value, integer by default and float for Type=Float tags
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    int iout,i;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int star_allele = _has_star_allele(buf,iout);
+        int ret = 0;
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            int iori = buf->split.atoms[iout]->ial - 1;     // 0-based index into the Number=A values
+            assert( iori<nval );
+            memcpy(buf->tmp2,buf->tmp+4*iori,4);
+            if ( star_allele )
+                memcpy(buf->tmp2+4,missing_ptr,4);
+            ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            memcpy(buf->tmp2,buf->tmp,4);   // REF contributes to all records
+            int iori = buf->split.atoms[iout]->ial;
+            assert( iori<nval && iori<=buf->split.nori );
+            memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+            if ( type==BCF_HT_INT && mode==M_SUM )
+            {
+                // table column i corresponds to the original allele i+1, so the loop visits the
+                // alleles following iori
+                uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                for (i=iori; i<buf->split.nori; i++)
+                {
+                    if ( tbl[i]==1 ) ((int32_t*)buf->tmp2)[1] += ((int32_t*)buf->tmp)[i+1];
+                }
+            }
+            if ( star_allele )
+                memcpy(buf->tmp2+8,missing_ptr,4);
+            ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
+    }
+}
+// Annotate every output record with the INFO tag buf->split.info_tag describing the original
+// record. The value has the form CHROM|POS|REF|ALT1,ALT2,..|LIST, where LIST is the
+// comma-separated list of 1-based indexes of the original ALT alleles marked 1 in the
+// record's row of the translation table.
+static void _split_table_set_history(abuf_t *buf)
+{
+    int i,j;
+    bcf1_t *rec = buf->split.rec;
+
+    // The part shared by all output records: CHROM|POS|REF|ALT1,ALT2,..|
+    buf->tmps.l = 0;
+    ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]);
+    for (i=1; i<rec->n_allele; i++)
+    {
+        kputs(rec->d.allele[i],&buf->tmps);
+        kputc(',',&buf->tmps);
+    }
+    buf->tmps.s[buf->tmps.l-1] = '|';   // turn the trailing separator into the field delimiter
+    size_t len = buf->tmps.l;
+
+    for (i=0; i<buf->split.nout; i++)
+    {
+        buf->tmps.l = len;              // rewind to the end of the shared part
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
+        uint8_t *ptr = buf->split.tbl + i*buf->split.nori;
+        for (j=0; j<buf->split.nori; j++)
+        {
+            if ( ptr[j]!=1 ) continue;
+            kputw(j+1,&buf->tmps);
+            kputc(',',&buf->tmps);
+        }
+        // Remove the trailing comma. If no allele was listed, there is no comma to remove and
+        // the shared part including its delimiter must be left intact
+        if ( buf->tmps.l > len ) buf->tmps.l--;
+        buf->tmps.s[buf->tmps.l] = 0;
+        if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 )
+            error("An error occurred while updating INFO/%s\n",buf->split.info_tag);
+    }
+}
+// Translate FORMAT/GT of the original record for every atomized output record. Each original
+// allele index is replaced by its value in the translation table, keeping the phase bit. An
+// allele which translates to 2 is set to missing unless the record carries the star allele.
+// Missing alleles and vector-end padding are copied as they are.
+// Does nothing if there are no samples or the genotypes cannot be retrieved.
+static void _split_table_set_gt(abuf_t *buf)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    bcf1_t *rec = buf->split.rec;
+    buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt);
+    if ( buf->ngt<=0 ) return;
+    hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi);
+
+    int max_ploidy = buf->ngt/nsmpl;    // number of GT values per sample, the same for all output records
+    int iout,i,j;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int star_allele = _has_star_allele(buf,iout);
+        int32_t *src = buf->gt, *dst = buf->tmpi;
+        for (i=0; i<nsmpl; i++)
+        {
+            for (j=0; j<max_ploidy; j++)
+            {
+                if ( src[j]==bcf_int32_vector_end || bcf_gt_is_missing(src[j]) )
+                {
+                    dst[j] = src[j];
+                    continue;
+                }
+                int iori = bcf_gt_allele(src[j]);
+                if ( iori<0 || iori>=rec->n_allele )
+                    error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1);
+                int ial = _split_table_get_ial(buf,iout,iori);
+                if ( ial==2 && !star_allele )
+                    dst[j] = bcf_gt_missing;
+                else
+                    dst[j] = bcf_gt_is_phased(src[j]) ? bcf_gt_phased(ial) : bcf_gt_unphased(ial);
+            }
+            src += max_ploidy;
+            dst += max_ploidy;
+        }
+        // Fail loudly like the INFO and FORMAT setters do, rather than silently dropping the genotypes
+        if ( bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt)!=0 )
+            error("An error occurred while updating FORMAT/GT\n");
+    }
+}
+// Transfer one FORMAT annotation from the original record (buf->split.rec) to every atomized
+// output record. FORMAT/GT is delegated to _split_table_set_gt(). For the other tags:
+//  - Number=fixed/variable: the values are copied unchanged
+//  - Number=A: per sample, the value of the original ALT allele the atom was derived from
+//  - Number=R: per sample, the REF value and the value of the atom's original ALT allele. For
+//    integer tags and mode==M_SUM, values of the subsequent original ALT alleles marked 1 in
+//    the translation table are added to the ALT value
+//  - Number=G: per sample, the values of the genotypes formed by REF and the atom's original
+//    ALT allele; only haploid and diploid data are handled
+// When the output record carries the star allele, missing values are added for it.
+// Not supported and silently skipped: Number=A,R,G strings and 64-bit integers.
+static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt, merge_rule_t mode)
+{
+    int nsmpl = bcf_hdr_nsamples(buf->hdr);
+    if ( !nsmpl ) return;
+
+    const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
+    if ( tag[0]=='G' && tag[1]=='T' && !tag[2] )        // FORMAT/GT
+    {
+        _split_table_set_gt(buf);
+        return;
+    }
+
+    int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
+    int len  = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
+    if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return;     // todo: Number=A,R,G for strings
+    if ( type==BCF_HT_LONG ) return;                                            // todo: 64bit integers
+
+    // Integers and floats are both copied as opaque 4-byte items
+    const int num_size = 4;
+    assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
+    int32_t missing = bcf_int32_missing;
+    void *missing_ptr = (void*)&missing;
+    if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+
+    bcf1_t *rec = buf->split.rec;
+    int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp;  // number of items
+    int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
+    if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size;                 // number of bytes
+
+    // The values could not be retrieved (negative return value, for example when the tag was
+    // marked for removal): skip the tag rather than passing a negative count to bcf_update_format()
+    if ( nval<0 ) return;
+
+    if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return;      // not haploid nor diploid
+
+    // Check for incorrect number of values. Note this check does not consider all values missing
+    // and will remove annotations that don't pass.
+    if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+
+    // Increase buffer size to accommodate star allele
+    int nval1 = nval / nsmpl;
+    mtmp = buf->mtmp;
+    if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
+    else if ( len==BCF_VL_G )
+    {
+        // up to 3*(1+star_allele)=6 values are written per sample, which can exceed nval1+3
+        int ngval = nval1+3 < 6 ? 6 : nval1+3;
+        if ( mtmp < num_size*nsmpl*ngval ) mtmp = num_size*nsmpl*ngval;
+    }
+
+    if ( buf->mtmp2 < mtmp )
+    {
+        buf->tmp2  = realloc(buf->tmp2, mtmp);
+        if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp);
+        buf->mtmp2 = mtmp;
+    }
+
+    int iout, i, j;
+    for (iout=0; iout<buf->split.nout; iout++)
+    {
+        int star_allele = _has_star_allele(buf,iout);
+        bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)];
+        int ret = 0;
+        if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
+        else if ( len==BCF_VL_A )
+        {
+            int iori = buf->split.atoms[iout]->ial - 1;     // 0-based index into the sample's Number=A values
+            assert( iori<nval );
+            for (i=0; i<nsmpl; i++)
+            {
+                void *src = buf->tmp  + nval1*num_size*i;
+                void *dst = buf->tmp2 + num_size*i*(star_allele+1);
+                memcpy(dst,src+iori*num_size,num_size);
+                if ( star_allele )
+                    memcpy(dst+num_size,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
+        }
+        else if ( len==BCF_VL_R )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            assert( iori<=nval );
+            for (i=0; i<nsmpl; i++)
+            {
+                void *src = buf->tmp  + nval1*num_size*i;
+                void *dst = buf->tmp2 + num_size*i*(star_allele+2);
+                memcpy(dst,src,num_size);                           // REF contributes to all records
+                memcpy(dst+num_size,src+iori*num_size,num_size);
+
+                if ( type==BCF_HT_INT && mode==M_SUM )
+                {
+                    // table column j corresponds to the original allele j+1, so the loop visits
+                    // the alleles following iori
+                    uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
+                    for (j=iori; j<buf->split.nori; j++)
+                        if ( tbl[j]==1 ) ((int32_t*)dst)[1] += ((int32_t*)src)[j+1];
+                }
+                if ( star_allele )
+                    memcpy(dst+num_size*2,missing_ptr,num_size);
+            }
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
+        }
+        else if ( len==BCF_VL_G )
+        {
+            int iori = buf->split.atoms[iout]->ial;
+            int i01  = bcf_alleles2gt(0,iori);
+            int i11  = bcf_alleles2gt(iori,iori);
+            assert( iori<nval );
+            /* Each sample receives 3*(1+star_allele) values: the values copied from the original */
+            /* record, then missing values for the star allele, then vector-end padding */
+            #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end) { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    type_t *src = (type_t*)buf->tmp + i*nval1; \
+                    type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \
+                    int n=0; /* determine ploidy of this genotype */ \
+                    while ( n<nval1 && !(is_vector_end) ) { n++; src++; } \
+                    src = (type_t*)buf->tmp + i*nval1; \
+                    memcpy(dst++,src,sizeof(type_t)); \
+                    int nmiss = 0, nend = 0; \
+                    if ( n==rec->n_allele ) /* haploid */ \
+                    { \
+                        memcpy(dst++,src+iori,sizeof(type_t)); \
+                        if ( star_allele ) { nmiss = 1; nend = 3; } \
+                        else nend = 1; \
+                    } \
+                    else if ( n==nval1 ) \
+                    { \
+                        memcpy(dst++,src+i01,sizeof(type_t)); \
+                        memcpy(dst++,src+i11,sizeof(type_t)); \
+                        if ( star_allele ) nmiss = 3; \
+                    } \
+                    else if ( n==1 && is_missing ) \
+                    { \
+                        if ( star_allele ) nend = 5; \
+                        else nend = 2; \
+                    } \
+                    else  \
+                        error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \
+                                bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \
+                    for (j=0; j<nmiss; j++) { set_missing; dst++; } \
+                    for (j=0; j<nend; j++) { set_vector_end; dst++; } \
+                } \
+            }
+            switch (type)
+            {
+                case BCF_HT_INT:  BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *dst=bcf_int32_missing, *dst=bcf_int32_vector_end); break;
+                case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*dst), bcf_float_set_vector_end(*dst)); break;
+                default: error("Unexpected case: %d\n", type);
+            }
+            #undef BRANCH
+            ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type);
+        }
+        if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag);
+    }
+}
+// Returns 1 if the sequence consists solely of the characters A,C,G,T,N in either case (this
+// includes the empty string) and 0 otherwise
+static inline int _is_acgtn(char *seq)
+{
+    while ( *seq )
+    {
+        // toupper() is defined only for values representable as unsigned char (or EOF), a plain
+        // char with the high bit set must not be passed directly
+        int c = toupper((unsigned char)*seq);
+        if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0;
+        seq++;
+    }
+    return 1;
+}
+/*
+    The atomization works as follows:
+    - Atomize each alternate allele separately by leaving out sequence identical to the reference. No
+      alignment is performed, just greedy trimming of the end, then from left. This operation returns
+      a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets
+      to the original REF allele
+    - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each
+      with a single ALT.
+    - For each new VCF record determine how to translate the original allele index (iori) to this new
+      record:
+        - 1: the original allele matches the atom
+        - 0: the original allele does not overlap this atom or the overlapping part matches the REF
+             allele
+        - 2 (or equivalently "."): there is a mismatch between the original allele and the atom
+      The mapping is encoded in a table with columns corresponding to the original ALTs and rows
+      to the new POS+ALTs (atoms). The table is initialized to 0, then we set 1's for matching
+      atoms and 2's for overlapping mismatching atoms.
+
+    Note that different ALT alleles can result in the same atom (the same output line) and this code
+    does not know how to reconcile possibly conflicting VCF annotations. This could be improved
+    and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should
+    be summed for the same atomized output allele. However, this level of complexity is not addressed
+    in this initial draft. Higher priority for now is to provide the inverse "join" operation.
+
+    Update 2021-04-09:
+        Tags QS,AD are now automatically incremented as they should be, for both INFO and FORMAT.
+        Note that the code will fail on missing values (todo) and it needs to be generalized and
+        made customizable.
+*/
+// Atomize one record and append the resulting records to the ring buffer, following the
+// procedure described in the comment above. Records which are not atomized are appended as
+// an unchanged copy.
+void _abuf_split(abuf_t *buf, bcf1_t *rec)
+{
+    int i,j;
+
+    // Records without an ALT allele, or with any ALT containing characters other than ACGTN
+    // (for example symbolic alleles or "*"), are passed through as a copy
+    int pass_through = rec->n_allele < 2;
+    for (i=1; !pass_through && i<rec->n_allele; i++)
+        pass_through = !_is_acgtn(rec->d.allele[i]);
+    if ( pass_through )
+    {
+        rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf);
+        int islot = rbuf_append(&buf->rbuf);
+        if ( buf->vcf[islot] ) bcf_destroy(buf->vcf[islot]);
+        buf->vcf[islot] = bcf_dup(rec);
+        return;
+    }
+
+    // Break all ALT alleles into atoms and sort them
+    buf->natoms = 0;
+    for (i=1; i<rec->n_allele; i++) _atomize_allele(buf,rec,i);
+    qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms);
+
+    // Start a new output atom whenever the atom differs from its predecessor in the sorted list
+    _split_table_init(buf,rec,buf->natoms);
+    for (i=0; i<buf->natoms; i++)
+    {
+        int is_new = i==0 || _atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]);
+        if ( is_new ) _split_table_new(buf, &buf->atoms[i]);  // add a new unique output atom
+    }
+
+    // Mark overlaps: loop over the sorted list of all atoms, with possible duplicates from
+    // different source ALT alleles, and compare each against the output atoms
+    for (i=0; i<buf->natoms; i++)
+    {
+        atom_t *atom = &buf->atoms[i];
+        for (j=0; j<buf->split.nout; j++)
+        {
+            atom_t *out = buf->split.atoms[j];
+            // the atom itself has the table already set to 1; atoms starting past the end of
+            // this output atom cannot overlap it
+            if ( atom==out || atom->beg > out->end ) continue;
+            if ( atom->end < out->beg ) break;      // this atom is ahead of all subsequent output records
+            _split_table_overlap(buf, j, atom);
+        }
+    }
+    assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
+
+    // Create the output records, transferring all annotations, starting with CHROM-QUAL
+    _split_table_set_chrom_qual(buf);
+
+    // INFO. This implementation of merging rules is temporary: it should be generalized and
+    // made customizable through the API
+    for (i=0; i<rec->n_info; i++)
+    {
+        bcf_info_t *info = &rec->d.info[i];
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
+        merge_rule_t mode = ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) ? M_SUM : M_FIRST;
+        _split_table_set_info(buf, info, mode);
+    }
+
+    // Set INFO tag showing the original record
+    if ( buf->split.info_tag )
+        _split_table_set_history(buf);
+
+    // FORMAT, with the same temporary merging rules as for INFO
+    for (i=0; i<rec->n_fmt; i++)
+    {
+        bcf_fmt_t *fmt = &rec->d.fmt[i];
+        const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id);
+        merge_rule_t mode = ( !strcmp(tag,"QS") || !strcmp(tag,"AD") ) ? M_SUM : M_FIRST;
+        _split_table_set_format(buf, fmt, mode);
+    }
+}
+
+// Push a new record for processing. The record is fully unpacked first. Only the SPLIT mode
+// is acted upon here; in any other mode nothing else happens.
+void abuf_push(abuf_t *buf, bcf1_t *rec)
+{
+    bcf_unpack(rec, BCF_UN_ALL);
+    switch (buf->mode)
+    {
+        case SPLIT: _abuf_split(buf,rec); break;
+        default: break;
+    }
+}
+
+// Return the next buffered record, or NULL if the buffer is empty. The record is taken from
+// the slot selected by rbuf_shift() and remains owned by the buffer.
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all)
+{
+    // flush_all makes no difference at present: both settings take the same path
+    (void) flush_all;
+    if ( !buf->rbuf.n ) return NULL;
+
+    int islot = rbuf_shift(&buf->rbuf);
+    return buf->vcf[islot];
+}
+
diff --git a/bcftools/abuf.h b/bcftools/abuf.h

new file mode 100644 (file)

index 0000000..5fc1e00
--- /dev/null
+++ b/bcftools/abuf.h
@@ -0,0 +1,78 @@
+/* The MIT License
+
+   Copyright (c) 2021 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+/*
+    Atomize/deatomize complex variants
+*/
+
+#ifndef __ABUF_H__
+#define __ABUF_H__
+
+#include <htslib/vcf.h>
+
+typedef struct _abuf_t abuf_t;
+
+// Modes of operation and option keys. The same enum type serves both as the `mode` argument
+// of abuf_init() and as the `key` argument of abuf_set().
+typedef enum
+{
+    NONE,
+
+    // mode of operation, to be passed to abuf_init
+    SPLIT,
+    JOIN,
+
+    // option keys, to be passed to abuf_set / abuf_set_opt
+    BCF_HDR,        // should the records be annotated, a writable bcf header is required
+    INFO_TAG,       // set BCF_HDR first
+    STAR_ALLELE     // 1: use STAR allele (the default), 0: set overlaps to missing
+}
+abuf_opt_t;
+
+// Convenience wrapper around abuf_set(): copies `value` into a temporary variable of the given
+// `type` and passes the address of the temporary. Note that the macro expands to a braced
+// block which declares a local variable named `tmp`.
+#define abuf_set_opt(buf,type,key,value) { type tmp = value; abuf_set(buf, key, (void*)&tmp); }
+// Set the option `key`; `value` points to the new value
+void abuf_set(abuf_t *buf, abuf_opt_t key, void *value);
+
+/*
+ *  abuf_init() - init buffer
+ *  @hdr:   VCF header
+ *  @mode:  mode of operation, SPLIT or JOIN
+ */
+abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode);
+void abuf_destroy(abuf_t *buf);
+
+/*
+ *  abuf_push() - Push a new site for analysis
+ */
+void abuf_push(abuf_t *buf, bcf1_t *rec);
+
+/*
+ *  abuf_flush() - Return next buffered record
+ *  @flush_all: Set to 1 if no more overlapping records are coming (e.g. end of chromosome or end of file),
+ *              the buffer can be emptied.
+ *  return:     The next atomized/deatomized VCF record or NULL if no record is ready. The returned
+ *              structure will be cleaned by abuf.
+ */
+bcf1_t *abuf_flush(abuf_t *buf, int flush_all);
+
+#endif
+
diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c

index d080917aab2b93c19639a30ad4d6df96971da7df..336e2f625dc3d944ae91c2302edb4b3927f0ba73 100644 (file)
--- a/bcftools/bam2bcf.c
+++ b/bcftools/bam2bcf.c
@@ -1,7 +1,7 @@
  /*  bam2bcf.c -- variant calling.
  
      Copyright (C) 2010-2012 Broad Institute.
-    Copyright (C) 2012-2014 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -40,7 +40,8 @@ extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);
  
  #define CAP_DIST 25
  
-bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+                             int delta_baseQ)
  {
      bcf_callaux_t *bca;
      if (theta <= 0.) theta = CALL_DEFTHETA;
@@ -48,6 +49,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
      bca->capQ = 60;
      bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
      bca->min_baseQ = min_baseQ;
+    bca->max_baseQ = max_baseQ;
+    bca->delta_baseQ = delta_baseQ;
      bca->e = errmod_init(1. - theta);
      bca->min_frac = 0.002;
      bca->min_support = 1;
@@ -55,9 +58,13 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
      bca->npos = 100;
      bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
      bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+    bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
+    bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
      bca->nqual = 60;
      bca->ref_mq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->alt_mq  = (int*) malloc(bca->nqual*sizeof(int));
+    bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
+    bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
      bca->ref_bq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->alt_bq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
@@ -69,47 +76,68 @@ void bcf_call_destroy(bcf_callaux_t *bca)
  {
      if (bca == 0) return;
      errmod_destroy(bca->e);
-    if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
-    free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+    if (bca->npos) {
+        free(bca->ref_pos);  free(bca->alt_pos);
+        free(bca->iref_pos); free(bca->ialt_pos);
+        bca->npos = 0;
+    }
+    free(bca->ref_mq); free(bca->alt_mq);
+    free(bca->iref_mq); free(bca->ialt_mq);
+    free(bca->ref_bq); free(bca->alt_bq);
      free(bca->fwd_mqs); free(bca->rev_mqs);
      bca->nqual = 0;
      free(bca->bases); free(bca->inscns); free(bca);
  }
  
  // position in the sequence with respect to the aligned part of the read
-static int get_position(const bam_pileup1_t *p, int *len)
-{
-    int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
-    for (icig=0; icig<p->b->core.n_cigar; icig++)
-    {
-        int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
-        int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
-        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
-            continue;
-        }
-        if ( cig==BAM_CINS )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
+static int get_position(const bam_pileup1_t *p, int *len,
+                        int *sc_len, int *sc_dist) {
+    int i, j, edist = p->qpos + 1;
+    int sc_left = 0, sc_right = 0;
+    int sc_left_dist = -1, sc_right_dist = -1;
+
+    // left end
+    for (i = 0; i < p->b->core.n_cigar; i++) {
+        int cig  = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
              continue;
-        }
-        if ( cig==BAM_CSOFT_CLIP )
-        {
-            iread += ncig;
-            if ( iread<=p->qpos ) edist -= ncig;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_left)
+        sc_left_dist = p->qpos+1 - sc_left;
+    edist -= sc_left;
+
+    // right end
+    for (j = p->b->core.n_cigar-1; j >= i; j--) {
+        int cig  = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
              continue;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_right)
+        sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
+
+    // Distance to nearest soft-clips and length of that clip.
+    if (sc_left_dist >= 0) {
+        if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
+            *sc_len  = sc_left;
+            *sc_dist = sc_left_dist;
          }
-        if ( cig==BAM_CDEL ) continue;
-        if ( cig==BAM_CHARD_CLIP ) continue;
-        if ( cig==BAM_CPAD ) continue;
-        if ( cig==BAM_CREF_SKIP ) continue;
-        fprintf(stderr,"todo: cigar %d\n", cig);
-        assert(0);
-    }
-    *len = n_tot_bases;
+    } else if (sc_right_dist >= 0) {
+        *sc_len  = sc_right;
+        *sc_dist = sc_right_dist;
+    } else {
+        *sc_len  = 0;
+        *sc_dist = 0;
+    }
+
+    *len = p->b->core.l_qseq - sc_left - sc_right;
      return edist;
  }
  
@@ -117,8 +145,12 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
  {
      memset(bca->ref_pos,0,sizeof(int)*bca->npos);
      memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+    memset(bca->iref_pos,0,sizeof(int)*bca->npos);
+    memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
      memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
      memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+    memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
+    memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
      memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
      memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
      memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
@@ -126,13 +158,18 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
      if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
      if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
      if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+    memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
+    memset(bca->ref_scl,  0, 100*sizeof(int));
+    memset(bca->alt_scl,  0, 100*sizeof(int));
+    memset(bca->iref_scl, 0, 100*sizeof(int));
+    memset(bca->ialt_scl, 0, 100*sizeof(int));
  }
  
  /*
      Notes:
-    - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
-        which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
-        Later it's used for multiallelic calling by bcftools -m
+    - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
+        which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
+        Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
      - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
   */
  /*
@@ -150,7 +187,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
      // clean from previous run
      r->ori_depth = 0;
      r->mq0 = 0;
-    memset(r->qsum,0,sizeof(float)*4);
      memset(r->anno,0,sizeof(double)*16);
      memset(r->p,0,sizeof(float)*25);
      r->SCR = 0;
@@ -166,30 +202,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          kroundup32(bca->max_bases);
          bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
      }
+
      // fill the bases array
+    double nqual_over_60 = bca->nqual / 60.0;
+    int ADR_ref_missed[4] = {0};
+    int ADF_ref_missed[4] = {0};
      for (i = n = 0; i < _n; ++i) {
          const bam_pileup1_t *p = pl + i;
          int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+        if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
          if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
          if (p->is_del && !is_indel) continue;
          ++ori_depth;
          if (is_indel)
          {
-            b     = p->aux>>16&0x3f;
-            baseQ = q = p->aux&0xff;
-            // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
-            // still only an approximation, but gives more accurate AD counts and calls correctly
-            // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
-            if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
-            seqQ  = p->aux>>8&0xff;
+            b = p->aux>>16&0x3f;
+            seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+            if (q < bca->min_baseQ)
+            {
+                if (!p->indel && b < 4)
+                {
+                    if (bam_is_rev(p->b))
+                        ADR_ref_missed[b]++;
+                    else
+                        ADF_ref_missed[b]++;
+                }
+                continue;
+            }
+            if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+                // high quality indel calls without p->indel set aren't
+                // particularly indicative of being a good REF match either,
+                // at least not in low coverage.  So require solid coverage
+                // before we start utilising such quals.
+                b = 0;
+                q = (int)bam_get_qual(p->b)[p->qpos];
+                seqQ = (3*seqQ + 2*q)/8;
+            }
+            if (_n > 20 && seqQ > 40) seqQ = 40;
+            baseQ  = p->aux>>8&0xff;
+
              is_diff = (b != 0);
          }
          else
          {
              b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
              b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
-            baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+
+            // Lowest of this and neighbour quality values
+            uint8_t *qual = bam_get_qual(p->b);
+            q = qual[p->qpos];
+            if (p->qpos > 0 &&
+                q > qual[p->qpos-1]+bca->delta_baseQ)
+                q = qual[p->qpos-1]+bca->delta_baseQ;
+            if (p->qpos+1 < p->b->core.l_qseq &&
+                q > qual[p->qpos+1]+bca->delta_baseQ)
+                q = qual[p->qpos+1]+bca->delta_baseQ;
+
              if (q < bca->min_baseQ) continue;
+            if (q > bca->max_baseQ) q = bca->max_baseQ;
+            baseQ = q;
              seqQ  = 99;
              is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
          }
@@ -201,11 +272,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          if (q > 63) q = 63;
          if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
          bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
-        if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
          // collect annotations
          if (b < 4)
          {
-            r->qsum[b] += q;
+            r->QS[b] += q;
              if ( r->ADF )
              {
                  if ( bam_is_rev(p->b) )
@@ -228,29 +298,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          // collect for bias tests
          if ( baseQ > 59 ) baseQ = 59;
          if ( mapQ > 59 ) mapQ = 59;
-        int len, epos = 0;
-        if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) )
+        int len, epos = 0, sc_len = 0, sc_dist = 0;
+        if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
          {
-            int pos = get_position(p, &len);
+            int pos = get_position(p, &len, &sc_len, &sc_dist);
              epos = (double)pos/(len+1) * bca->npos;
+
+            if (sc_len) {
+                sc_len = 15.0*sc_len / sc_dist;
+                if (sc_len > 99) sc_len = 99;
+            }
          }
-        int ibq  = baseQ/60. * bca->nqual;
-        int imq  = mapQ/60. * bca->nqual;
-        if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
-        else bca->fwd_mqs[imq]++;
+
+        int imq  = mapQ * nqual_over_60;
+        int ibq  = baseQ * nqual_over_60;
+
+        if ( bam_is_rev(p->b) )
+            bca->rev_mqs[imq]++;
+        else
+            bca->fwd_mqs[imq]++;
+
          if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
          {
              bca->ref_pos[epos]++;
              bca->ref_bq[ibq]++;
              bca->ref_mq[imq]++;
+            bca->ref_scl[sc_len]++;
          }
          else
          {
              bca->alt_pos[epos]++;
              bca->alt_bq[ibq]++;
              bca->alt_mq[imq]++;
+            bca->alt_scl[sc_len]++;
          }
      }
+
+    // Compensate for AD not being counted on low quality REF indel matches.
+    if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
+    {
+        for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+        {
+            r->ADR[i] += ADR_ref_missed[i];
+            r->ADF[i] += ADF_ref_missed[i];
+        }
+    }
+    else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
+    {
+        int dp = 0, dp_ambig = 0;
+        for (i=0; i<4; i++) dp += r->ADR[i];
+        for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
+        if ( dp )
+            for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
+        dp = 0, dp_ambig = 0;
+        for (i=0; i<4; i++) dp += r->ADF[i];
+        for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
+        if ( dp )
+            for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
+    }
+
      r->ori_depth = ori_depth;
      // glfgen
      errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
@@ -437,7 +543,7 @@ double calc_mwu_bias_cdf(int *a, int *b, int n)
      return pval>1 ? 1 : pval;
  }
  
-double calc_mwu_bias(int *a, int *b, int n)
+double calc_mwu_bias(int *a, int *b, int n, int left)
  {
      int na = 0, nb = 0, i;
      double U = 0, ties = 0;
@@ -461,6 +567,7 @@ double calc_mwu_bias(int *a, int *b, int n)
      if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely
  
      double mean = ((double)na*nb)*0.5;
+    if (left && U > mean) return 1; // for MQB which is asymmetrical
      if ( na==2 || nb==2 )
      {
          // Linear approximation
@@ -483,6 +590,85 @@ double calc_mwu_bias(int *a, int *b, int n)
      return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
  }
  
+// Mann-Whitney U test between two histograms, as a Z-score or U-based score.
+// See "Normal approximation and tie correction" at
+// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
+//
+// a[] and b[] hold per-bin counts over n bins.  With do_Z set the result is
+// (U - mean)/sd: 0 is equality of the two distributions and +ve/-ve from
+// there; this is a more robust score to filter on.  Without do_Z a U-based
+// score is returned, or HUGE_VAL if left_only is set and U > mean.  HUGE_VAL
+// is also returned when fewer than two counts exist or the variance is <= 0.
+double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
+    int i;
+    int64_t t;
+
+    // Optimisation: skip the pairwise counting when b has no observations
+    for (i = 0; i < n; i++)
+        if (b[i])
+            break;
+    int b_empty = (i == n);
+
+    // Count equal (e) and less-than (l) pairs in 64 bits: products overflow int.
+    int64_t e = 0, l = 0, na = 0, nb = 0;
+    if (b_empty) {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            na += a[i];
+            t += ((int64_t)a[i]*a[i]-1)*a[i];  // adjustment score for ties
+        }
+    } else {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            // Combinations of a[i] and b[j] for i==j
+            e += (int64_t)a[i]*b[i];
+
+            // nb is running total of b[i+1]..b[n-1].
+            // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
+            // for all i < j.
+            l += a[i]*nb;    // a<b
+
+            na += a[i];
+            nb += b[i];
+            int64_t p = (int64_t)a[i]+b[i];
+            t += (p*p-1)*p;  // adjustment score for ties (p^3 needs 64 bits)
+        }
+    }
+
+    if (na+nb <= 1)
+        return HUGE_VAL;
+
+    double U, m;
+    U = l + e*0.5; // Mann-Whitney U score
+    m = na*nb / 2.0; // mean of U under the null hypothesis
+
+    // With ties adjustment
+    double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
+    // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
+    if (var2 <= 0)
+        return HUGE_VAL;
+
+    if (do_Z) {
+        // S.D. normalised Z-score
+        //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
+        return (U - m) / sqrt(var2);
+    }
+
+    // Else U score, which can be asymmetric for some data types.
+    if (left_only && U > m)
+        return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
+
+    if (na >= 8 || nb >= 8) {
+        // Normal approximation, very good for na>=8 && nb>=8 and
+        // reasonable if na<8 or nb<8
+        return exp(-0.5*(U-m)*(U-m)/var2);
+    }
+
+    // Exact calculation; na and nb are both below 8 here so the casts are safe
+    if (na==1 || nb == 1)
+        return mann_whitney_1947_((int)na, (int)nb, U) * sqrt(2*M_PI*var2);
+    else
+        return mann_whitney_1947((int)na, (int)nb, U) * sqrt(2*M_PI*var2);
+}
+
  static inline double logsumexp2(double a, double b)
  {
      if ( a>b )
@@ -558,7 +744,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
  int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
  {
      int ref4, i, j;
-    float qsum[5] = {0,0,0,0,0};
+    float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
      if (ref_base >= 0) {
          call->ori_ref = ref4 = seq_nt16_int[ref_base];
          if (ref4 > 4) ref4 = 4;
@@ -569,9 +755,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      for (i = 0; i < n; ++i)
      {
          float sum = 0;
-        for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+        for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
          if ( sum )
-            for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+            for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
      }
  
      // sort qsum in ascending order (insertion sort)
@@ -583,7 +769,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
  
      // Set the reference allele and alternative allele(s)
      for (i=0; i<5; i++) call->a[i] = -1;
-    for (i=0; i<5; i++) call->qsum[i] = 0;
+    for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
      call->unseen = -1;
      call->a[0] = ref4;
      for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
@@ -695,6 +881,21 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
                  adf += B2B_MAX_ALLELES;
              }
          }
+        if ( bca->fmt_flag & B2B_FMT_QS )
+        {
+            assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
+
+            // reorder QS to match the allele ordering at this site
+            int32_t tmp[B2B_MAX_ALLELES];
+            int32_t *qs = call->QS, *qs_out = call->QS;
+            for (i=0; i<n; i++)
+            {
+                for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
+                for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
+                qs_out += call->n_alleles;
+                qs += B2B_MAX_ALLELES;
+            }
+        }
  
  //      if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
          call->shift = (int)(sum_min + .499);
@@ -717,11 +918,43 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
      // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
  
-    if ( bca->fmt_flag & B2B_INFO_RPB )
-        call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
-    call->mwu_mq  = calc_mwu_bias(bca->ref_mq,  bca->alt_mq,  bca->nqual);
-    call->mwu_bq  = calc_mwu_bias(bca->ref_bq,  bca->alt_bq,  bca->nqual);
-    call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+        // U z-normalised as +/- number of standard deviations from mean.
+        if (call->ori_ref < 0) {
+            if (bca->fmt_flag & B2B_INFO_RPB)
+                call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
+                                               bca->npos, 0, 1);
+            call->mwu_mq  = calc_mwu_biasZ(bca->iref_mq,  bca->ialt_mq,
+                                           bca->nqual,1,1);
+            if ( bca->fmt_flag & B2B_INFO_SCB )
+                call->mwu_sc  = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
+                                               100, 0,1);
+        } else {
+            if (bca->fmt_flag & B2B_INFO_RPB)
+                call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+                                               bca->npos, 0, 1);
+            call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
+                                           bca->nqual,1,1);
+            call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
+                                           bca->nqual,0,1);
+            call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+                                           bca->nqual,0,1);
+            if ( bca->fmt_flag & B2B_INFO_SCB )
+                call->mwu_sc  = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
+                                               100, 0,1);
+        }
+    } else {
+        // Old method; U as probability between 0 and 1
+        if ( bca->fmt_flag & B2B_INFO_RPB )
+            call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+                                           bca->npos, 0, 0);
+        call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
+                                       bca->nqual, 1, 0);
+        call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
+                                       bca->nqual, 0, 0);
+        call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+                                       bca->nqual, 0, 0);
+    }
  
  #if CDF_MWU_TESTS
      // CDF version of MWU tests is not calculated by default
@@ -732,7 +965,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
  #endif
  
-    if ( bca->fmt_flag & B2B_INFO_VDB ) 
+    if ( bca->fmt_flag & B2B_INFO_VDB )
          call->vdb = calc_vdb(bca->alt_pos, bca->npos);
  
      return 0;
@@ -819,10 +1052,32 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
  
      if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
      if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-    if ( bc->mwu_pos != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
-    if ( bc->mwu_mq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
-    if ( bc->mwu_mqs != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
-    if ( bc->mwu_bq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+
+    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+        if ( bc->mwu_pos != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+        if ( bc->mwu_mq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+        if ( bc->mwu_mqs != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+        if ( bc->mwu_bq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+        if ( bc->mwu_sc != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+    } else {
+        if ( bc->mwu_pos != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+        if ( bc->mwu_mq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+        if ( bc->mwu_mqs != HUGE_VAL )
+             bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+        if ( bc->mwu_bq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+    }
+
+    if ( bc->strand_bias != HUGE_VAL )
+        bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+
  #if CDF_MWU_TESTS
      if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
      if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
@@ -884,6 +1139,8 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
      }
      if ( fmt_flag&B2B_FMT_SCR )
          bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
+    if ( fmt_flag&B2B_FMT_QS )
+        bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
  
      return 0;
  }
diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c

index 16a559a7da4e17930489b21777de633d5d503b92..001363e60497af7eb89036fc0224c733a48c5ef3 100644 (file)
--- a/bcftools/bam2bcf.c.pysam.c
+++ b/bcftools/bam2bcf.c.pysam.c
@@ -3,7 +3,7 @@
  /*  bam2bcf.c -- variant calling.
  
      Copyright (C) 2010-2012 Broad Institute.
-    Copyright (C) 2012-2014 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -42,7 +42,8 @@ extern  void ks_introsort_uint32_t(size_t n, uint32_t a[]);
  
  #define CAP_DIST 25
  
-bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+                             int delta_baseQ)
  {
      bcf_callaux_t *bca;
      if (theta <= 0.) theta = CALL_DEFTHETA;
@@ -50,6 +51,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
      bca->capQ = 60;
      bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
      bca->min_baseQ = min_baseQ;
+    bca->max_baseQ = max_baseQ;
+    bca->delta_baseQ = delta_baseQ;
      bca->e = errmod_init(1. - theta);
      bca->min_frac = 0.002;
      bca->min_support = 1;
@@ -57,9 +60,13 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
      bca->npos = 100;
      bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
      bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+    bca->iref_pos= (int*) malloc(bca->npos*sizeof(int));
+    bca->ialt_pos= (int*) malloc(bca->npos*sizeof(int));
      bca->nqual = 60;
      bca->ref_mq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->alt_mq  = (int*) malloc(bca->nqual*sizeof(int));
+    bca->iref_mq = (int*) malloc(bca->nqual*sizeof(int));
+    bca->ialt_mq = (int*) malloc(bca->nqual*sizeof(int));
      bca->ref_bq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->alt_bq  = (int*) malloc(bca->nqual*sizeof(int));
      bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
@@ -71,47 +78,68 @@ void bcf_call_destroy(bcf_callaux_t *bca)
  {
      if (bca == 0) return;
      errmod_destroy(bca->e);
-    if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
-    free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+    if (bca->npos) {
+        free(bca->ref_pos);  free(bca->alt_pos);
+        free(bca->iref_pos); free(bca->ialt_pos);
+        bca->npos = 0;
+    }
+    free(bca->ref_mq); free(bca->alt_mq);
+    free(bca->iref_mq); free(bca->ialt_mq);
+    free(bca->ref_bq); free(bca->alt_bq);
      free(bca->fwd_mqs); free(bca->rev_mqs);
      bca->nqual = 0;
      free(bca->bases); free(bca->inscns); free(bca);
  }
  
  // position in the sequence with respect to the aligned part of the read
-static int get_position(const bam_pileup1_t *p, int *len)
-{
-    int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
-    for (icig=0; icig<p->b->core.n_cigar; icig++)
-    {
-        int cig  = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
-        int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
-        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
-            continue;
-        }
-        if ( cig==BAM_CINS )
-        {
-            n_tot_bases += ncig;
-            iread += ncig;
+static int get_position(const bam_pileup1_t *p, int *len,
+                        int *sc_len, int *sc_dist) {
+    int i, j, edist = p->qpos + 1;
+    int sc_left = 0, sc_right = 0;
+    int sc_left_dist = -1, sc_right_dist = -1;
+
+    // left end
+    for (i = 0; i < p->b->core.n_cigar; i++) {
+        int cig  = bam_get_cigar(p->b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
              continue;
-        }
-        if ( cig==BAM_CSOFT_CLIP )
-        {
-            iread += ncig;
-            if ( iread<=p->qpos ) edist -= ncig;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_left += bam_get_cigar(p->b)[i] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_left)
+        sc_left_dist = p->qpos+1 - sc_left;
+    edist -= sc_left;
+
+    // right end
+    for (j = p->b->core.n_cigar-1; j >= i; j--) {
+        int cig  = bam_get_cigar(p->b)[j] & BAM_CIGAR_MASK;
+        if (cig == BAM_CHARD_CLIP)
              continue;
+        else if (cig == BAM_CSOFT_CLIP)
+            sc_right += bam_get_cigar(p->b)[j] >> BAM_CIGAR_SHIFT;
+        else
+            break;
+    }
+    if (sc_right)
+        sc_right_dist = p->b->core.l_qseq - sc_right - p->qpos;
+
+    // Distance to nearest soft-clips and length of that clip.
+    if (sc_left_dist >= 0) {
+        if (sc_right_dist < 0 || sc_left_dist < sc_right_dist) {
+            *sc_len  = sc_left;
+            *sc_dist = sc_left_dist;
          }
-        if ( cig==BAM_CDEL ) continue;
-        if ( cig==BAM_CHARD_CLIP ) continue;
-        if ( cig==BAM_CPAD ) continue;
-        if ( cig==BAM_CREF_SKIP ) continue;
-        fprintf(bcftools_stderr,"todo: cigar %d\n", cig);
-        assert(0);
-    }
-    *len = n_tot_bases;
+    } else if (sc_right_dist >= 0) {
+        *sc_len  = sc_right;
+        *sc_dist = sc_right_dist;
+    } else {
+        *sc_len  = 0;
+        *sc_dist = 0;
+    }
+
+    *len = p->b->core.l_qseq - sc_left - sc_right;
      return edist;
  }
  
@@ -119,8 +147,12 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
  {
      memset(bca->ref_pos,0,sizeof(int)*bca->npos);
      memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+    memset(bca->iref_pos,0,sizeof(int)*bca->npos);
+    memset(bca->ialt_pos,0,sizeof(int)*bca->npos);
      memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
      memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+    memset(bca->iref_mq,0,sizeof(int)*bca->nqual);
+    memset(bca->ialt_mq,0,sizeof(int)*bca->nqual);
      memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
      memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
      memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
@@ -128,13 +160,18 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
      if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
      if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
      if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+    memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
+    memset(bca->ref_scl,  0, 100*sizeof(int));
+    memset(bca->alt_scl,  0, 100*sizeof(int));
+    memset(bca->iref_scl, 0, 100*sizeof(int));
+    memset(bca->ialt_scl, 0, 100*sizeof(int));
  }
  
  /*
      Notes:
-    - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
-        which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
-        Later it's used for multiallelic calling by bcftools -m
+    - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies
+        which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations.
+        Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`.
      - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
   */
  /*
@@ -152,7 +189,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
      // clean from previous run
      r->ori_depth = 0;
      r->mq0 = 0;
-    memset(r->qsum,0,sizeof(float)*4);
      memset(r->anno,0,sizeof(double)*16);
      memset(r->p,0,sizeof(float)*25);
      r->SCR = 0;
@@ -168,30 +204,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          kroundup32(bca->max_bases);
          bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
      }
+
      // fill the bases array
+    double nqual_over_60 = bca->nqual / 60.0;
+    int ADR_ref_missed[4] = {0};
+    int ADF_ref_missed[4] = {0};
      for (i = n = 0; i < _n; ++i) {
          const bam_pileup1_t *p = pl + i;
          int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+        if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
          if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
          if (p->is_del && !is_indel) continue;
          ++ori_depth;
          if (is_indel)
          {
-            b     = p->aux>>16&0x3f;
-            baseQ = q = p->aux&0xff;
-            // This read is not counted as indel. Instead of skipping it, treat it as ref. It is
-            // still only an approximation, but gives more accurate AD counts and calls correctly
-            // hets instead of alt-homs in some cases (see test/mpileup/indel-AD.1.sam)
-            if ( q < bca->min_baseQ ) b = 0, q = (int)bam_get_qual(p->b)[p->qpos];
-            seqQ  = p->aux>>8&0xff;
+            b = p->aux>>16&0x3f;
+            seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+            if (q < bca->min_baseQ)
+            {
+                if (!p->indel && b < 4)
+                {
+                    if (bam_is_rev(p->b))
+                        ADR_ref_missed[b]++;
+                    else
+                        ADF_ref_missed[b]++;
+                }
+                continue;
+            }
+            if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+                // high quality indel calls without p->indel set aren't
+                // particularly indicative of being a good REF match either,
+                // at least not in low coverage.  So require solid coverage
+                // before we start utilising such quals.
+                b = 0;
+                q = (int)bam_get_qual(p->b)[p->qpos];
+                seqQ = (3*seqQ + 2*q)/8;
+            }
+            if (_n > 20 && seqQ > 40) seqQ = 40;
+            baseQ  = p->aux>>8&0xff;
+
              is_diff = (b != 0);
          }
          else
          {
              b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
              b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
-            baseQ = q = (int)bam_get_qual(p->b)[p->qpos];
+
+            // Lowest of this and neighbour quality values
+            uint8_t *qual = bam_get_qual(p->b);
+            q = qual[p->qpos];
+            if (p->qpos > 0 &&
+                q > qual[p->qpos-1]+bca->delta_baseQ)
+                q = qual[p->qpos-1]+bca->delta_baseQ;
+            if (p->qpos+1 < p->b->core.l_qseq &&
+                q > qual[p->qpos+1]+bca->delta_baseQ)
+                q = qual[p->qpos+1]+bca->delta_baseQ;
+
              if (q < bca->min_baseQ) continue;
+            if (q > bca->max_baseQ) q = bca->max_baseQ;
+            baseQ = q;
              seqQ  = 99;
              is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
          }
@@ -203,11 +274,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          if (q > 63) q = 63;
          if (q < 4) q = 4;       // MQ=0 reads count as BQ=4
          bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
-        if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
          // collect annotations
          if (b < 4)
          {
-            r->qsum[b] += q;
+            r->QS[b] += q;
              if ( r->ADF )
              {
                  if ( bam_is_rev(p->b) )
@@ -230,29 +300,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
          // collect for bias tests
          if ( baseQ > 59 ) baseQ = 59;
          if ( mapQ > 59 ) mapQ = 59;
-        int len, epos = 0;
-        if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) )
+        int len, epos = 0, sc_len = 0, sc_dist = 0;
+        if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
          {
-            int pos = get_position(p, &len);
+            int pos = get_position(p, &len, &sc_len, &sc_dist);
              epos = (double)pos/(len+1) * bca->npos;
+
+            if (sc_len) {
+                sc_len = 15.0*sc_len / sc_dist;
+                if (sc_len > 99) sc_len = 99;
+            }
          }
-        int ibq  = baseQ/60. * bca->nqual;
-        int imq  = mapQ/60. * bca->nqual;
-        if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
-        else bca->fwd_mqs[imq]++;
+
+        int imq  = mapQ * nqual_over_60;
+        int ibq  = baseQ * nqual_over_60;
+
+        if ( bam_is_rev(p->b) )
+            bca->rev_mqs[imq]++;
+        else
+            bca->fwd_mqs[imq]++;
+
          if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
          {
              bca->ref_pos[epos]++;
              bca->ref_bq[ibq]++;
              bca->ref_mq[imq]++;
+            bca->ref_scl[sc_len]++;
          }
          else
          {
              bca->alt_pos[epos]++;
              bca->alt_bq[ibq]++;
              bca->alt_mq[imq]++;
+            bca->alt_scl[sc_len]++;
          }
      }
+
+    // Compensate for AD not being counted on low quality REF indel matches.
+    if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
+    {
+        for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+        {
+            r->ADR[i] += ADR_ref_missed[i];
+            r->ADF[i] += ADF_ref_missed[i];
+        }
+    }
+    else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
+    {
+        int dp = 0, dp_ambig = 0;
+        for (i=0; i<4; i++) dp += r->ADR[i];
+        for (i=0; i<4; i++) dp_ambig += ADR_ref_missed[i];
+        if ( dp )
+            for (i=0; i<4; i++) r->ADR[i] += lroundf((float)dp_ambig * r->ADR[i]/dp);
+        dp = 0, dp_ambig = 0;
+        for (i=0; i<4; i++) dp += r->ADF[i];
+        for (i=0; i<4; i++) dp_ambig += ADF_ref_missed[i];
+        if ( dp )
+            for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
+    }
+
      r->ori_depth = ori_depth;
      // glfgen
      errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
@@ -439,7 +545,7 @@ double calc_mwu_bias_cdf(int *a, int *b, int n)
      return pval>1 ? 1 : pval;
  }
  
-double calc_mwu_bias(int *a, int *b, int n)
+double calc_mwu_bias(int *a, int *b, int n, int left)
  {
      int na = 0, nb = 0, i;
      double U = 0, ties = 0;
@@ -463,6 +569,7 @@ double calc_mwu_bias(int *a, int *b, int n)
      if ( na==1 || nb==1 ) return 1.0;       // Flat probability, all U values are equally likely
  
      double mean = ((double)na*nb)*0.5;
+    if (left && U > mean) return 1; // for MQB which is asymmetrical
      if ( na==2 || nb==2 )
      {
          // Linear approximation
@@ -485,6 +592,85 @@ double calc_mwu_bias(int *a, int *b, int n)
      return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
  }
  
+// Mann-Whitney U test between two histograms, as a Z-score or U-based score.
+// See "Normal approximation and tie correction" at
+// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
+//
+// a[] and b[] hold per-bin counts over n bins.  With do_Z set the result is
+// (U - mean)/sd: 0 is equality of the two distributions and +ve/-ve from
+// there; this is a more robust score to filter on.  Without do_Z a U-based
+// score is returned, or HUGE_VAL if left_only is set and U > mean.  HUGE_VAL
+// is also returned when fewer than two counts exist or the variance is <= 0.
+double calc_mwu_biasZ(int *a, int *b, int n, int left_only, int do_Z) {
+    int i;
+    int64_t t;
+
+    // Optimisation: skip the pairwise counting when b has no observations
+    for (i = 0; i < n; i++)
+        if (b[i])
+            break;
+    int b_empty = (i == n);
+
+    // Count equal (e) and less-than (l) pairs in 64 bits: products overflow int.
+    int64_t e = 0, l = 0, na = 0, nb = 0;
+    if (b_empty) {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            na += a[i];
+            t += ((int64_t)a[i]*a[i]-1)*a[i];  // adjustment score for ties
+        }
+    } else {
+        for (t = 0, i = n-1; i >= 0; i--) {
+            // Combinations of a[i] and b[j] for i==j
+            e += (int64_t)a[i]*b[i];
+
+            // nb is running total of b[i+1]..b[n-1].
+            // Therefore a[i]*nb is the number of combinations of a[i] and b[j]
+            // for all i < j.
+            l += a[i]*nb;    // a<b
+
+            na += a[i];
+            nb += b[i];
+            int64_t p = (int64_t)a[i]+b[i];
+            t += (p*p-1)*p;  // adjustment score for ties (p^3 needs 64 bits)
+        }
+    }
+
+    if (na+nb <= 1)
+        return HUGE_VAL;
+
+    double U, m;
+    U = l + e*0.5; // Mann-Whitney U score
+    m = na*nb / 2.0; // mean of U under the null hypothesis
+
+    // With ties adjustment
+    double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
+    // var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
+    if (var2 <= 0)
+        return HUGE_VAL;
+
+    if (do_Z) {
+        // S.D. normalised Z-score
+        //Z = (U - m - (U-m >= 0 ? 0.5 : -0.5)) / sd; // gatk method?
+        return (U - m) / sqrt(var2);
+    }
+
+    // Else U score, which can be asymmetric for some data types.
+    if (left_only && U > m)
+        return HUGE_VAL; // one-sided, +ve bias is OK, -ve is not.
+
+    if (na >= 8 || nb >= 8) {
+        // Normal approximation, very good for na>=8 && nb>=8 and
+        // reasonable if na<8 or nb<8
+        return exp(-0.5*(U-m)*(U-m)/var2);
+    }
+
+    // Exact calculation; na and nb are both below 8 here so the casts are safe
+    if (na==1 || nb == 1)
+        return mann_whitney_1947_((int)na, (int)nb, U) * sqrt(2*M_PI*var2);
+    else
+        return mann_whitney_1947((int)na, (int)nb, U) * sqrt(2*M_PI*var2);
+}
+
  static inline double logsumexp2(double a, double b)
  {
      if ( a>b )
@@ -560,7 +746,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
  int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
  {
      int ref4, i, j;
-    float qsum[5] = {0,0,0,0,0};
+    float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0};
      if (ref_base >= 0) {
          call->ori_ref = ref4 = seq_nt16_int[ref_base];
          if (ref4 > 4) ref4 = 4;
@@ -571,9 +757,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      for (i = 0; i < n; ++i)
      {
          float sum = 0;
-        for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+        for (j = 0; j < 4; ++j) sum += calls[i].QS[j];
          if ( sum )
-            for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+            for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum;
      }
  
      // sort qsum in ascending order (insertion sort)
@@ -585,7 +771,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
  
      // Set the reference allele and alternative allele(s)
      for (i=0; i<5; i++) call->a[i] = -1;
-    for (i=0; i<5; i++) call->qsum[i] = 0;
+    for (i=0; i<B2B_MAX_ALLELES; i++) call->qsum[i] = 0;
      call->unseen = -1;
      call->a[0] = ref4;
      for (i=3, j=1; i>=0; i--)   // i: alleles sorted by QS; j, a[j]: output allele ordering
@@ -697,6 +883,21 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
                  adf += B2B_MAX_ALLELES;
              }
          }
+        if ( bca->fmt_flag & B2B_FMT_QS )
+        {
+            assert( call->n_alleles<=B2B_MAX_ALLELES );   // this is always true for SNPs and so far for indels as well
+
+            // reorder QS to match the allele ordering at this site
+            int32_t tmp[B2B_MAX_ALLELES];
+            int32_t *qs = call->QS, *qs_out = call->QS;
+            for (i=0; i<n; i++)
+            {
+                for (j=0; j<call->n_alleles; j++) tmp[j] = qs[ call->a[j] ];
+                for (j=0; j<call->n_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32;
+                qs_out += call->n_alleles;
+                qs += B2B_MAX_ALLELES;
+            }
+        }
  
  //      if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
          call->shift = (int)(sum_min + .499);
@@ -719,11 +920,43 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
      // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
  
-    if ( bca->fmt_flag & B2B_INFO_RPB )
-        call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
-    call->mwu_mq  = calc_mwu_bias(bca->ref_mq,  bca->alt_mq,  bca->nqual);
-    call->mwu_bq  = calc_mwu_bias(bca->ref_bq,  bca->alt_bq,  bca->nqual);
-    call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+        // U z-normalised as +/- number of standard deviations from mean.
+        if (call->ori_ref < 0) {
+            if (bca->fmt_flag & B2B_INFO_RPB)
+                call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
+                                               bca->npos, 0, 1);
+            call->mwu_mq  = calc_mwu_biasZ(bca->iref_mq,  bca->ialt_mq,
+                                           bca->nqual,1,1);
+            if ( bca->fmt_flag & B2B_INFO_SCB )
+                call->mwu_sc  = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
+                                               100, 0,1);
+        } else {
+            if (bca->fmt_flag & B2B_INFO_RPB)
+                call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+                                               bca->npos, 0, 1);
+            call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
+                                           bca->nqual,1,1);
+            call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
+                                           bca->nqual,0,1);
+            call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+                                           bca->nqual,0,1);
+            if ( bca->fmt_flag & B2B_INFO_SCB )
+                call->mwu_sc  = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
+                                               100, 0,1);
+        }
+    } else {
+        // Old method; U as probability between 0 and 1
+        if ( bca->fmt_flag & B2B_INFO_RPB )
+            call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
+                                           bca->npos, 0, 0);
+        call->mwu_mq  = calc_mwu_biasZ(bca->ref_mq,  bca->alt_mq,
+                                       bca->nqual, 1, 0);
+        call->mwu_bq  = calc_mwu_biasZ(bca->ref_bq,  bca->alt_bq,
+                                       bca->nqual, 0, 0);
+        call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
+                                       bca->nqual, 0, 0);
+    }
  
  #if CDF_MWU_TESTS
      // CDF version of MWU tests is not calculated by default
@@ -734,7 +967,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
      call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
  #endif
  
-    if ( bca->fmt_flag & B2B_INFO_VDB ) 
+    if ( bca->fmt_flag & B2B_INFO_VDB )
          call->vdb = calc_vdb(bca->alt_pos, bca->npos);
  
      return 0;
@@ -821,10 +1054,32 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
  
      if ( bc->vdb != HUGE_VAL )      bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
      if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-    if ( bc->mwu_pos != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
-    if ( bc->mwu_mq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
-    if ( bc->mwu_mqs != HUGE_VAL )  bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
-    if ( bc->mwu_bq != HUGE_VAL )   bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+
+    if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+        if ( bc->mwu_pos != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+        if ( bc->mwu_mq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+        if ( bc->mwu_mqs != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+        if ( bc->mwu_bq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+        if ( bc->mwu_sc != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+    } else {
+        if ( bc->mwu_pos != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+        if ( bc->mwu_mq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+        if ( bc->mwu_mqs != HUGE_VAL )
+             bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+        if ( bc->mwu_bq != HUGE_VAL )
+            bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+    }
+
+    if ( bc->strand_bias != HUGE_VAL )
+        bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+
  #if CDF_MWU_TESTS
      if ( bc->mwu_pos_cdf != HUGE_VAL )  bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
      if ( bc->mwu_mq_cdf != HUGE_VAL )   bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
@@ -886,6 +1141,8 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
      }
      if ( fmt_flag&B2B_FMT_SCR )
          bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample);
+    if ( fmt_flag&B2B_FMT_QS )
+        bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
  
      return 0;
  }
diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h

index 2d2cf83ee32bff459ae2f18509c5728607eb4c9d..e8b0fb9854d3504e811059b8ad5fda2ba1c60b01 100644 (file)
--- a/bcftools/bam2bcf.h
+++ b/bcftools/bam2bcf.h
@@ -1,7 +1,7 @@
  /*  bam2bcf.h -- variant calling.
  
      Copyright (C) 2010-2012 Broad Institute.
-    Copyright (C) 2012-2014,2016 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -59,21 +59,36 @@ DEALINGS IN THE SOFTWARE.  */
  #define B2B_FMT_SCR     (1<<13)
  #define B2B_INFO_VDB    (1<<14)
  #define B2B_INFO_RPB    (1<<15)
+#define B2B_FMT_QS      (1<<16)
+#define B2B_INFO_SCB    (1<<17)
+#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised
  
  #define B2B_MAX_ALLELES 5
  
+#define B2B_DROP      0
+#define B2B_INC_AD    1
+#define B2B_INC_AD0   2
+
  #define PLP_HAS_SOFT_CLIP(i) ((i)&1)
-#define PLP_SAMPLE_ID(i)     ((i)>>1)
+#define PLP_HAS_INDEL(i)     ((i)&2)
+#define PLP_SAMPLE_ID(i)     ((i)>>2)
+
+#define PLP_SET_SOFT_CLIP(i)     ((i)|=1)
+#define PLP_SET_INDEL(i)         ((i)|=2)
+#define PLP_SET_SAMPLE_ID(i,n)   ((i)|=(n)<<2)
  
  typedef struct __bcf_callaux_t {
-    int fmt_flag;
-    int capQ, min_baseQ;
+    int fmt_flag, ambig_reads;
+    int capQ, min_baseQ, max_baseQ, delta_baseQ;
      int openQ, extQ, tandemQ; // for indels
      uint32_t min_support, max_support; // for collecting indel candidates
      double min_frac; // for collecting indel candidates
      float max_frac; // for collecting indel candidates
      int per_sample_flt; // indel filtering strategy
      int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
+    int *iref_pos, *ialt_pos, *iref_mq, *ialt_mq; // for indels
+    int ref_scl[100], alt_scl[100];   // soft-clip length bias; SNP
+    int iref_scl[100], ialt_scl[100]; // soft-clip length bias; INDEL
      // for internal uses
      int max_bases;
      int indel_types[4];     // indel lengths
@@ -83,14 +98,14 @@ typedef struct __bcf_callaux_t {
      uint16_t *bases;        // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
      errmod_t *e;
      void *rghash;
+    float indel_bias;  // adjusts indel score threshold; lower => call more.
  } bcf_callaux_t;
  
  // per-sample values
  typedef struct {
-    uint32_t ori_depth;
+    uint32_t ori_depth;     // ori_depth = anno[0..3] but before --min-BQ is applied
      unsigned int mq0;
-    int32_t *ADF, *ADR, SCR;
-    float qsum[4];
+    int32_t *ADF, *ADR, SCR, *QS;   // FMT/QS
      // The fields are:
      //      depth fwd   .. ref (0) and non-ref (2)
      //      depth rev   .. ref (1) and non-ref (3)
@@ -112,19 +127,20 @@ typedef struct {
      int tid, pos;
      bcf_hdr_t *bcf_hdr;
      int a[5]; // alleles: ref, alt, alt2, alt3
-    float qsum[5];  // for the QS tag
+    float qsum[B2B_MAX_ALLELES];  // INFO/QS tag
      int n, n_alleles, shift, ori_ref, unseen;
      int n_supp; // number of supporting non-reference reads
      double anno[16];
      unsigned int depth, ori_depth, mq0;
-    int32_t *PL, *DP4, *ADR, *ADF, *SCR;
+    int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
      uint8_t *fmt_arr;
      float vdb; // variant distance bias
-    float mwu_pos, mwu_mq, mwu_bq, mwu_mqs;
+    float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
  #if CDF_MWU_TESTS
      float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
  #endif
      float seg_bias;
+    float strand_bias; // phred-scaled fisher-exact test
      kstring_t tmp;
  } bcf_call_t;
  
@@ -132,7 +148,8 @@ typedef struct {
  extern "C" {
  #endif
  
-    bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
+    bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
+                                 int delta_baseQ);
      void bcf_call_destroy(bcf_callaux_t *bca);
      int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
      int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c

index 6c367da7388d50b0a5c0426df6549c7257c9227b..facb3bf27e184c75395fad3b1ed2fe8bba5cdbeb 100644 (file)
--- a/bcftools/bam2bcf_indel.c
+++ b/bcftools/bam2bcf_indel.c
@@ -1,7 +1,7 @@
  /*  bam2bcf_indel.c -- indel caller.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Copyright (C) 2012-2014,2016 Genome Research Ltd.
+    Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -26,19 +26,29 @@ DEALINGS IN THE SOFTWARE.  */
  #include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <math.h>
  #include <htslib/hts.h>
  #include <htslib/sam.h>
  #include <htslib/khash_str2int.h>
  #include "bam2bcf.h"
+#include "str_finder.h"
  
  #include <htslib/ksort.h>
  KSORT_INIT_GENERIC(uint32_t)
  
  #define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 50
+#define INDEL_WINDOW_SIZE 110
  
+#define MAX_TYPES 64
+
+// Take a reference position tpos and convert to a query position (returned).
+// This uses the CIGAR string plus alignment c->pos to do the mapping.
+//
+// *_tpos is returned as tpos if query overlaps tpos, but for deletions
+// it'll be either the start (is_left) or end (!is_left) ref position.
  static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
  {
+    // x = pos in ref, y = pos in query seq
      int k, x = c->pos, y = 0, last_y = 0;
      *_tpos = c->pos;
      for (k = 0; k < c->n_cigar; ++k) {
@@ -64,6 +74,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos,
      *_tpos = x;
      return last_y;
  }
+
  // FIXME: check if the inserted sequence is consistent with the homopolymer run
  // l is the relative gap length and l_run is the length of the homopolymer on the reference
  static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
@@ -87,21 +98,609 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
      return max_i - pos;
  }
  
+// Identify soft-clip length, position in seq, and clipped seq len.
+static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+                           int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+    bam1_t *b = p->b;
+    int sc_len = 0, sc_dist = -1, at_left = 1;
+    int epos = p->qpos, slen = b->core.l_qseq;
+    int k;
+    uint32_t *cigar = bam_get_cigar(b);
+    *end = -1; // -1: no soft-clip found; 0: left clip; 1: right clip chosen
+    for (k = 0; k < b->core.n_cigar; k++) {
+        int op = bam_cigar_op(cigar[k]);
+        if (op == BAM_CSOFT_CLIP) {
+            slen -= bam_cigar_oplen(cigar[k]);
+            if (at_left) {
+                // left end
+                sc_len += bam_cigar_oplen(cigar[k]);
+                epos -= sc_len; // don't count SC in seq pos
+                sc_dist = epos;
+                *end = 0;
+            } else {
+                // right end
+                int srlen = bam_cigar_oplen(cigar[k]);
+                int rd = b->core.l_qseq - srlen - p->qpos;
+                if (sc_dist < 0 || sc_dist > rd) {
+                    // closer to right end than left
+                    // FIXME: compensate for indel length too?
+                    sc_dist = rd;
+                    sc_len = srlen;
+                    *end = 1;
+                }
+            }
+        } else if (op != BAM_CHARD_CLIP) {
+            at_left = 0; // any op other than a clip ends the leading-clip region
+        }
+    }
+
+    if (p->indel > 0 && slen - (epos+p->indel) < epos)
+        epos += p->indel-1; // end of insertion, if near end of seq
+
+    // slen is now length of sequence minus soft-clips and
+    // epos is position of indel in seq minus left-clip.
+    *epos_r = (double)epos / (slen+1) * bca->npos; // rescaled to 0..npos
+
+    if (sc_len) {
+        // scale importance of clip by distance to closest end
+        *sc_len_r = 15.0*sc_len / (sc_dist+1);
+        if (*sc_len_r > 99) *sc_len_r = 99; // cap the scaled value at 99
+    } else {
+        *sc_len_r = 0;
+    }
+
+    *slen_r = slen;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present.
+//
+// Returns types and fills out n_types_r, max_rd_len_r, ref_type_r and N_r,
+//         or NULL on error or when the position is to be skipped.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, bcf_callaux_t *bca, const char *ref,
+                               int *max_rd_len_r, int *n_types_r,
+                               int *ref_type_r, int *N_r) {
+    int i, j, t, s, N, m, max_rd_len, n_types;
+    int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+    uint32_t *aux;
+    int *types;
+
+    // N is the total number of reads
+    for (s = N = 0; s < n; ++s)
+        N += n_plp[s];
+
+    bca->max_support = bca->max_frac = 0;
+    aux = (uint32_t*) calloc(N + 1, 4); // one slot per read plus the REF type
+    if (!aux)
+        return NULL;
+
+    m = max_rd_len = 0;
+    aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+    // Fill out aux[] array with all the non-zero indel sizes.
+    // Also tally number with indels (n_alt) and total (n_tot).
+    for (s = 0; s < n; ++s) {
+        int na = 0, nt = 0;
+        for (i = 0; i < n_plp[s]; ++i) {
+            const bam_pileup1_t *p = plp[s] + i;
+            ++nt;
+            if (p->indel != 0) {
+                ++na;
+                aux[m++] = MINUS_CONST + p->indel;
+            }
+
+            // FIXME: cache me in pileup struct.
+            j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+            if (j > max_rd_len) max_rd_len = j;
+        }
+        double frac = (double)na/nt; // NOTE(review): 0/0 if sample has no reads
+        if ( !indel_support_ok && na >= bca->min_support
+             && frac >= bca->min_frac )
+            indel_support_ok = 1;
+        if ( na > bca->max_support && frac > 0 )
+            bca->max_support = na, bca->max_frac = frac;
+
+        n_alt += na;
+        n_tot += nt;
+    }
+
+    // Sort aux[] and count the distinct types
+    ks_introsort(uint32_t, m, aux);
+    for (i = 1, n_types = 1; i < m; ++i)
+        if (aux[i] != aux[i-1]) ++n_types;
+
+    // Taking totals makes it hard to call rare indels (IMF filter)
+    if ( !bca->per_sample_flt )
+        indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+                             || n_alt < bca->min_support )
+            ? 0 : 1;
+    if ( n_types == 1 || !indel_support_ok ) { // then skip
+        free(aux);
+        return NULL;
+    }
+
+    // Bail out if we have far too many types of indel
+    if (n_types >= MAX_TYPES) {
+        free(aux);
+        // TODO revisit how/whether to control printing this warning
+        if (hts_verbose >= 2)
+            fprintf(stderr, "[%s] excessive INDEL alleles at position %d. "
+                    "Skip the position.\n", __func__, pos + 1);
+        return NULL;
+    }
+
+    // To prevent long stretches of N's to be mistaken for indels
+    // (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where half or more reference bases are Ns.
+    int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
+                            ?2*INDEL_WINDOW_SIZE : max_rd_len);
+    for (i=pos; i<i_end && ref[i]; i++)
+        nN += ref[i] == 'N';
+    if ( nN*2>(i-pos) ) {
+        free(aux);
+        return NULL;
+    }
+
+    // Finally fill out the types[] array detailing the size of insertion
+    // or deletion.
+    types = (int*)calloc(n_types, sizeof(int));
+    if (!types) {
+        free(aux);
+        return NULL;
+    }
+    t = 0;
+    types[t++] = aux[0] - MINUS_CONST;
+    for (i = 1; i < m; ++i)
+        if (aux[i] != aux[i-1])
+            types[t++] = aux[i] - MINUS_CONST;
+    free(aux);
+
+    // Find reference type (types[?] == 0)
+    for (t = 0; t < n_types; ++t)
+        if (types[t] == 0) break;
+
+    *ref_type_r   = t;
+    *n_types_r    = n_types;
+    *max_rd_len_r = max_rd_len;
+    *N_r          = N;
+
+    return types;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Construct per-sample consensus over ref coordinates left..right-1.
+//
+// Returns an array of n consensus seqs (4-bit base codes, not ASCII),
+//         or NULL on failure.
+static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp,
+                                 int pos, bcf_callaux_t *bca, const char *ref,
+                                 int left, int right) {
+    int i, k, s, L = right - left + 1, max_i, max2_i;
+    char **ref_sample; // returned
+    uint32_t *cns = NULL, max, max2;
+    char *ref0 = NULL, *r;
+    ref_sample = (char**) calloc(n, sizeof(char*));
+    cns = (uint32_t*) calloc(L, 4);
+    ref0 = (char*) calloc(L, 1);
+    if (!ref_sample || !cns || !ref0) {
+        n = 0; // nothing per-sample allocated yet
+        goto err;
+    }
+
+    // Convert ref ASCII to 0-15.
+    for (i = 0; i < right - left; ++i)
+        ref0[i] = seq_nt16_table[(int)ref[i+left]];
+
+    // NB: one consensus per sample 'n', not per indel type.
+    // FIXME: consider fixing this.  We should compute alignments vs
+    // types, not vs samples?  Or types/sample combined?
+    for (s = 0; s < n; ++s) {
+        r = ref_sample[s] = (char*) calloc(L, 1);
+        if (!r) {
+            n = s; // err: frees ref_sample[0..s-1]; entry s itself is NULL
+            goto err;
+        }
+
+        memset(cns, 0, sizeof(*cns) * L);
+
+        // collect ref and non-ref counts in cns
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            bam1_t *b = p->b;
+            uint32_t *cigar = bam_get_cigar(b);
+            uint8_t *seq = bam_get_seq(b);
+            int x = b->core.pos, y = 0;
+
+            // TODO: pileup exposes pileup_ind, but we also need e.g.
+            // pileup_len to know how much of the current CIGAR op-len
+            // we've used (or have remaining).  If we had that, we
+            // could start at p->qpos without having to scan through
+            // the entire CIGAR string until we find it.
+            //
+            // Without it about all we could do is have a side channel
+            // to cache the last known coords.  Messy, so punt for now.
+            // This is no longer the bottle neck until we get to 1000s of
+            // CIGAR ops.
+
+            for (k = 0; k < b->core.n_cigar; ++k) {
+                int op = cigar[k]&0xf;
+                int j, l = cigar[k]>>4;
+                if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                    if (x + l >= left) {
+                        j = left - x > 0 ? left - x : 0;
+                        int j_end = right - x < l ? right - x : l;
+                        for (; j < j_end; j++)
+                            // Append to cns.  Note this is ref coords,
+                            // so insertions aren't in cns and deletions
+                            // will have lower coverage.
+
+                            // FIXME: want true consensus (with ins) per
+                            // type, so we can independently compare each
+                            // seq to each consensus and see which it
+                            // matches best, so we get proper GT analysis.
+                            cns[x+j-left] +=
+                                (bam_seqi(seq, y+j) == ref0[x+j-left])
+                                ? 1        // REF
+                                : (1<<16); // ALT
+                    }
+                    x += l; y += l;
+                } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                    x += l;
+                } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                    y += l;
+                }
+
+                if (x > right)
+                    break;
+            }
+        }
+
+        // Determine a sample specific reference.
+        for (i = 0; i < right - left; ++i)
+            r[i] = ref0[i];
+
+        // Find deepest and 2nd deepest ALT region (max & max2).
+        max = max2 = 0; max_i = max2_i = -1;
+        for (i = 0; i < right - left; ++i) {
+            if (cns[i]>>16 >= max>>16)
+                max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+            else if (cns[i]>>16 >= max2>>16)
+                max2 = cns[i], max2_i = i;
+        }
+
+        // Masks the two deepest ALT loci with 'N' unless mostly REF.
+        // The test is nREF/(nREF+nALT) >= 70%: if so the locus is kept as-is.
+        // The effect is that at least 30% of bases differing to REF will
+        // use "N" in consensus, so we don't penalise ALT or REF when
+        // aligning against it.  (A poor man IUPAC code)
+        //
+        // Why is it only done in two loci at most?
+        if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7)
+            max_i = -1;
+        if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7)
+            max2_i = -1;
+        if (max_i >= 0) r[max_i] = 15;
+        if (max2_i >= 0) r[max2_i] = 15;
+
+        //for (i = 0; i < right - left; ++i)
+        //    fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr);
+        //fputc('\n', stderr);
+    }
+
+    free(ref0);
+    free(cns);
+
+    return ref_sample;
+
+ err:
+    free(ref0);
+    free(cns);
+    if (ref_sample) {
+        for (s = 0; s < n; s++)
+            free(ref_sample[s]);
+        free(ref_sample);
+    }
+
+    return NULL;
+}
+
+// Length of the homopolymer run in ref that contains the base at pos+1.
+static int bcf_cgp_l_run(const char *ref, int pos) {
+    int i, l_run;
+
+    int c = seq_nt16_table[(int)ref[pos + 1]];
+    if (c == 15) { // code 15 is treated as having no run: length 1
+        l_run = 1;
+    } else {
+        for (i = pos + 2; ref[i]; ++i) // extend right until mismatch or NUL
+            if (seq_nt16_table[(int)ref[i]] != c) break;
+        l_run = i; // first index past the run on the right
+        for (i = pos; i >= 0; --i) // extend left until mismatch or start
+            if (seq_nt16_table[(int)ref[i]] != c) break;
+        l_run -= i + 1; // i is the index just before the run (or -1)
+    }
+
+    return l_run;
+}
+
+
+// Compute the per-type consensus of inserted bases by majority rule over
+// all n samples; the incoming value of 's' is overwritten and unused.
+static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, int *types, int n_types,
+                               int max_ins, int s) {
+    int i, j, t, k;
+    int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
+    if (!inscns_aux)
+        return NULL;
+
+    // Count the number of occurrences of each base at each position for
+    // each type of insertion.
+    for (t = 0; t < n_types; ++t) {
+        if (types[t] > 0) {
+            for (s = 0; s < n; ++s) {
+                for (i = 0; i < n_plp[s]; ++i) {
+                    bam_pileup1_t *p = plp[s] + i;
+                    if (p->indel == types[t]) {
+                        uint8_t *seq = bam_get_seq(p->b);
+                        for (k = 1; k <= p->indel; ++k) {
+                            int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+                            assert(c<5);
+                            ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Use the majority rule to construct the consensus (skipped if OOM)
+    char *inscns = (char *)calloc(n_types * max_ins, 1);
+    for (t = 0; inscns && t < n_types; ++t) {
+        for (j = 0; j < types[t]; ++j) {
+            int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+            for (k = 0; k < 5; ++k)
+                if (ia[k] > max)
+                    max = ia[k], max_k = k;
+            inscns[t*max_ins + j] = max ? max_k : 4;
+            if (max_k == 4) {
+                // discard insertions which contain N's
+                types[t] = 0; // NB: modifies the caller's types[] array
+                break;
+            }
+        }
+    }
+    free(inscns_aux);
+
+    return inscns; // n_types*max_ins base codes (0-4), or NULL if calloc failed
+}
+
+#ifndef MIN
+#  define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.
+//
+// Fills out *score (0xffffff if the alignment itself failed).
+// Returns 0 on success,
+//        <0 on error
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+                               int type, uint8_t *ref2, uint8_t *query,
+                               int r_start, int r_end, int long_read,
+                               int tbeg, int tend,
+                               int left, int right, // right is unused here
+                               int qbeg, int qend,
+                               int qpos, int max_deletion, // latter unused
+                               int *score) {
+    // Illumina
+    probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+    // Parameters that work better on PacBio CCS 15k.
+    // We should consider querying the header and RG PU field.
+    // See also htslib/realn.c:sam_prob_realn()
+    if (long_read) {
+        apf.d = 1e-3;
+        apf.e = 1e-1;
+    }
+
+    type = abs(type);
+    apf.bw = type + 3;
+    int l, sc;
+    const uint8_t *qual = bam_get_qual(p->b), *bq;
+    uint8_t *qq;
+
+    // Get segment of quality, either ZQ tag or if absent QUAL.
+    if (!(qq = (uint8_t*) calloc(qend - qbeg, 1)))
+        return -1;
+    bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+    if (bq) ++bq; // skip type
+    for (l = qbeg; l < qend; ++l) {
+        int qval = bq? qual[l] + (bq[l] - 64) : qual[l];
+        if (qval > 30) // clamp quality into the range 7..30
+            qval = 30;
+        if (qval < 7)
+            qval = 7;
+        qq[l - qbeg] = qval;
+    }
+
+    // The bottom 8 bits are length-normalised score while
+    // the top bits are unnormalised.
+    sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type,
+                        query, qend - qbeg, qq, &apf, 0, 0);
+    if (sc < 0) {
+        *score = 0xffffff; // sentinel score; still reported as success
+        free(qq);
+        return 0;
+    }
+
+    // used for adjusting indelQ below
+    l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias;
+    *score = sc<<8 | MIN(255, l);
+
+    rep_ele *reps, *elt, *tmp;
+    uint8_t *seg = ref2 + tbeg - left;
+    int seg_len = tend - tbeg + type;
+
+    // Note: although seg moves (tbeg varies), ref2 is reused many times
+    // so we could factor out some find_STR calls.  However it's not the
+    // bottleneck for now.
+
+    // FIXME: need to make this work on IUPAC.
+    reps = find_STR((char *)seg, seg_len, 0);
+    int iscore = 0;
+
+    // Identify STRs in ref covering the indel up to
+    // (or close to) the end of the sequence.
+    // Those having an indel and right at the sequence
+    // end do not confirm the total length of indel
+    // size.  Specifically a *lack* of indel at the
+    // end, where we know indels occur in other
+    // sequences, is a possible reference bias.
+    //
+    // This is emphasised further if the sequence ends with
+    // soft clipping.
+    DL_FOREACH_SAFE(reps, elt, tmp) {
+        if (elt->start <= qpos && elt->end >= qpos) {
+            iscore += (elt->end-elt->start) / elt->rep_len;  // c
+            if (elt->start+tbeg <= r_start ||
+                elt->end+tbeg   >= r_end)
+                iscore += 2*(elt->end-elt->start);
+       }
+
+        DL_DELETE(reps, elt); // every element is unlinked and freed here
+        free(elt);
+    }
+
+    // Apply STR score to existing indelQ
+    l  =  (*score&0xff)*.8 + iscore*2;
+    *score = (*score & ~0xff) | MIN(255, l); // only the low 8 bits change
+
+    free(qq);
+
+    return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Returns n_alt on success
+//         -1 on failure
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  int *score) {
+    // FIXME: n_types has a maximum; no need to alloc - use a #define?
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+    memset(sumq, 0, n_types * sizeof(int));
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            int *sct = &score[K*n_types], seqQ, indelQ;
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            if ((sc[0]&0x3f) == ref_type) {
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+            } else {
+                for (t = 0; t < n_types; ++t) // look for the reference type
+                    if ((sc[t]&0x3f) == ref_type) break;
+                indelQ = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+            }
+            tmp = sc[0]>>6 & 0xff;
+            // reduce indelQ
+            indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // effect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (seqQ > 255) seqQ = 255;
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+            sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+            //              fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+        }
+    }
+    // determine bca->indel_types[] and bca->inscns
+    bca->maxins = max_ins;
+    bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
+    if (bca->maxins && !bca->inscns)
+        return -1;
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+        if (bca->maxins)
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+    // update p->aux
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+            //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with high number of samples, do we handle IMF correctly?  Is it
+fraction of indels across entire data set, or just fraction for this
+specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt.
+ */
+
  /*
      notes:
-        - n .. number of samples
-        - the routine sets bam_pileup1_t.aux of each read as follows:
-            - 6: unused
-            - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
-            - 8: estimated sequence quality                     .. (aux>>8)&0xff
-            - 8: indel quality                                  .. aux&0xff
+    - n .. number of samples
+    - the routine sets bam_pileup1_t.aux of each read as follows:
+        - 6: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
   */
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+                      bcf_callaux_t *bca, const char *ref)
  {
-    int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+    if (ref == 0 || bca == 0) return -1;
+
+    int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins;
+    int *score, max_ref2;
      int N, K, l_run, ref_type, n_alt;
      char *inscns = 0, *ref2, *query, **ref_sample;
-    if (ref == 0 || bca == 0) return -1;
  
      // determine if there is a gap
      for (s = N = 0; s < n; ++s) {
@@ -109,77 +708,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
              if (plp[s][i].indel != 0) break;
          if (i < n_plp[s]) break;
      }
-    if (s == n) return -1; // there is no indel at this position.
-    for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
-    { // find out how many types of indels are present
-        bca->max_support = bca->max_frac = 0;
-        int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
-        uint32_t *aux;
-        aux = (uint32_t*) calloc(N + 1, 4);
-        m = max_rd_len = 0;
-        aux[m++] = MINUS_CONST; // zero indel is always a type
-        for (s = 0; s < n; ++s) {
-            int na = 0, nt = 0;
-            for (i = 0; i < n_plp[s]; ++i) {
-                const bam_pileup1_t *p = plp[s] + i;
-                ++nt;
-                if (p->indel != 0) {
-                    ++na;
-                    aux[m++] = MINUS_CONST + p->indel;
-                }
-                j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
-                if (j > max_rd_len) max_rd_len = j;
-            }
-            double frac = (double)na/nt;
-            if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
-                indel_support_ok = 1;
-            if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
-            n_alt += na;
-            n_tot += nt;
-        }
-        // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
-        //  check the number of N's in the sequence and skip places where half or more reference bases are Ns.
-        int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
-        if ( nN*2>(i-pos) ) { free(aux); return -1; }
-
-        ks_introsort(uint32_t, m, aux);
-        // squeeze out identical types
-        for (i = 1, n_types = 1; i < m; ++i)
-            if (aux[i] != aux[i-1]) ++n_types;
-        // Taking totals makes it hard to call rare indels
-        if ( !bca->per_sample_flt )
-            indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
-        if ( n_types == 1 || !indel_support_ok ) { // then skip
-            free(aux); return -1;
-        }
-        if (n_types >= 64) {
-            free(aux);
-            // TODO revisit how/whether to control printing this warning
-            if (hts_verbose >= 2)
-                fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
-            return -1;
-        }
-        types = (int*)calloc(n_types, sizeof(int));
-        t = 0;
-        types[t++] = aux[0] - MINUS_CONST;
-        for (i = 1; i < m; ++i)
-            if (aux[i] != aux[i-1])
-                types[t++] = aux[i] - MINUS_CONST;
-        free(aux);
-        for (t = 0; t < n_types; ++t)
-            if (types[t] == 0) break;
-        ref_type = t; // the index of the reference type (0)
-    }
-    { // calculate left and right boundary
-        left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
-        right = pos + INDEL_WINDOW_SIZE;
-        if (types[0] < 0) right -= types[0];
-        // in case the alignments stand out the reference
-        for (i = pos; i < right; ++i)
-            if (ref[i] == 0) break;
-        right = i;
-    }
-    /* The following block fixes a long-existing flaw in the INDEL
+    if (s == n)
+        // there is no indel at this position.
+        return -1;
+
+    // find out how many types of indels are present
+    types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+                               &max_rd_len, &n_types, &ref_type, &N);
+    if (!types)
+        return -1;
+
+
+    // calculate left and right boundary
+    left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+    right = pos + INDEL_WINDOW_SIZE;
+    if (types[0] < 0) right -= types[0];
+
+    // in case the alignments extend beyond the end of the reference
+    for (i = pos; i < right; ++i)
+        if (ref[i] == 0) break;
+    right = i;
+
+
+    /* The following call fixes a long-existing flaw in the INDEL
       * calling model: the interference of nearby SNPs. However, it also
       * reduces the power because sometimes, substitutions caused by
       * indels are not distinguishable from true mutations. Multiple
@@ -187,284 +738,211 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
       *
       * Masks mismatches present in at least 70% of the reads with 'N'.
       */
-    { // construct per-sample consensus
-        int L = right - left + 1, max_i, max2_i;
-        uint32_t *cns, max, max2;
-        char *ref0, *r;
-        ref_sample = (char**) calloc(n, sizeof(char*));
-        cns = (uint32_t*) calloc(L, 4);
-        ref0 = (char*) calloc(L, 1);
-        for (i = 0; i < right - left; ++i)
-            ref0[i] = seq_nt16_table[(int)ref[i+left]];
-        for (s = 0; s < n; ++s) {
-            r = ref_sample[s] = (char*) calloc(L, 1);
-            memset(cns, 0, sizeof(int) * L);
-            // collect ref and non-ref counts
-            for (i = 0; i < n_plp[s]; ++i) {
-                bam_pileup1_t *p = plp[s] + i;
-                bam1_t *b = p->b;
-                uint32_t *cigar = bam_get_cigar(b);
-                uint8_t *seq = bam_get_seq(b);
-                int x = b->core.pos, y = 0;
-                for (k = 0; k < b->core.n_cigar; ++k) {
-                    int op = cigar[k]&0xf;
-                    int j, l = cigar[k]>>4;
-                    if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-                        for (j = 0; j < l; ++j)
-                            if (x + j >= left && x + j < right)
-                                cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
-                        x += l; y += l;
-                    } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
-                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
-                }
-            }
-            // determine the consensus
-            for (i = 0; i < right - left; ++i) r[i] = ref0[i];
-            max = max2 = 0; max_i = max2_i = -1;
-            for (i = 0; i < right - left; ++i) {
-                if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
-                else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
-            }
-            if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
-            if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
-            if (max_i >= 0) r[max_i] = 15;
-            if (max2_i >= 0) r[max2_i] = 15;
-            //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
-        }
-        free(ref0); free(cns);
-    }
-    { // the length of the homopolymer run around the current position
-        int c = seq_nt16_table[(int)ref[pos + 1]];
-        if (c == 15) l_run = 1;
-        else {
-            for (i = pos + 2; ref[i]; ++i)
-                if (seq_nt16_table[(int)ref[i]] != c) break;
-            l_run = i;
-            for (i = pos; i >= 0; --i)
-                if (seq_nt16_table[(int)ref[i]] != c) break;
-            l_run -= i + 1;
-        }
-    }
-    // construct the consensus sequence
+    ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right);
+
+    // The length of the homopolymer run around the current position
+    l_run = bcf_cgp_l_run(ref, pos);
+
+    // construct the consensus sequence (minus indels, which are added later)
      max_ins = types[n_types - 1];   // max_ins is at least 0
      if (max_ins > 0) {
-        int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
-        // count the number of occurrences of each base at each position for each type of insertion
-        for (t = 0; t < n_types; ++t) {
-            if (types[t] > 0) {
-                for (s = 0; s < n; ++s) {
-                    for (i = 0; i < n_plp[s]; ++i) {
-                        bam_pileup1_t *p = plp[s] + i;
-                        if (p->indel == types[t]) {
-                            uint8_t *seq = bam_get_seq(p->b);
-                            for (k = 1; k <= p->indel; ++k) {
-                                int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
-                                assert(c<5);
-                                ++inscns_aux[(t*max_ins+(k-1))*5 + c];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        // use the majority rule to construct the consensus
-        inscns = (char*) calloc(n_types * max_ins, 1);
-        for (t = 0; t < n_types; ++t) {
-            for (j = 0; j < types[t]; ++j) {
-                int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
-                for (k = 0; k < 5; ++k)
-                    if (ia[k] > max)
-                        max = ia[k], max_k = k;
-                inscns[t*max_ins + j] = max? max_k : 4;
-                if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
-            }
-        }
-        free(inscns_aux);
+        inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos,
+                                   types, n_types, max_ins, s);
+        if (!inscns)
+            return -1;
      }
+
      // compute the likelihood given each type of indel for each read
      max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
      ref2  = (char*) calloc(max_ref2, 1);
      query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
-    score1 = (int*) calloc(N * n_types, sizeof(int));
-    score2 = (int*) calloc(N * n_types, sizeof(int));
+    score = (int*) calloc(N * n_types, sizeof(int));
      bca->indelreg = 0;
+    double nqual_over_60 = bca->nqual / 60.0;
+
      for (t = 0; t < n_types; ++t) {
          int l, ir;
-        probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
-        apf1.bw = apf2.bw = abs(types[t]) + 3;
+
          // compute indelreg
-        if (types[t] == 0) ir = 0;
-        else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
-        else ir = est_indelreg(pos, ref, -types[t], 0);
-        if (ir > bca->indelreg) bca->indelreg = ir;
-//      fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
-        // realignment
+        if (types[t] == 0)
+            ir = 0;
+        else if (types[t] > 0)
+            ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+        else
+            ir = est_indelreg(pos, ref, -types[t], 0);
+
+        if (ir > bca->indelreg)
+            bca->indelreg = ir;
+
+        // Identify max deletion length
+        int max_deletion = 0;
+        for (s = 0; s < n; ++s) {
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+                if (max_deletion < -p->indel)
+                    max_deletion = -p->indel;
+            }
+        }
+
+        // Realignment score, computed via BAQ
          for (s = K = 0; s < n; ++s) {
-            // write ref2
+            // Construct ref2 from ref_sample, inscns and indels.
+            // This is now the true sample consensus (possibly prepended
+            // and appended with reference if sample data doesn't span
+            // the full length).
              for (k = 0, j = left; j <= pos; ++j)
                  ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
-            if (types[t] <= 0) j += -types[t];
-            else for (l = 0; l < types[t]; ++l)
-                     ref2[k++] = inscns[t*max_ins + l];
+
+            if (types[t] <= 0)
+                j += -types[t];
+            else
+                for (l = 0; l < types[t]; ++l)
+                    ref2[k++] = inscns[t*max_ins + l];
+
              for (; j < right && ref[j]; ++j)
                  ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
-            for (; k < max_ref2; ++k) ref2[k] = 4;
-            if (j < right) right = j;
+            for (; k < max_ref2; ++k)
+                ref2[k] = 4;
+
+            if (right > j)
+                right = j;
+
              // align each read to ref2
              for (i = 0; i < n_plp[s]; ++i, ++K) {
                  bam_pileup1_t *p = plp[s] + i;
-                int qbeg, qend, tbeg, tend, sc, kk;
+
+                // Some basic ref vs alt stats.
+                int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+                imq *= nqual_over_60;
+
+                int sc_len, slen, epos, sc_end;
+
+                // Only need to gather stats on one type, as it's
+                // identical calculation for all the subsequent ones
+                // and we're sharing the same stats array
+                if (t == 0) {
+                    // Gather stats for INFO field to aid filtering.
+                    // mq and sc_len not very helpful for filtering, but could
+                    // help in assigning a better QUAL value.
+                    //
+                    // Pos is slightly useful.
+                    // Base qual can be useful, but need qual prior to BAQ?
+                    // May need to cache orig quals in aux tag so we can fetch
+                    // them even after mpileup step.
+                    get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+                    assert(imq >= 0 && imq < bca->nqual);
+                    assert(epos >= 0 && epos < bca->npos);
+                    assert(sc_len >= 0 && sc_len < 100);
+                    if (p->indel) {
+                        bca->ialt_mq[imq]++;
+                        bca->ialt_scl[sc_len]++;
+                        bca->ialt_pos[epos]++;
+                    } else {
+                        bca->iref_mq[imq]++;
+                        bca->iref_scl[sc_len]++;
+                        bca->iref_pos[epos]++;
+                    }
+                }
+
+                int qbeg, qpos, qend, tbeg, tend, kk;
                  uint8_t *seq = bam_get_seq(p->b);
                  uint32_t *cigar = bam_get_cigar(p->b);
-                if (p->b->core.flag&4) continue; // unmapped reads
-                // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+                if (p->b->core.flag & BAM_FUNMAP) continue;
+
+                // FIXME: the following loop should be better moved outside;
+                // nonetheless, realignment should be much slower anyway.
                  for (kk = 0; kk < p->b->core.n_cigar; ++kk)
-                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
-                if (kk < p->b->core.n_cigar) continue;
-                // FIXME: the following skips soft clips, but using them may be more sensitive.
+                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+                        break;
+                if (kk < p->b->core.n_cigar)
+                    continue;
+
                  // determine the start and end of sequences for alignment
-                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left,  0, &tbeg);
-                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+                // FIXME: loops over CIGAR multiple times
+                int left2 = left, right2 = right;
+                if (p->b->core.l_qseq > 1000) {
+                    // long read data needs less context.  It also tends to
+                    // have many more candidate indels to investigate so
+                    // speed here matters more.
+                    if (pos - left >= INDEL_WINDOW_SIZE)
+                        left2 += INDEL_WINDOW_SIZE/2;
+                    if (right-pos >= INDEL_WINDOW_SIZE)
+                        right2 -= INDEL_WINDOW_SIZE/2;
+                }
+
+                int r_start = p->b->core.pos;
+                int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+                                           bam_get_cigar(p->b))
+                            -1 + r_start;
+
+                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2,
+                                 0, &tbeg);
+                qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+                                     0, &tend) - qbeg;
+                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2,
+                                 1, &tend);
+
                  if (types[t] < 0) {
                      int l = -types[t];
                      tbeg = tbeg - l > left?  tbeg - l : left;
                  }
+
                  // write the query sequence
                  for (l = qbeg; l < qend; ++l)
                      query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
-                { // do realignment; this is the bottleneck
-                    const uint8_t *qual = bam_get_qual(p->b), *bq;
-                    uint8_t *qq;
-                    qq = (uint8_t*) calloc(qend - qbeg, 1);
-                    bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
-                    if (bq) ++bq; // skip type
-                    for (l = qbeg; l < qend; ++l) {
-                        qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
-                        if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
-                        if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
-                    }
-                    sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                        (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
-                    l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
-                    if (l > 255) l = 255;
-                    score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
-                    if (sc > 5) {
-                        sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                            (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
-                        l = (int)(100. * sc / (qend - qbeg) + .499);
-                        if (l > 255) l = 255;
-                        score2[K*n_types + t] = sc<<8 | l;
+
+                // A fudge for now.  Consider checking SAM header for
+                // RG platform field.
+                int long_read = p->b->core.l_qseq > 1000;
+
+                // do realignment; this is the bottleneck
+                if (tend > tbeg) {
+                    if (bcf_cgp_align_score(p, bca, types[t],
+                                            (uint8_t *)ref2 + left2-left,
+                                            (uint8_t *)query,
+                                            r_start, r_end, long_read,
+                                            tbeg, tend, left2, right2,
+                                            qbeg, qend, qpos, max_deletion,
+                                            &score[K*n_types + t]) < 0) {
+                        score[K*n_types + t] = 0xffffff;
+                        return -1;
                      }
-                    free(qq);
+                } else {
+                    // placeholder large cost for reads that cover the
+                    // region entirely within a deletion (thus tend <= tbeg).
+                    score[K*n_types + t] = 0xffffff;
                  }
  #if 0
                  for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
                      fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
                  fputc('\n', stderr);
-                for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
+                for (l = 0; l < qend - qbeg; ++l)
+                    fputc("ACGTN"[(int)query[l]], stderr);
                  fputc('\n', stderr);
-                fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+                fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s "
+                        "qbeg=%d tbeg=%d score=%d\n",
+                        pos, types[t], s, i, bam_get_qname(p->b),
+                        qbeg, tbeg, sc);
  #endif
              }
          }
      }
-    free(ref2); free(query);
-    { // compute indelQ
-        int sc_a[16], sumq_a[16];
-        int tmp, *sc = sc_a, *sumq = sumq_a;
-        if (n_types > 16) {
-            sc   = (int *)malloc(n_types * sizeof(int));
-            sumq = (int *)malloc(n_types * sizeof(int));
-        }
-        memset(sumq, 0, n_types * sizeof(int));
-        for (s = K = 0; s < n; ++s) {
-            for (i = 0; i < n_plp[s]; ++i, ++K) {
-                bam_pileup1_t *p = plp[s] + i;
-                int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
-                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
-                for (t = 1; t < n_types; ++t) // insertion sort
-                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
-                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
-                /* errmod_cal() assumes that if the call is wrong, the
-                 * likelihoods of other events are equal. This is about
-                 * right for substitutions, but is not desired for
-                 * indels. To reuse errmod_cal(), I have to make
-                 * compromise for multi-allelic indels.
-                 */
-                if ((sc[0]&0x3f) == ref_type) {
-                    indelQ1 = (sc[1]>>14) - (sc[0]>>14);
-                    seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
-                } else {
-                    for (t = 0; t < n_types; ++t) // look for the reference type
-                        if ((sc[t]&0x3f) == ref_type) break;
-                    indelQ1 = (sc[t]>>14) - (sc[0]>>14);
-                    seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
-                }
-                tmp = sc[0]>>6 & 0xff;
-                indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
-                sct = &score2[K*n_types];
-                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
-                for (t = 1; t < n_types; ++t) // insertion sort
-                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
-                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
-                if ((sc[0]&0x3f) == ref_type) {
-                    indelQ2 = (sc[1]>>14) - (sc[0]>>14);
-                } else {
-                    for (t = 0; t < n_types; ++t) // look for the reference type
-                        if ((sc[t]&0x3f) == ref_type) break;
-                    indelQ2 = (sc[t]>>14) - (sc[0]>>14);
-                }
-                tmp = sc[0]>>6 & 0xff;
-                indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
-                // pick the smaller between indelQ1 and indelQ2
-                indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
-                if (indelQ > 255) indelQ = 255;
-                if (seqQ > 255) seqQ = 255;
-                p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
-                sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-//              fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
-            }
-        }
-        // determine bca->indel_types[] and bca->inscns
-        bca->maxins = max_ins;
-        bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
-        for (t = 0; t < n_types; ++t)
-            sumq[t] = sumq[t]<<6 | t;
-        for (t = 1; t < n_types; ++t) // insertion sort
-            for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
-                tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
-        for (t = 0; t < n_types; ++t) // look for the reference type
-            if ((sumq[t]&0x3f) == ref_type) break;
-        if (t) { // then move the reference type to the first
-            tmp = sumq[t];
-            for (; t > 0; --t) sumq[t] = sumq[t-1];
-            sumq[0] = tmp;
-        }
-        for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
-        for (t = 0; t < 4 && t < n_types; ++t) {
-            bca->indel_types[t] = types[sumq[t]&0x3f];
-            memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
-        }
-        // update p->aux
-        for (s = n_alt = 0; s < n; ++s) {
-            for (i = 0; i < n_plp[s]; ++i) {
-                bam_pileup1_t *p = plp[s] + i;
-                int x = types[p->aux>>16&0x3f];
-                for (j = 0; j < 4; ++j)
-                    if (x == bca->indel_types[j]) break;
-                p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
-                if ((p->aux>>16&0x3f) > 0) ++n_alt;
-                //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
-            }
-        }
  
-        if (sc   != sc_a)   free(sc);
-        if (sumq != sumq_a) free(sumq);
-    }
-    free(score1); free(score2);
+    // compute indelQ
+    n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+                                   ref_type, types, n_types, score);
+
      // free
-    for (i = 0; i < n; ++i) free(ref_sample[i]);
+    free(ref2);
+    free(query);
+    free(score);
+
+    for (i = 0; i < n; ++i)
+        free(ref_sample[i]);
+
      free(ref_sample);
      free(types); free(inscns);
+
      return n_alt > 0? 0 : -1;
  }
diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c

index 67fff21860a5e6d83bc9a781385814a7dffbeac8..82bf31cf916777efd7b8b76bf7e89d0bf2e926e8 100644 (file)
--- a/bcftools/bam2bcf_indel.c.pysam.c
+++ b/bcftools/bam2bcf_indel.c.pysam.c
@@ -3,7 +3,7 @@
  /*  bam2bcf_indel.c -- indel caller.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Copyright (C) 2012-2014,2016 Genome Research Ltd.
+    Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -28,19 +28,29 @@ DEALINGS IN THE SOFTWARE.  */
  #include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <math.h>
  #include <htslib/hts.h>
  #include <htslib/sam.h>
  #include <htslib/khash_str2int.h>
  #include "bam2bcf.h"
+#include "str_finder.h"
  
  #include <htslib/ksort.h>
  KSORT_INIT_GENERIC(uint32_t)
  
  #define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 50
+#define INDEL_WINDOW_SIZE 110
  
+#define MAX_TYPES 64
+
+// Take a reference position tpos and convert to a query position (returned).
+// This uses the CIGAR string plus alignment c->pos to do the mapping.
+//
+// *_tpos is returned as tpos if query overlaps tpos, but for deletions
+// it'll be either the start (is_left) or end (!is_left) ref position.
  static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
  {
+    // x = pos in ref, y = pos in query seq
      int k, x = c->pos, y = 0, last_y = 0;
      *_tpos = c->pos;
      for (k = 0; k < c->n_cigar; ++k) {
@@ -66,6 +76,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos,
      *_tpos = x;
      return last_y;
  }
+
  // FIXME: check if the inserted sequence is consistent with the homopolymer run
  // l is the relative gap length and l_run is the length of the homopolymer on the reference
  static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
@@ -89,21 +100,609 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
      return max_i - pos;
  }
  
+// Identify soft-clip length, position in seq, and clipped seq len
+//
+// Walks the CIGAR of the read in pileup entry p and fills out:
+//   *sc_len_r  soft-clip length scaled by 15 / (distance to the clip + 1),
+//              capped at 99; 0 if the read has no soft-clip.
+//   *slen_r    b->core.l_qseq minus the length of all soft-clips.
+//   *epos_r    p->qpos minus the left soft-clip, rescaled from
+//              0..slen to 0..bca->npos (truncated to int).
+//   *end       -1 if no soft-clip was used, 0 if the left clip was
+//              selected, 1 if the right clip was selected.
+static inline void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
+                           int *sc_len_r, int *slen_r, int *epos_r, int *end) {
+    bam1_t *b = p->b;
+    // at_left stays set until the first op that is not a soft/hard clip.
+    int sc_len = 0, sc_dist = -1, at_left = 1;
+    int epos = p->qpos, slen = b->core.l_qseq;
+    int k;
+    uint32_t *cigar = bam_get_cigar(b);
+    *end = -1;
+    for (k = 0; k < b->core.n_cigar; k++) {
+        int op = bam_cigar_op(cigar[k]);
+        if (op == BAM_CSOFT_CLIP) {
+            slen -= bam_cigar_oplen(cigar[k]);
+            if (at_left) {
+                // left end
+                sc_len += bam_cigar_oplen(cigar[k]);
+                epos -= sc_len; // don't count SC in seq pos
+                sc_dist = epos;
+                *end = 0;
+            } else {
+                // right end
+                int srlen = bam_cigar_oplen(cigar[k]);
+                // rd: distance from p->qpos to the start of the right clip
+                int rd = b->core.l_qseq - srlen - p->qpos;
+                if (sc_dist < 0 || sc_dist > rd) {
+                    // closer to right end than left
+                    // FIXME: compensate for indel length too?
+                    sc_dist = rd;
+                    sc_len = srlen;
+                    *end = 1;
+                }
+            }
+        } else if (op != BAM_CHARD_CLIP) {
+            at_left = 0;
+        }
+    }
+
+    if (p->indel > 0 && slen - (epos+p->indel) < epos)
+        epos += p->indel-1; // end of insertion, if near end of seq
+
+    // slen is now length of sequence minus soft-clips and
+    // epos is position of indel in seq minus left-clip.
+    *epos_r = (double)epos / (slen+1) * bca->npos;
+
+    if (sc_len) {
+        // scale importance of clip by distance to closest end
+        *sc_len_r = 15.0*sc_len / (sc_dist+1);
+        if (*sc_len_r > 99) *sc_len_r = 99;
+    } else {
+        *sc_len_r = 0;
+    }
+
+    *slen_r = slen;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Scans the pileup to identify all the different sizes of indels
+// present across the n samples (n_plp[s] reads in plp[s]).
+//
+// Also updates bca->max_support and bca->max_frac with the indel read
+// count and indel fraction of the sample having the most indel reads.
+//
+// Returns a calloc()ed array of the distinct indel sizes in sorted order
+//         (0 is the reference type; the caller frees it) and fills out
+//         n_types_r, max_rd_len_r, ref_type_r (index of the 0 entry) and
+//         N_r (total number of reads),
+//         or NULL on memory error or when the position is to be skipped
+//         (no type other than REF, insufficient indel support,
+//         MAX_TYPES or more types, or a reference window that is mostly
+//         N).  The output arguments are only written on success.
+static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, bcf_callaux_t *bca, const char *ref,
+                               int *max_rd_len_r, int *n_types_r,
+                               int *ref_type_r, int *N_r) {
+    int i, j, t, s, N, m, max_rd_len, n_types;
+    int n_alt = 0, n_tot = 0, indel_support_ok = 0;
+    uint32_t *aux;
+    int *types;
+
+    // N is the total number of reads
+    for (s = N = 0; s < n; ++s)
+        N += n_plp[s];
+
+    bca->max_support = bca->max_frac = 0;
+    // One slot per read plus one for the REF type added below.
+    aux = (uint32_t*) calloc(N + 1, 4);
+    if (!aux)
+        return NULL;
+
+    m = max_rd_len = 0;
+    // Sizes are stored offset by MINUS_CONST so that deletions (negative)
+    // remain orderable as unsigned values.
+    aux[m++] = MINUS_CONST; // zero indel is always a type (REF)
+
+    // Fill out aux[] array with all the non-zero indel sizes.
+    // Also tally number with indels (n_alt) and total (n_tot).
+    for (s = 0; s < n; ++s) {
+        int na = 0, nt = 0;
+        for (i = 0; i < n_plp[s]; ++i) {
+            const bam_pileup1_t *p = plp[s] + i;
+            ++nt;
+            if (p->indel != 0) {
+                ++na;
+                aux[m++] = MINUS_CONST + p->indel;
+            }
+
+            // FIXME: cache me in pileup struct.
+            j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+            if (j > max_rd_len) max_rd_len = j;
+        }
+        // Per-sample support test; overridden below by the pooled test
+        // unless bca->per_sample_flt is set.
+        double frac = (double)na/nt;
+        if ( !indel_support_ok && na >= bca->min_support
+             && frac >= bca->min_frac )
+            indel_support_ok = 1;
+        if ( na > bca->max_support && frac > 0 )
+            bca->max_support = na, bca->max_frac = frac;
+
+        n_alt += na;
+        n_tot += nt;
+    }
+
+    // Sort aux[] and count the distinct values
+    ks_introsort(uint32_t, m, aux);
+    for (i = 1, n_types = 1; i < m; ++i)
+        if (aux[i] != aux[i-1]) ++n_types;
+
+    // Taking totals makes it hard to call rare indels (IMF filter)
+    if ( !bca->per_sample_flt )
+        indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac
+                             || n_alt < bca->min_support )
+            ? 0 : 1;
+    if ( n_types == 1 || !indel_support_ok ) { // then skip
+        free(aux);
+        return NULL;
+    }
+
+    // Bail out if we have far too many types of indel
+    if (n_types >= MAX_TYPES) {
+        free(aux);
+        // TODO revisit how/whether to control printing this warning
+        if (hts_verbose >= 2)
+            fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. "
+                    "Skip the position.\n", __func__, pos + 1);
+        return NULL;
+    }
+
+    // To prevent long stretches of N's to be mistaken for indels
+    // (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where more than half of the reference
+    // bases are Ns.  The window starts at pos and spans the smaller of
+    // 2*INDEL_WINDOW_SIZE and max_rd_len bases (or up to the end of ref).
+    int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
+                            ?2*INDEL_WINDOW_SIZE : max_rd_len);
+    for (i=pos; i<i_end && ref[i]; i++)
+        nN += ref[i] == 'N';
+    if ( nN*2>(i-pos) ) {
+        free(aux);
+        return NULL;
+    }
+
+    // Finally fill out the types[] array detailing the size of insertion
+    // or deletion.
+    types = (int*)calloc(n_types, sizeof(int));
+    if (!types) {
+        free(aux);
+        return NULL;
+    }
+    // Copy the distinct values of the sorted aux[], removing the offset.
+    t = 0;
+    types[t++] = aux[0] - MINUS_CONST;
+    for (i = 1; i < m; ++i)
+        if (aux[i] != aux[i-1])
+            types[t++] = aux[i] - MINUS_CONST;
+    free(aux);
+
+    // Find reference type (the index t with types[t] == 0)
+    for (t = 0; t < n_types; ++t)
+        if (types[t] == 0) break;
+
+    *ref_type_r   = t;
+    *n_types_r    = n_types;
+    *max_rd_len_r = max_rd_len;
+    *N_r          = N;
+
+    return types;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Construct per-sample consensus over the reference window
+// [left, right).
+//
+// For each of the n samples the reference (as 4-bit seq_nt16_table
+// codes) is copied, and at most two positions - those with the highest
+// ALT counts whose REF fraction is below 70% - are replaced by code 15
+// (N).  The pos and bca arguments are not used by this function.
+//
+// Returns an array of n consensus seqs, each right - left + 1 bytes
+//         (the caller frees every entry and the array itself),
+//         or NULL on failure.
+static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp,
+                                 int pos, bcf_callaux_t *bca, const char *ref,
+                                 int left, int right) {
+    int i, k, s, L = right - left + 1, max_i, max2_i;
+    char **ref_sample; // returned
+    uint32_t *cns = NULL, max, max2;
+    char *ref0 = NULL, *r;
+    ref_sample = (char**) calloc(n, sizeof(char*));
+    cns = (uint32_t*) calloc(L, 4);
+    ref0 = (char*) calloc(L, 1);
+    if (!ref_sample || !cns || !ref0) {
+        // No per-sample buffers exist yet; nothing to free in the loop.
+        n = 0;
+        goto err;
+    }
+
+    // Convert ref ASCII to 0-15.
+    for (i = 0; i < right - left; ++i)
+        ref0[i] = seq_nt16_table[(int)ref[i+left]];
+
+    // NB: one consensus per sample 'n', not per indel type.
+    // FIXME: consider fixing this.  We should compute alignments vs
+    // types, not vs samples?  Or types/sample combined?
+    for (s = 0; s < n; ++s) {
+        r = ref_sample[s] = (char*) calloc(L, 1);
+        if (!r) {
+            // Samples 0..s-1 were allocated; free exactly those.
+            // (Was "s-1", which leaked ref_sample[s-1].)
+            n = s;
+            goto err;
+        }
+
+        memset(cns, 0, sizeof(int) * L);
+
+        // collect ref and non-ref counts in cns
+        // (REF count in the low 16 bits, ALT count in the high 16 bits)
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            bam1_t *b = p->b;
+            uint32_t *cigar = bam_get_cigar(b);
+            uint8_t *seq = bam_get_seq(b);
+            // x = pos in ref, y = pos in query seq
+            int x = b->core.pos, y = 0;
+
+            // TODO: pileup exposes pileup_ind, but we also need e.g.
+            // pileup_len to know how much of the current CIGAR op-len
+            // we've used (or have remaining).  If we had that, we
+            // could start at p->qpos without having to scan through
+            // the entire CIGAR string until we find it.
+            //
+            // Without it about all we could do is have a side channel
+            // to cache the last known coords.  Messy, so punt for now.
+            // This is no longer the bottleneck until we get to 1000s of
+            // CIGAR ops.
+
+            for (k = 0; k < b->core.n_cigar; ++k) {
+                int op = cigar[k]&0xf;
+                int j, l = cigar[k]>>4;
+                if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                    if (x + l >= left) {
+                        // Clip this op to the [left, right) window.
+                        j = left - x > 0 ? left - x : 0;
+                        int j_end = right - x < l ? right - x : l;
+                        for (; j < j_end; j++)
+                            // Append to cns.  Note this is ref coords,
+                            // so insertions aren't in cns and deletions
+                            // will have lower coverage.
+
+                            // FIXME: want true consensus (with ins) per
+                            // type, so we can independently compare each
+                            // seq to each consensus and see which it
+                            // matches best, so we get proper GT analysis.
+                            cns[x+j-left] +=
+                                (bam_seqi(seq, y+j) == ref0[x+j-left])
+                                ? 1        // REF
+                                : (1<<16); // ALT
+                    }
+                    x += l; y += l;
+                } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                    x += l;
+                } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                    y += l;
+                }
+
+                // Past the window; the rest of the CIGAR cannot contribute.
+                if (x > right)
+                    break;
+            }
+        }
+
+        // Determine a sample specific reference.
+        for (i = 0; i < right - left; ++i)
+            r[i] = ref0[i];
+
+        // Find deepest and 2nd deepest ALT region (max & max2).
+        max = max2 = 0; max_i = max2_i = -1;
+        for (i = 0; i < right - left; ++i) {
+            if (cns[i]>>16 >= max>>16)
+                max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+            else if (cns[i]>>16 >= max2>>16)
+                max2 = cns[i], max2_i = i;
+        }
+
+        // Masks mismatches present in at least 70% of the reads with 'N'.
+        // This code is nREF/(nREF+n_ALT) >= 70% for deepest region.
+        // The effect is that at least 30% of bases differing to REF will
+        // use "N" in consensus, so we don't penalise ALT or REF when
+        // aligning against it.  (A poor man IUPAC code)
+        //
+        // Why is it only done in two loci at most?
+        if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7)
+            max_i = -1;
+        if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7)
+            max2_i = -1;
+        if (max_i >= 0) r[max_i] = 15;
+        if (max2_i >= 0) r[max2_i] = 15;
+
+        //for (i = 0; i < right - left; ++i)
+        //    fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr);
+        //fputc('\n', bcftools_stderr);
+    }
+
+    free(ref0);
+    free(cns);
+
+    return ref_sample;
+
+ err:
+    free(ref0);
+    free(cns);
+    if (ref_sample) {
+        // n was reduced above to the number of allocated entries.
+        for (s = 0; s < n; s++)
+            free(ref_sample[s]);
+        free(ref_sample);
+    }
+
+    return NULL;
+}
+
+// Length of the homopolymer run in ref containing position pos + 1.
+//
+// Bases are compared by their seq_nt16_table codes.  When the base at
+// pos + 1 has code 15 the run length is reported as 1.
+static int bcf_cgp_l_run(const char *ref, int pos) {
+    const int base = seq_nt16_table[(int)ref[pos + 1]];
+    int lo, hi;
+
+    if (base == 15)
+        return 1;
+
+    // Extend to the right until the base changes or ref ends ...
+    hi = pos + 2;
+    while (ref[hi] && seq_nt16_table[(int)ref[hi]] == base)
+        ++hi;
+
+    // ... and to the left until the base changes or ref starts.
+    lo = pos;
+    while (lo >= 0 && seq_nt16_table[(int)ref[lo]] == base)
+        --lo;
+
+    // The run occupies the open interval (lo, hi).
+    return hi - lo - 1;
+}
+
+
+// Compute the consensus sequence of each insertion type, by majority
+// vote over the reads of all n samples carrying that insertion.
+//
+// NB: the 's' argument is overwritten by the sample loop below, so the
+// consensus is built from every sample rather than from sample 's' only.
+// The pos argument is not used by this function.
+//
+// Insertion types whose consensus contains an N are discarded by setting
+// types[t] to 0 in the caller's array.
+//
+// Returns a calloc()ed array of n_types * max_ins base codes (0-4, as
+//         given by seq_nt16_int), with the consensus for type t
+//         starting at offset t * max_ins; the caller frees it.
+//         NULL on memory allocation failure.
+static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
+                               int pos, int *types, int n_types,
+                               int max_ins, int s) {
+    int i, j, t, k;
+    // Five counters (one per base code 0-4) per type per inserted base.
+    int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int));
+    if (!inscns_aux)
+        return NULL;
+
+    // Count the number of occurrences of each base at each position for
+    // each type of insertion.
+    for (t = 0; t < n_types; ++t) {
+        if (types[t] > 0) {
+            for (s = 0; s < n; ++s) {
+                for (i = 0; i < n_plp[s]; ++i) {
+                    bam_pileup1_t *p = plp[s] + i;
+                    if (p->indel == types[t]) {
+                        uint8_t *seq = bam_get_seq(p->b);
+                        // Inserted bases follow p->qpos in the read.
+                        for (k = 1; k <= p->indel; ++k) {
+                            int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+                            assert(c<5);
+                            ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Use the majority rule to construct the consensus
+    char *inscns = (char *)calloc(n_types * max_ins, 1);
+    if (!inscns) {
+        // Previously unchecked: would have been dereferenced below.
+        free(inscns_aux);
+        return NULL;
+    }
+    for (t = 0; t < n_types; ++t) {
+        for (j = 0; j < types[t]; ++j) {
+            int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+            for (k = 0; k < 5; ++k)
+                if (ia[k] > max)
+                    max = ia[k], max_k = k;
+            // No observations at this position counts as N (code 4).
+            inscns[t*max_ins + j] = max ? max_k : 4;
+            if (max_k == 4) {
+                // discard insertions which contain N's
+                types[t] = 0;
+                break;
+            }
+        }
+    }
+    free(inscns_aux);
+
+    return inscns;
+}
+
+#ifndef MIN
+#  define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+// Part of bcf_call_gap_prep.
+//
+// Realign using BAQ to get an alignment score of a single read vs
+// a haplotype consensus.
+//
+// The segment of ref2 starting at offset tbeg - left and of length
+// tend - tbeg + abs(type) is aligned against 'query' (qend - qbeg
+// bases) with probaln_glocal().  Base qualities are taken from p->b
+// over [qbeg, qend), adjusted by the ZQ aux tag when present, and
+// clamped to the range 7..30.  long_read selects alternative gap
+// open/extension probabilities.
+//
+// Fills out score: the upper bits hold the raw probaln_glocal() score
+// and the bottom 8 bits a length-normalised score that is further
+// adjusted for STRs overlapping qpos.  If probaln_glocal() returns a
+// negative value, score is set to 0xffffff instead.
+// The 'right' and 'max_deletion' arguments are not used here.
+//
+// Returns 0 on success (also when probaln_glocal() failed),
+//        <0 on error (memory allocation failure)
+static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
+                               int type, uint8_t *ref2, uint8_t *query,
+                               int r_start, int r_end, int long_read,
+                               int tbeg, int tend,
+                               int left, int right,
+                               int qbeg, int qend,
+                               int qpos, int max_deletion,
+                               int *score) {
+    // Illumina
+    probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+    // Parameters that work better on PacBio CCS 15k.
+    // We should consider querying the header and RG PU field.
+    // See also htslib/realn.c:sam_prob_realn()
+    if (long_read) {
+        apf.d = 1e-3;
+        apf.e = 1e-1;
+    }
+
+    // Only the indel size matters from here on; the band width is
+    // widened to accommodate it.
+    type = abs(type);
+    apf.bw = type + 3;
+    int l, sc;
+    const uint8_t *qual = bam_get_qual(p->b), *bq;
+    uint8_t *qq;
+
+    // Get segment of quality, either ZQ tag or if absent QUAL.
+    if (!(qq = (uint8_t*) calloc(qend - qbeg, 1)))
+        return -1;
+    bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+    if (bq) ++bq; // skip type
+    for (l = qbeg; l < qend; ++l) {
+        // ZQ values are stored offset by 64 and added to QUAL.
+        int qval = bq? qual[l] + (bq[l] - 64) : qual[l];
+        if (qval > 30)
+            qval = 30;
+        if (qval < 7)
+            qval = 7;
+        qq[l - qbeg] = qval;
+    }
+
+    // The bottom 8 bits are length-normalised score while
+    // the top bits are unnormalised.
+    sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type,
+                        query, qend - qbeg, qq, &apf, 0, 0);
+    if (sc < 0) {
+        *score = 0xffffff;
+        free(qq);
+        return 0;
+    }
+
+    // used for adjusting indelQ below
+    l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias;
+    *score = sc<<8 | MIN(255, l);
+
+    rep_ele *reps, *elt, *tmp;
+    uint8_t *seg = ref2 + tbeg - left;
+    int seg_len = tend - tbeg + type;
+
+    // Note: although seg moves (tbeg varies), ref2 is reused many times
+    // so we could factor out some find_STR calls.  However it's not the
+    // bottleneck for now.
+
+    // FIXME: need to make this work on IUPAC.
+    reps = find_STR((char *)seg, seg_len, 0);
+    int iscore = 0;
+
+    // Identify STRs in ref covering the indel up to
+    // (or close to) the end of the sequence.
+    // Those having an indel and right at the sequence
+    // end do not confirm the total length of indel
+    // size.  Specifically a *lack* of indel at the
+    // end, where we know indels occur in other
+    // sequences, is a possible reference bias.
+    //
+    // This is emphasised further if the sequence ends with
+    // soft clipping.
+    //
+    // Every list element is unlinked and freed as it is visited.
+    DL_FOREACH_SAFE(reps, elt, tmp) {
+        if (elt->start <= qpos && elt->end >= qpos) {
+            iscore += (elt->end-elt->start) / elt->rep_len;  // c
+            // Extra penalty when the STR reaches r_start or r_end.
+            if (elt->start+tbeg <= r_start ||
+                elt->end+tbeg   >= r_end)
+                iscore += 2*(elt->end-elt->start);
+       }
+
+        DL_DELETE(reps, elt);
+        free(elt);
+    }
+
+    // Apply STR score to existing indelQ
+    // (only the bottom 8 bits of score are rewritten, capped at 255)
+    l  =  (*score&0xff)*.8 + iscore*2;
+    *score = (*score & ~0xff) | MIN(255, l);
+
+    free(qq);
+
+    return 0;
+}
+
+// Part of bcf_call_gap_prep.
+//
+// Uses the per-read, per-type alignment scores (score[K*n_types + t] for
+// the K'th read overall) to pick the best indel type for each read and
+// derive its seqQ and indelQ, stored in p->aux as
+//     (call index)<<16 | seqQ<<8 | indelQ.
+// It then fills bca->indel_types[] (at most 4 types, the reference type
+// first, the others ordered by decreasing summed quality) along with
+// bca->maxins and bca->inscns, and finally rewrites the call index in
+// each p->aux to index bca->indel_types[].  Reads whose type did not
+// make it into bca->indel_types[] get index 4 and zeroed qualities.
+//
+// n_types must be below MAX_TYPES (64): type indices are packed into 6
+// bits and sc[]/sumq[] are sized accordingly.
+//
+// Returns n_alt on success (the number of reads with a non-zero call
+//         index)
+//         -1 on failure (memory allocation); bca->inscns and
+//         bca->maxins are then left unchanged.
+static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
+                                  bcf_callaux_t *bca, char *inscns,
+                                  int l_run, int max_ins,
+                                  int ref_type, int *types, int n_types,
+                                  int *score) {
+    // Fixed size arrays suffice as n_types is bounded by MAX_TYPES.
+    int sc[MAX_TYPES], sumq[MAX_TYPES], s, i, j, t, K, n_alt, tmp;
+    memset(sumq, 0, n_types * sizeof(int));
+    for (s = K = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i, ++K) {
+            bam_pileup1_t *p = plp[s] + i;
+            int *sct = &score[K*n_types], seqQ, indelQ;
+            // Pack the type index into the low 6 bits so that sorting by
+            // score carries the type along.
+            for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+            for (t = 1; t < n_types; ++t) // insertion sort
+                for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+                    tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+
+            /* errmod_cal() assumes that if the call is wrong, the
+             * likelihoods of other events are equal. This is about
+             * right for substitutions, but is not desired for
+             * indels. To reuse errmod_cal(), I have to make
+             * compromise for multi-allelic indels.
+             */
+            // sc[]>>14 drops the type index (6 bits) and the normalised
+            // score (8 bits), leaving the unnormalised score.
+            if ((sc[0]&0x3f) == ref_type) {
+                indelQ = (sc[1]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+            } else {
+                for (t = 0; t < n_types; ++t) // look for the reference type
+                    if ((sc[t]&0x3f) == ref_type) break;
+                indelQ = (sc[t]>>14) - (sc[0]>>14);
+                seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+            }
+            // Length-normalised score of the best type.
+            tmp = sc[0]>>6 & 0xff;
+            // reduce indelQ
+            indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
+
+            // Doesn't really help accuracy, but permits -h to take
+            // effect still.
+            if (indelQ > seqQ) indelQ = seqQ;
+            if (indelQ > 255) indelQ = 255;
+            if (seqQ > 255) seqQ = 255;
+            p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+            sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+            //              fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+        }
+    }
+    // determine bca->indel_types[] and bca->inscns
+    if (max_ins) {
+        // Resize through a temporary so that the existing buffer is not
+        // leaked (and bca is left consistent) if realloc fails.
+        char *new_inscns = (char*) realloc(bca->inscns, max_ins * 4);
+        if (!new_inscns)
+            return -1;
+        bca->inscns = new_inscns;
+    }
+    bca->maxins = max_ins;
+    for (t = 0; t < n_types; ++t)
+        sumq[t] = sumq[t]<<6 | t;
+    for (t = 1; t < n_types; ++t) // insertion sort
+        for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t = 0; t < n_types; ++t) // look for the reference type
+        if ((sumq[t]&0x3f) == ref_type) break;
+    if (t) { // then move the reference type to the first
+        tmp = sumq[t];
+        for (; t > 0; --t) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+    for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+    for (t = 0; t < 4 && t < n_types; ++t) {
+        bca->indel_types[t] = types[sumq[t]&0x3f];
+        if (bca->maxins)
+            memcpy(&bca->inscns[t * bca->maxins],
+                   &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+    }
+    // update p->aux
+    for (s = n_alt = 0; s < n; ++s) {
+        for (i = 0; i < n_plp[s]; ++i) {
+            bam_pileup1_t *p = plp[s] + i;
+            int x = types[p->aux>>16&0x3f];
+            // Translate the type into its slot in bca->indel_types[];
+            // j == 4 means the type was not retained.
+            for (j = 0; j < 4; ++j)
+                if (x == bca->indel_types[j]) break;
+            p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+            if ((p->aux>>16&0x3f) > 0) ++n_alt;
+            //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+        }
+    }
+
+    return n_alt;
+}
+
+/*
+FIXME: with high number of samples, do we handle IMF correctly?  Is it
+fraction of indels across entire data set, or just fraction for this
+specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt.
+ */
+
  /*
      notes:
-        - n .. number of samples
-        - the routine sets bam_pileup1_t.aux of each read as follows:
-            - 6: unused
-            - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
-            - 8: estimated sequence quality                     .. (aux>>8)&0xff
-            - 8: indel quality                                  .. aux&0xff
+    - n .. number of samples
+    - the routine sets bam_pileup1_t.aux of each read as follows:
+        - 6: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
   */
-int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
+                      bcf_callaux_t *bca, const char *ref)
  {
-    int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+    if (ref == 0 || bca == 0) return -1;
+
+    int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins;
+    int *score, max_ref2;
      int N, K, l_run, ref_type, n_alt;
      char *inscns = 0, *ref2, *query, **ref_sample;
-    if (ref == 0 || bca == 0) return -1;
  
      // determine if there is a gap
      for (s = N = 0; s < n; ++s) {
@@ -111,77 +710,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
              if (plp[s][i].indel != 0) break;
          if (i < n_plp[s]) break;
      }
-    if (s == n) return -1; // there is no indel at this position.
-    for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
-    { // find out how many types of indels are present
-        bca->max_support = bca->max_frac = 0;
-        int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
-        uint32_t *aux;
-        aux = (uint32_t*) calloc(N + 1, 4);
-        m = max_rd_len = 0;
-        aux[m++] = MINUS_CONST; // zero indel is always a type
-        for (s = 0; s < n; ++s) {
-            int na = 0, nt = 0;
-            for (i = 0; i < n_plp[s]; ++i) {
-                const bam_pileup1_t *p = plp[s] + i;
-                ++nt;
-                if (p->indel != 0) {
-                    ++na;
-                    aux[m++] = MINUS_CONST + p->indel;
-                }
-                j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
-                if (j > max_rd_len) max_rd_len = j;
-            }
-            double frac = (double)na/nt;
-            if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
-                indel_support_ok = 1;
-            if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
-            n_alt += na;
-            n_tot += nt;
-        }
-        // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
-        //  check the number of N's in the sequence and skip places where half or more reference bases are Ns.
-        int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
-        if ( nN*2>(i-pos) ) { free(aux); return -1; }
-
-        ks_introsort(uint32_t, m, aux);
-        // squeeze out identical types
-        for (i = 1, n_types = 1; i < m; ++i)
-            if (aux[i] != aux[i-1]) ++n_types;
-        // Taking totals makes it hard to call rare indels
-        if ( !bca->per_sample_flt )
-            indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
-        if ( n_types == 1 || !indel_support_ok ) { // then skip
-            free(aux); return -1;
-        }
-        if (n_types >= 64) {
-            free(aux);
-            // TODO revisit how/whether to control printing this warning
-            if (hts_verbose >= 2)
-                fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
-            return -1;
-        }
-        types = (int*)calloc(n_types, sizeof(int));
-        t = 0;
-        types[t++] = aux[0] - MINUS_CONST;
-        for (i = 1; i < m; ++i)
-            if (aux[i] != aux[i-1])
-                types[t++] = aux[i] - MINUS_CONST;
-        free(aux);
-        for (t = 0; t < n_types; ++t)
-            if (types[t] == 0) break;
-        ref_type = t; // the index of the reference type (0)
-    }
-    { // calculate left and right boundary
-        left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
-        right = pos + INDEL_WINDOW_SIZE;
-        if (types[0] < 0) right -= types[0];
-        // in case the alignments stand out the reference
-        for (i = pos; i < right; ++i)
-            if (ref[i] == 0) break;
-        right = i;
-    }
-    /* The following block fixes a long-existing flaw in the INDEL
+    if (s == n)
+        // there is no indel at this position.
+        return -1;
+
+    // find out how many types of indels are present
+    types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref,
+                               &max_rd_len, &n_types, &ref_type, &N);
+    if (!types)
+        return -1;
+
+
+    // calculate left and right boundary
+    left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+    right = pos + INDEL_WINDOW_SIZE;
+    if (types[0] < 0) right -= types[0];
+
+    // in case the alignments stand out the reference
+    for (i = pos; i < right; ++i)
+        if (ref[i] == 0) break;
+    right = i;
+
+
+    /* The following call fixes a long-existing flaw in the INDEL
       * calling model: the interference of nearby SNPs. However, it also
       * reduces the power because sometimes, substitutions caused by
       * indels are not distinguishable from true mutations. Multiple
@@ -189,284 +740,211 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
       *
       * Masks mismatches present in at least 70% of the reads with 'N'.
       */
-    { // construct per-sample consensus
-        int L = right - left + 1, max_i, max2_i;
-        uint32_t *cns, max, max2;
-        char *ref0, *r;
-        ref_sample = (char**) calloc(n, sizeof(char*));
-        cns = (uint32_t*) calloc(L, 4);
-        ref0 = (char*) calloc(L, 1);
-        for (i = 0; i < right - left; ++i)
-            ref0[i] = seq_nt16_table[(int)ref[i+left]];
-        for (s = 0; s < n; ++s) {
-            r = ref_sample[s] = (char*) calloc(L, 1);
-            memset(cns, 0, sizeof(int) * L);
-            // collect ref and non-ref counts
-            for (i = 0; i < n_plp[s]; ++i) {
-                bam_pileup1_t *p = plp[s] + i;
-                bam1_t *b = p->b;
-                uint32_t *cigar = bam_get_cigar(b);
-                uint8_t *seq = bam_get_seq(b);
-                int x = b->core.pos, y = 0;
-                for (k = 0; k < b->core.n_cigar; ++k) {
-                    int op = cigar[k]&0xf;
-                    int j, l = cigar[k]>>4;
-                    if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-                        for (j = 0; j < l; ++j)
-                            if (x + j >= left && x + j < right)
-                                cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
-                        x += l; y += l;
-                    } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
-                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
-                }
-            }
-            // determine the consensus
-            for (i = 0; i < right - left; ++i) r[i] = ref0[i];
-            max = max2 = 0; max_i = max2_i = -1;
-            for (i = 0; i < right - left; ++i) {
-                if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
-                else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
-            }
-            if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
-            if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
-            if (max_i >= 0) r[max_i] = 15;
-            if (max2_i >= 0) r[max2_i] = 15;
-            //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr); fputc('\n', bcftools_stderr);
-        }
-        free(ref0); free(cns);
-    }
-    { // the length of the homopolymer run around the current position
-        int c = seq_nt16_table[(int)ref[pos + 1]];
-        if (c == 15) l_run = 1;
-        else {
-            for (i = pos + 2; ref[i]; ++i)
-                if (seq_nt16_table[(int)ref[i]] != c) break;
-            l_run = i;
-            for (i = pos; i >= 0; --i)
-                if (seq_nt16_table[(int)ref[i]] != c) break;
-            l_run -= i + 1;
-        }
-    }
-    // construct the consensus sequence
+    ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right);
+
+    // The length of the homopolymer run around the current position
+    l_run = bcf_cgp_l_run(ref, pos);
+
+    // construct the consensus sequence (minus indels, which are added later)
      max_ins = types[n_types - 1];   // max_ins is at least 0
      if (max_ins > 0) {
-        int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
-        // count the number of occurrences of each base at each position for each type of insertion
-        for (t = 0; t < n_types; ++t) {
-            if (types[t] > 0) {
-                for (s = 0; s < n; ++s) {
-                    for (i = 0; i < n_plp[s]; ++i) {
-                        bam_pileup1_t *p = plp[s] + i;
-                        if (p->indel == types[t]) {
-                            uint8_t *seq = bam_get_seq(p->b);
-                            for (k = 1; k <= p->indel; ++k) {
-                                int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
-                                assert(c<5);
-                                ++inscns_aux[(t*max_ins+(k-1))*5 + c];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        // use the majority rule to construct the consensus
-        inscns = (char*) calloc(n_types * max_ins, 1);
-        for (t = 0; t < n_types; ++t) {
-            for (j = 0; j < types[t]; ++j) {
-                int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
-                for (k = 0; k < 5; ++k)
-                    if (ia[k] > max)
-                        max = ia[k], max_k = k;
-                inscns[t*max_ins + j] = max? max_k : 4;
-                if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
-            }
-        }
-        free(inscns_aux);
+        inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos,
+                                   types, n_types, max_ins, s);
+        if (!inscns)
+            return -1;
      }
+
      // compute the likelihood given each type of indel for each read
      max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
      ref2  = (char*) calloc(max_ref2, 1);
      query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
-    score1 = (int*) calloc(N * n_types, sizeof(int));
-    score2 = (int*) calloc(N * n_types, sizeof(int));
+    score = (int*) calloc(N * n_types, sizeof(int));
      bca->indelreg = 0;
+    double nqual_over_60 = bca->nqual / 60.0;
+
      for (t = 0; t < n_types; ++t) {
          int l, ir;
-        probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
-        apf1.bw = apf2.bw = abs(types[t]) + 3;
+
          // compute indelreg
-        if (types[t] == 0) ir = 0;
-        else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
-        else ir = est_indelreg(pos, ref, -types[t], 0);
-        if (ir > bca->indelreg) bca->indelreg = ir;
-//      fprintf(bcftools_stderr, "%d, %d, %d\n", pos, types[t], ir);
-        // realignment
+        if (types[t] == 0)
+            ir = 0;
+        else if (types[t] > 0)
+            ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+        else
+            ir = est_indelreg(pos, ref, -types[t], 0);
+
+        if (ir > bca->indelreg)
+            bca->indelreg = ir;
+
+        // Identify max deletion length
+        int max_deletion = 0;
+        for (s = 0; s < n; ++s) {
+            for (i = 0; i < n_plp[s]; ++i, ++K) {
+                bam_pileup1_t *p = plp[s] + i;
+                if (max_deletion < -p->indel)
+                    max_deletion = -p->indel;
+            }
+        }
+
+        // Realignment score, computed via BAQ
          for (s = K = 0; s < n; ++s) {
-            // write ref2
+            // Construct ref2 from ref_sample, inscns and indels.
+            // This is now the true sample consensus (possibly prepended
+            // and appended with reference if sample data doesn't span
+            // the full length).
              for (k = 0, j = left; j <= pos; ++j)
                  ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
-            if (types[t] <= 0) j += -types[t];
-            else for (l = 0; l < types[t]; ++l)
-                     ref2[k++] = inscns[t*max_ins + l];
+
+            if (types[t] <= 0)
+                j += -types[t];
+            else
+                for (l = 0; l < types[t]; ++l)
+                    ref2[k++] = inscns[t*max_ins + l];
+
              for (; j < right && ref[j]; ++j)
                  ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
-            for (; k < max_ref2; ++k) ref2[k] = 4;
-            if (j < right) right = j;
+            for (; k < max_ref2; ++k)
+                ref2[k] = 4;
+
+            if (right > j)
+                right = j;
+
              // align each read to ref2
              for (i = 0; i < n_plp[s]; ++i, ++K) {
                  bam_pileup1_t *p = plp[s] + i;
-                int qbeg, qend, tbeg, tend, sc, kk;
+
+                // Some basic ref vs alt stats.
+                int imq = p->b->core.qual > 59 ? 59 : p->b->core.qual;
+                imq *= nqual_over_60;
+
+                int sc_len, slen, epos, sc_end;
+
+                // Only need to gather stats on one type, as it's
+                // identical calculation for all the subsequent ones
+                // and we're sharing the same stats array
+                if (t == 0) {
+                    // Gather stats for INFO field to aid filtering.
+                    // mq and sc_len not very helpful for filtering, but could
+                    // help in assigning a better QUAL value.
+                    //
+                    // Pos is slightly useful.
+                    // Base qual can be useful, but need qual prior to BAQ?
+                    // May need to cache orig quals in aux tag so we can fetch
+                    // them even after mpileup step.
+                    get_pos(bca, p, &sc_len, &slen, &epos, &sc_end);
+
+                    assert(imq >= 0 && imq < bca->nqual);
+                    assert(epos >= 0 && epos < bca->npos);
+                    assert(sc_len >= 0 && sc_len < 100);
+                    if (p->indel) {
+                        bca->ialt_mq[imq]++;
+                        bca->ialt_scl[sc_len]++;
+                        bca->ialt_pos[epos]++;
+                    } else {
+                        bca->iref_mq[imq]++;
+                        bca->iref_scl[sc_len]++;
+                        bca->iref_pos[epos]++;
+                    }
+                }
+
+                int qbeg, qpos, qend, tbeg, tend, kk;
                  uint8_t *seq = bam_get_seq(p->b);
                  uint32_t *cigar = bam_get_cigar(p->b);
-                if (p->b->core.flag&4) continue; // unmapped reads
-                // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+                if (p->b->core.flag & BAM_FUNMAP) continue;
+
+                // FIXME: the following loop should be better moved outside;
+                // nonetheless, realignment should be much slower anyway.
                  for (kk = 0; kk < p->b->core.n_cigar; ++kk)
-                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
-                if (kk < p->b->core.n_cigar) continue;
-                // FIXME: the following skips soft clips, but using them may be more sensitive.
+                    if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
+                        break;
+                if (kk < p->b->core.n_cigar)
+                    continue;
+
                  // determine the start and end of sequences for alignment
-                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left,  0, &tbeg);
-                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+                // FIXME: loops over CIGAR multiple times
+                int left2 = left, right2 = right;
+                if (p->b->core.l_qseq > 1000) {
+                    // long read data needs less context.  It also tends to
+                    // have many more candidate indels to investigate so
+                    // speed here matters more.
+                    if (pos - left >= INDEL_WINDOW_SIZE)
+                        left2 += INDEL_WINDOW_SIZE/2;
+                    if (right-pos >= INDEL_WINDOW_SIZE)
+                        right2 -= INDEL_WINDOW_SIZE/2;
+                }
+
+                int r_start = p->b->core.pos;
+                int r_end = bam_cigar2rlen(p->b->core.n_cigar,
+                                           bam_get_cigar(p->b))
+                            -1 + r_start;
+
+                qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2,
+                                 0, &tbeg);
+                qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos,
+                                     0, &tend) - qbeg;
+                qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2,
+                                 1, &tend);
+
                  if (types[t] < 0) {
                      int l = -types[t];
                      tbeg = tbeg - l > left?  tbeg - l : left;
                  }
+
                  // write the query sequence
                  for (l = qbeg; l < qend; ++l)
                      query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
-                { // do realignment; this is the bottleneck
-                    const uint8_t *qual = bam_get_qual(p->b), *bq;
-                    uint8_t *qq;
-                    qq = (uint8_t*) calloc(qend - qbeg, 1);
-                    bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
-                    if (bq) ++bq; // skip type
-                    for (l = qbeg; l < qend; ++l) {
-                        qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
-                        if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
-                        if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
-                    }
-                    sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                        (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
-                    l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
-                    if (l > 255) l = 255;
-                    score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
-                    if (sc > 5) {
-                        sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
-                                            (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
-                        l = (int)(100. * sc / (qend - qbeg) + .499);
-                        if (l > 255) l = 255;
-                        score2[K*n_types + t] = sc<<8 | l;
+
+                // A fudge for now.  Consider checking SAM header for
+                // RG platform field.
+                int long_read = p->b->core.l_qseq > 1000;
+
+                // do realignment; this is the bottleneck
+                if (tend > tbeg) {
+                    if (bcf_cgp_align_score(p, bca, types[t],
+                                            (uint8_t *)ref2 + left2-left,
+                                            (uint8_t *)query,
+                                            r_start, r_end, long_read,
+                                            tbeg, tend, left2, right2,
+                                            qbeg, qend, qpos, max_deletion,
+                                            &score[K*n_types + t]) < 0) {
+                        score[K*n_types + t] = 0xffffff;
+                        return -1;
                      }
-                    free(qq);
+                } else {
+                    // place holder large cost for reads that cover the
+                    // region entirely within a deletion (thus tend < tbeg).
+                    score[K*n_types + t] = 0xffffff;
                  }
  #if 0
                  for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
                      fputc("ACGTN"[(int)ref2[tbeg-left+l]], bcftools_stderr);
                  fputc('\n', bcftools_stderr);
-                for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], bcftools_stderr);
+                for (l = 0; l < qend - qbeg; ++l)
+                    fputc("ACGTN"[(int)query[l]], bcftools_stderr);
                  fputc('\n', bcftools_stderr);
-                fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc);
+                fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s "
+                        "qbeg=%d tbeg=%d score=%d\n",
+                        pos, types[t], s, i, bam_get_qname(p->b),
+                        qbeg, tbeg, sc);
  #endif
              }
          }
      }
-    free(ref2); free(query);
-    { // compute indelQ
-        int sc_a[16], sumq_a[16];
-        int tmp, *sc = sc_a, *sumq = sumq_a;
-        if (n_types > 16) {
-            sc   = (int *)malloc(n_types * sizeof(int));
-            sumq = (int *)malloc(n_types * sizeof(int));
-        }
-        memset(sumq, 0, n_types * sizeof(int));
-        for (s = K = 0; s < n; ++s) {
-            for (i = 0; i < n_plp[s]; ++i, ++K) {
-                bam_pileup1_t *p = plp[s] + i;
-                int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
-                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
-                for (t = 1; t < n_types; ++t) // insertion sort
-                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
-                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
-                /* errmod_cal() assumes that if the call is wrong, the
-                 * likelihoods of other events are equal. This is about
-                 * right for substitutions, but is not desired for
-                 * indels. To reuse errmod_cal(), I have to make
-                 * compromise for multi-allelic indels.
-                 */
-                if ((sc[0]&0x3f) == ref_type) {
-                    indelQ1 = (sc[1]>>14) - (sc[0]>>14);
-                    seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
-                } else {
-                    for (t = 0; t < n_types; ++t) // look for the reference type
-                        if ((sc[t]&0x3f) == ref_type) break;
-                    indelQ1 = (sc[t]>>14) - (sc[0]>>14);
-                    seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
-                }
-                tmp = sc[0]>>6 & 0xff;
-                indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
-                sct = &score2[K*n_types];
-                for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
-                for (t = 1; t < n_types; ++t) // insertion sort
-                    for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
-                        tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
-                if ((sc[0]&0x3f) == ref_type) {
-                    indelQ2 = (sc[1]>>14) - (sc[0]>>14);
-                } else {
-                    for (t = 0; t < n_types; ++t) // look for the reference type
-                        if ((sc[t]&0x3f) == ref_type) break;
-                    indelQ2 = (sc[t]>>14) - (sc[0]>>14);
-                }
-                tmp = sc[0]>>6 & 0xff;
-                indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
-                // pick the smaller between indelQ1 and indelQ2
-                indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
-                if (indelQ > 255) indelQ = 255;
-                if (seqQ > 255) seqQ = 255;
-                p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
-                sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
-//              fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
-            }
-        }
-        // determine bca->indel_types[] and bca->inscns
-        bca->maxins = max_ins;
-        bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
-        for (t = 0; t < n_types; ++t)
-            sumq[t] = sumq[t]<<6 | t;
-        for (t = 1; t < n_types; ++t) // insertion sort
-            for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
-                tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
-        for (t = 0; t < n_types; ++t) // look for the reference type
-            if ((sumq[t]&0x3f) == ref_type) break;
-        if (t) { // then move the reference type to the first
-            tmp = sumq[t];
-            for (; t > 0; --t) sumq[t] = sumq[t-1];
-            sumq[0] = tmp;
-        }
-        for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
-        for (t = 0; t < 4 && t < n_types; ++t) {
-            bca->indel_types[t] = types[sumq[t]&0x3f];
-            memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
-        }
-        // update p->aux
-        for (s = n_alt = 0; s < n; ++s) {
-            for (i = 0; i < n_plp[s]; ++i) {
-                bam_pileup1_t *p = plp[s] + i;
-                int x = types[p->aux>>16&0x3f];
-                for (j = 0; j < 4; ++j)
-                    if (x == bca->indel_types[j]) break;
-                p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
-                if ((p->aux>>16&0x3f) > 0) ++n_alt;
-                //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
-            }
-        }
  
-        if (sc   != sc_a)   free(sc);
-        if (sumq != sumq_a) free(sumq);
-    }
-    free(score1); free(score2);
+    // compute indelQ
+    n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins,
+                                   ref_type, types, n_types, score);
+
      // free
-    for (i = 0; i < n; ++i) free(ref_sample[i]);
+    free(ref2);
+    free(query);
+    free(score);
+
+    for (i = 0; i < n; ++i)
+        free(ref_sample[i]);
+
      free(ref_sample);
      free(types); free(inscns);
+
      return n_alt > 0? 0 : -1;
  }
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h

index 96237eefb524053b01ef7a689025548e01828ba0..953cf6ba7033c71eca26b7c39b0ae87ae41dd02b 100644 (file)
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -1,6 +1,6 @@
  /*  bcftools.h -- utility function declarations.
  
-    Copyright (C) 2013 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -50,25 +50,40 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT
  
  void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
  const char *hts_bcf_wmode(int file_type);
+const char *hts_bcf_wmode2(int file_type, char *fname);
+char *init_tmp_prefix(const char *prefix);
  
  void *smalloc(size_t size);     // safe malloc
  
-static inline char gt2iupac(char a, char b)
+static inline int iupac2bitmask(char iupac)  // IUPAC nucleotide code -> bitmask of A|C|G|T, or -1 if the code is not recognised
  {
-    static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
-    if ( a>='a' ) a -= 'a' - 'A';
-    if ( b>='a' ) b -= 'a' - 'A';
-    if ( a=='A' ) a = 0;
-    else if ( a=='C' ) a = 1;
-    else if ( a=='G' ) a = 2;
-    else if ( a=='T' ) a = 3;
-    else return 'N';
-    if ( b=='A' ) b = 0;
-    else if ( b=='C' ) b = 1;
-    else if ( b=='G' ) b = 2;
-    else if ( b=='T' ) b = 3;
-    else return 'N';
-    return iupac[(int)a][(int)b];
+    const int A = 1;  // one distinct bit per base, so ambiguity codes are ORs of these four
+    const int C = 2;
+    const int G = 4;
+    const int T = 8;
+    if ( iupac >= 97 ) iupac -= 32;  // ASCII: 97 is 'a' and 32 is 'a'-'A', so lowercase letters are folded to uppercase
+    if ( iupac == 'A' ) return A;  // single bases
+    if ( iupac == 'C' ) return C;
+    if ( iupac == 'G' ) return G;
+    if ( iupac == 'T' ) return T;
+    if ( iupac == 'M' ) return A|C;  // two-base ambiguity codes
+    if ( iupac == 'R' ) return A|G;
+    if ( iupac == 'W' ) return A|T;
+    if ( iupac == 'S' ) return C|G;
+    if ( iupac == 'Y' ) return C|T;
+    if ( iupac == 'K' ) return G|T;
+    if ( iupac == 'V' ) return A|C|G;  // three-base ambiguity codes
+    if ( iupac == 'H' ) return A|C|T;
+    if ( iupac == 'D' ) return A|G|T;
+    if ( iupac == 'B' ) return C|G|T;
+    if ( iupac == 'N' ) return A|C|G|T;  // all four bits set
+    return -1;  // none of the comparisons above matched
+}
+static inline char bitmask2iupac(int bitmask)  // bitmask (A=1, C=2, G=4, T=8, OR-ed together) -> IUPAC letter, or 0 if outside 1..15
+{
+    const char iupac[16] = {'.','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N'};  // indexed directly by bitmask; entry 0 ('.') is never returned because of the range check below
+    if ( bitmask <= 0 || bitmask > 15 ) return 0;  // rejects the empty mask and anything that does not fit in four bits
+    return iupac[bitmask];
  }
  
  static inline int iupac_consistent(char iupac, char nt)
@@ -101,4 +116,24 @@ static inline double phred_score(double prob)
      return prob>99 ? 99 : prob;
  }
  
+static const uint64_t bcf_double_missing    = 0x7ff0000000000001;  // raw 64-bit pattern stored in a double to mark it as "missing"
+static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;  // raw 64-bit pattern stored in a double to mark "vector end"
+static inline void bcf_double_set(double *ptr, uint64_t value)  // overwrite *ptr with the exact bit pattern given in value
+{
+    union { uint64_t i; double d; } u;  // the union reinterprets the same 8 bytes as integer or double
+    u.i = value;
+    *ptr = u.d;
+}
+static inline int bcf_double_test(double d, uint64_t value)  // returns 1 if d holds exactly the bit pattern value, else 0
+{
+    union { uint64_t i; double d; } u;
+    u.d = d;
+    return u.i==value ? 1 : 0;  // integer comparison of the bits; NOTE(review): presumably needed because both sentinels are NaNs under IEEE-754 binary64 (NaN != NaN) - confirm
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)  // x must be an lvalue: its address is taken
+#define bcf_double_set_missing(x)    bcf_double_set(&(x),bcf_double_missing)  // x must be an lvalue: its address is taken
+#define bcf_double_is_vector_end(x)  bcf_double_test((x),bcf_double_vector_end)  // x is passed by value
+#define bcf_double_is_missing(x)     bcf_double_test((x),bcf_double_missing)  // x is passed by value
+#define bcf_double_is_missing_or_vector_end(x)     (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))  // x is expanded twice, so avoid arguments with side effects
+
  #endif
diff --git a/bcftools/bcftools.pysam.c b/bcftools/bcftools.pysam.c

index de8739d06fb89253a1832676a35bf85a48289b0d..c6f4fd84846fc3e8169b3e8c321b39fc4f7f00e4 100644 (file)
--- a/bcftools/bcftools.pysam.c
+++ b/bcftools/bcftools.pysam.c
@@ -1,6 +1,7 @@
  #include <ctype.h>
  #include <assert.h>
  #include <unistd.h>
+#include <setjmp.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
@@ -55,6 +56,25 @@ int bcftools_puts(const char *s)
    return putc('\n', bcftools_stdout);
  }
  
+
+static jmp_buf bcftools_jmpbuf;  // jump target saved by bcftools_dispatch() and used by bcftools_exit()
+static int bcftools_status = 0;  // status recorded by the most recent bcftools_exit() call
+
+int bcftools_dispatch(int argc, char *argv[])  // runs bcftools_main(); returns its result, or the bcftools_exit() status if that was called
+{
+  if (setjmp(bcftools_jmpbuf) == 0)  // 0 means the direct return from setjmp(), i.e. the first pass
+    return bcftools_main(argc, argv);
+  else  // non-zero: control arrived here through longjmp() in bcftools_exit()
+    return bcftools_status;
+}
+
+void bcftools_exit(int status)  // stores status and jumps back into bcftools_dispatch() instead of returning to the caller
+{
+  bcftools_status = status;  // must be set before the jump; bcftools_dispatch() returns this value
+  longjmp(bcftools_jmpbuf, 1);  // per the C standard this is only defined while the bcftools_dispatch() call that ran setjmp() is still active
+}
+
+
  void bcftools_set_optind(int val)
  {
    // setting this in cython via 
diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h

index 453567a54067d46fff33f0ad9962e840caca4292..b8bf93e5345ed8dd79e46e5087a0a36bcdb577cd 100644 (file)
--- a/bcftools/bcftools.pysam.h
+++ b/bcftools/bcftools.pysam.h
@@ -3,6 +3,17 @@
  
  #include <stdio.h>
  
+#ifndef __has_attribute  // compilers that do not provide __has_attribute get the fallback below
+#define __has_attribute(attribute) 0  // fallback: every attribute query evaluates to 0
+#endif
+#ifndef PYSAM_NORETURN  // keep any definition supplied before this header
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3  // attribute reported as supported, or __GNUC__ is at least 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN  // otherwise the marker expands to nothing
+#endif
+#endif
+
  extern FILE * bcftools_stderr;
  
  extern FILE * bcftools_stdout;
@@ -40,6 +51,8 @@ int bcftools_puts(const char *s);
  
  int bcftools_dispatch(int argc, char *argv[]);
  
+void PYSAM_NORETURN bcftools_exit(int status);
+
  void bcftools_set_optind(int);
  
  extern int bcftools_main(int argc, char *argv[]);
diff --git a/bcftools/bin.c b/bcftools/bin.c

index 95a2be156b33f25d6ae4a5603ec9f0a777376593..a4817cf45e293bbc4292cf0301ccd30e1fe0b660 100644 (file)
--- a/bcftools/bin.c
+++ b/bcftools/bin.c
@@ -25,6 +25,7 @@
   */
  
  #include <stdio.h>
+#include <assert.h>
  #include "bcftools.h"
  #include "bin.h"
  
diff --git a/bcftools/bin.c.pysam.c b/bcftools/bin.c.pysam.c

index 426ef455515765e7dd40f5cad17bf5b17194722d..1a177be400b86770294ba002cee5e5f376673b18 100644 (file)
--- a/bcftools/bin.c.pysam.c
+++ b/bcftools/bin.c.pysam.c
@@ -27,6 +27,7 @@
   */
  
  #include <stdio.h>
+#include <assert.h>
  #include "bcftools.h"
  #include "bin.h"
  
diff --git a/bcftools/call.h b/bcftools/call.h

index 50e4815ab752c00392d799cd789e2fcf3d484e3a..16bf0b68e558064e75a94b37f1ca8e678f7adf3d 100644 (file)
--- a/bcftools/call.h
+++ b/bcftools/call.h
@@ -1,6 +1,6 @@
  /*  call.h -- variant calling declarations.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2015, 2019-2020 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -34,7 +34,7 @@ THE SOFTWARE.  */
  #define CALL_CONSTR_TRIO    (1<<2)
  #define CALL_CONSTR_ALLELES (1<<3)
  //
-//
+#define CALL_FMT_PV4        (1<<5)
  #define CALL_FMT_GQ         (1<<6)
  #define CALL_FMT_GP         (1<<7)
  
@@ -52,18 +52,13 @@ family_t;
  // For the single-sample and grouped -G calling
  typedef struct
  {
+    double ref_lk, max_lk, lk_sum;
      float *qsum;    // QS(quality sum) values
-    int nqsum, dp;
-    double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc;
-}
-grp1_t;
-typedef struct
-{
-    grp1_t *grp;
-    int ngrp;
-    int *smpl2grp;
+    int nqsum;
+    uint32_t *smpl, nsmpl;
+    uint32_t nals, als;
  }
-grp_t;
+smpl_grp_t;
  
  // For the `-C alleles -i` constrained calling
  typedef struct
@@ -82,6 +77,7 @@ typedef struct
      int *pl_map, npl_map;   // same as above for PLs, but reverse (new -> old)
      char **als;             // array to hold the trimmed set of alleles to appear on output
      int nals;               // size of the als array
+    int als_new, nals_new;  // bitmask with final alleles and their number
      family_t *fams;         // list of families and samples for trio calling
      int nfams, mfams;
      int ntrio[5][5];        // possible trio genotype combinations and their counts; first idx:
@@ -96,18 +92,16 @@ typedef struct
      int32_t *ugts, *cgts;   // unconstraind and constrained GTs
      uint32_t output_tags;
      char *prior_AN, *prior_AC;  // reference panel AF tags (AF=AC/AN)
-    tgt_als_t *tgt_als;     // for CALL_CONSTR_ALLELES
-    char *sample_groups;    // for single-sample or grouped calling with -G
-    grp_t smpl_grp;
-    float *qsum;
-    int nqsum;
+    tgt_als_t *tgt_als;         // for CALL_CONSTR_ALLELES
+    char *sample_groups;        // for single-sample or grouped calling with -G
+    char *sample_groups_tag;    // for -G [AD|QS:]
+    smpl_grp_t *smpl_grp;
+    int nsmpl_grp;
  
      // ccall only
      double indel_frac, min_perm_p, min_lrt;
      double prior_type, pref;
-    double ref_lk, lk_sum;
      int ngrp1_samples, n_perm;
-    int nhets, ndiploid;
      char *prior_file;
      ccall_t *cdat;
  
@@ -149,7 +143,7 @@ void qcall_destroy(call_t *call);
  void call_init_pl2p(call_t *call);
  uint32_t *call_trio_prep(int is_x, int is_son);
  
-void init_allele_trimming_maps(call_t *call, int als, int nals);
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als);
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out);
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new);
  
  #endif
diff --git a/bcftools/ccall.c b/bcftools/ccall.c

index 9f6958ac437391005953ed017a85df0b81bd2558..6bf987b69519164578fed570dd57d9390c9c3e9d 100644 (file)
--- a/bcftools/ccall.c
+++ b/bcftools/ccall.c
@@ -24,6 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.  */
  
  #include <math.h>
+#include <assert.h>
  #include <htslib/kfunc.h>
  #include "call.h"
  #include "kmin.h"
@@ -302,8 +303,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
      // trim Number=R tags
      int out_als = 0;
      for (i=0; i<nals; i++) out_als |= 1<<i;
-    init_allele_trimming_maps(call, out_als, nals_ori);
-    mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+    init_allele_trimming_maps(call, nals_ori, out_als);
+    mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
  
      return is_var;
  }
diff --git a/bcftools/ccall.c.pysam.c b/bcftools/ccall.c.pysam.c

index 696b455db9c4922925dee6dd169b12979246399b..eb7c615817e20229bb908cb758aa8689552776d4 100644 (file)
--- a/bcftools/ccall.c.pysam.c
+++ b/bcftools/ccall.c.pysam.c
@@ -26,6 +26,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.  */
  
  #include <math.h>
+#include <assert.h>
  #include <htslib/kfunc.h>
  #include "call.h"
  #include "kmin.h"
@@ -304,8 +305,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
      // trim Number=R tags
      int out_als = 0;
      for (i=0; i<nals; i++) out_als |= 1<<i;
-    init_allele_trimming_maps(call, out_als, nals_ori);
-    mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+    init_allele_trimming_maps(call, nals_ori, out_als);
+    mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
  
      return is_var;
  }
diff --git a/bcftools/consensus.c b/bcftools/consensus.c

index 4652a39a89be3f2a06f2b8bdf0773796f0d72b23..a232174c8e33ca938a37c42110518a6ee06d9032 100644 (file)
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2014-2017 Genome Research Ltd.
+   Copyright (c) 2014-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -28,6 +28,7 @@
  #include <stdlib.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <getopt.h>
  #include <unistd.h>
@@ -52,6 +53,9 @@
  #define PICK_SHORT 8
  #define PICK_IUPAC 16
  
+#define TO_UPPER 0
+#define TO_LOWER 1
+
  typedef struct
  {
      int num;                // number of ungapped blocks in this chain
@@ -64,6 +68,16 @@ typedef struct
  }
  chain_t;
  
+#define MASK_LC 1   // mask_t.with value: convert the masked region to lowercase
+#define MASK_UC 2   // mask_t.with value: convert the masked region to uppercase
+#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)   // 1 if the mask replaces bases with a character (overlapping variants are then skipped), 0 for lc/uc masks
+typedef struct      // one mask file and the way its regions are applied
+{
+    char *fname, with;  // fname: regions file passed to regidx_init(); with: MASK_LC, MASK_UC, or the replacement character
+    regidx_t *idx;      // region index, created in init_data()
+    regitr_t *itr;      // iterator over idx, created in init_data()
+}
+mask_t;
  
  typedef struct
  {
@@ -71,9 +85,10 @@ typedef struct
      int fa_ori_pos;     // start position of the fa_buffer (wrt original sequence)
      int fa_frz_pos;     // protected position to avoid conflicting variants (last pos for SNPs/ins)
      int fa_mod_off;     // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+    int fa_frz_mod;     // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
      int fa_end_pos;     // region's end position in the original sequence
      int fa_length;      // region's length in the original sequence (in case end_pos not provided in the FASTA header)
-    int fa_case;        // output upper case or lower case?
+    int fa_case;        // output upper case or lower case: TO_UPPER|TO_LOWER
      int fa_src_pos;     // last genomic coordinate read from the input fasta (0-based)
      char prev_base;     // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
      int prev_base_pos;  // the position of prev_base
@@ -84,8 +99,8 @@ typedef struct
      int nvcf_buf, rid;
      char *chr, *chr_prefix;
  
-    regidx_t *mask;
-    regitr_t *itr;
+    mask_t *mask;
+    int nmask;
  
      int chain_id;       // chain_id, to provide a unique ID to each chain in the chain output
      chain_t *chain;     // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
@@ -101,7 +116,10 @@ typedef struct
      FILE *fp_chain;
      char **argv;
      int argc, output_iupac, haplotype, allele, isample, napplied;
-    char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
+    uint8_t *iupac_bitmask;
+    int miupac_bitmask;
+    char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+    char mark_del, mark_ins, mark_snv;
  }
  args_t;
  
@@ -182,7 +200,7 @@ static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_s
  //     fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
      int num = chain->num;
  
-    if (ref_start <= chain->ref_last_block_ori) {
+    if (num && ref_start <= chain->ref_last_block_ori) {
          // In case this variant is back-to-back with the previous one
          chain->ref_last_block_ori = ref_start + ref_len;
          chain->alt_last_block_ori = alt_start + alt_len;
@@ -222,11 +240,13 @@ static void init_data(args_t *args)
          if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
          args->isample = 0;
      }
-    if ( args->mask_fname )
+    int i;
+    for (i=0; i<args->nmask; i++)
      {
-        args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
-        if ( !args->mask ) error("Failed to initialize mask regions\n");
-        args->itr = regitr_init(args->mask);
+        mask_t *mask = &args->mask[i];
+        mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
+        if ( !mask->idx ) error("Failed to initialize mask regions\n");
+        mask->itr = regitr_init(mask->idx);
      }
      // In case we want to store the chains
      if ( args->chain_fname )
@@ -245,10 +265,28 @@ static void init_data(args_t *args)
      if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
      if ( args->filter_str )
          args->filter = filter_init(args->hdr, args->filter_str);
+    args->rid = -1;
+}
+// Append a new mask entry for the regions file `fname` to args->mask.
+// The entry defaults to replacing masked bases with 'N'; add_mask_with() can
+// change this for the most recently added entry. The region index and its
+// iterator are created later in init_data(), until then they are set to NULL
+// so that the entry never carries indeterminate pointers.
+static void add_mask(args_t *args, char *fname)
+{
+    // Grow via a temporary pointer so that the existing array is not lost when realloc fails
+    mask_t *tmp = (mask_t*)realloc(args->mask,(args->nmask+1)*sizeof(*args->mask));
+    if ( !tmp ) error("Failed to allocate memory for the mask file %s\n", fname);
+    args->mask = tmp;
+    args->nmask++;
+
+    mask_t *mask = &args->mask[args->nmask-1];
+    mask->fname = fname;    // not copied, the caller must keep the string alive
+    mask->with  = 'N';
+    mask->idx   = NULL;
+    mask->itr   = NULL;
+}
+// Set how the most recently added mask is applied. `with` is either "uc" or
+// "lc" (case-insensitive), selecting MASK_UC or MASK_LC, or a single character
+// that is stored as the replacement. Anything else is reported via error(),
+// as is the case when no mask has been added yet.
+static void add_mask_with(args_t *args, char *with)
+{
+    if ( args->nmask==0 )
+        error("The --mask-with option must follow --mask\n");
+
+    mask_t *last = args->mask + (args->nmask - 1);
+    if ( strcasecmp(with,"uc")==0 )
+    {
+        last->with = MASK_UC;
+        return;
+    }
+    if ( strcasecmp(with,"lc")==0 )
+    {
+        last->with = MASK_LC;
+        return;
+    }
+
+    // neither keyword: only a one-character replacement is accepted
+    if ( strlen(with)==1 )
+        last->with = with[0];
+    else
+        error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");
+}
-
  static void destroy_data(args_t *args)
  {
+    free(args->iupac_bitmask);
      if (args->filter) filter_destroy(args->filter);
      bcf_sr_destroy(args->files);
      int i;
@@ -257,8 +295,13 @@ static void destroy_data(args_t *args)
      free(args->vcf_buf);
      free(args->fa_buf.s);
      free(args->chr);
-    if ( args->mask ) regidx_destroy(args->mask);
-    if ( args->itr ) regitr_destroy(args->itr);
+    for (i=0; i<args->nmask; i++)
+    {
+        mask_t *mask = &args->mask[i];
+        regidx_destroy(mask->idx);
+        regitr_destroy(mask->itr);
+    }
+    free(args->mask);
      if ( args->chain_fname )
          if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
      if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
@@ -297,6 +340,7 @@ static void init_region(args_t *args, char *line)
      args->fa_src_pos = from;
      args->fa_mod_off = 0;
      args->fa_frz_pos = -1;
+    args->fa_frz_mod = -1;
      args->fa_case    = -1;
      args->vcf_rbuf.n = 0;
      bcf_sr_seek(args->files,line,args->fa_ori_pos);
@@ -345,7 +389,6 @@ static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
  static void flush_fa_buffer(args_t *args, int len)
  {
      if ( !args->fa_buf.l ) return;
-
      int nwr = 0;
      while ( nwr + 60 <= args->fa_buf.l )
      {
@@ -356,6 +399,8 @@ static void flush_fa_buffer(args_t *args, int len)
      if ( nwr )
          args->fa_ori_pos += nwr;
  
+    args->fa_frz_mod -= nwr;
+
      if ( len )
      {
          // not finished on this chr yet and the buffer cannot be emptied completely
@@ -375,21 +420,84 @@ static void flush_fa_buffer(args_t *args, int len)
      args->fa_mod_off = 0;
      args->fa_buf.l = 0;
  }
+// Overwrite buffered bases with args->absent_allele, starting at the fa_buf
+// offset args->fa_frz_mod (or 0 if it is negative) and ending just before the
+// fa_buf offset that corresponds to `pos`. The end is capped at the buffer
+// length; pos==0 means "up to the end of the buffer". Does nothing when the
+// buffer is empty or when `pos` does not lie beyond both args->fa_frz_pos+1
+// and args->fa_ori_pos.
+static void apply_absent(args_t *args, hts_pos_t pos)
+{
+    if ( !args->fa_buf.l ) return;
+    if ( pos <= args->fa_frz_pos + 1 ) return;
+    if ( pos <= args->fa_ori_pos ) return;
+
+    // translate pos to an offset in the (possibly modified) buffer, if it falls inside
+    int end_off = args->fa_buf.l;
+    if ( pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l )
+        end_off = pos - args->fa_ori_pos + args->fa_mod_off;
+
+    int off = args->fa_frz_mod;
+    if ( off < 0 ) off = 0;
+
+    while ( off < end_off )
+    {
+        args->fa_buf.s[off] = args->absent_allele;
+        off++;
+    }
+}
+// Protect the reference stretch covered by `rec`: move args->fa_frz_pos to the
+// last position of the record and args->fa_frz_mod to the fa_buf offset right
+// after it. The frozen position is only ever moved forward.
+static void freeze_ref(args_t *args, bcf1_t *rec)
+{
+    if ( args->fa_frz_pos < rec->pos + rec->rlen - 1 )
+    {
+        args->fa_frz_pos = rec->pos + rec->rlen - 1;
+        args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;
+    }
+}
+// Build the sequence that replaces a deletion when deleted bases are to be
+// marked rather than removed. The result starts with `alt`, or with `ref` when
+// alt is NULL (the symbolic <DEL> allele), and is padded with the character
+// `mark` up to the length `rlen`.
+//
+// Returns a newly allocated NUL-terminated string of exactly rlen characters,
+// the caller is responsible for freeing it.
+static char *mark_del(char *ref, int rlen, char *alt, int mark)
+{
+    char *out = malloc(rlen+1);
+    if ( !out ) error("Failed to allocate %d bytes\n", rlen+1);
+
+    char *src = alt ? alt : ref;
+    int nsrc  = strlen(src);
+
+    // apply_variant() may shorten rec->rlen at the end of the fasta buffer while
+    // leaving REF untouched, never copy more than fits into the rlen-sized output
+    if ( nsrc > rlen ) nsrc = rlen;
+
+    memcpy(out, src, nsrc);
+    memset(out + nsrc, mark, rlen - nsrc);
+    out[rlen] = 0;
+    return out;
+}
+// Highlight the inserted part of `alt` in place: the bases beyond the length
+// of `ref` are converted to lowercase when mark is 'l' and to uppercase
+// otherwise. The leading strlen(ref) bases are left as they are, and nothing
+// is changed when alt is not longer than ref.
+static void mark_ins(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+
+    // tolower/toupper are only defined for values representable as unsigned char
+    // (or EOF); plain char can be negative for bytes >= 0x80, hence the casts
+    if ( mark=='l' )
+        for (i=nref; i<nalt; i++) alt[i] = tolower((unsigned char)alt[i]);
+    else
+        for (i=nref; i<nalt; i++) alt[i] = toupper((unsigned char)alt[i]);
+}
+// Highlight substitutions in `alt` in place: within the part shared with
+// `ref` (the shorter of the two lengths), every base that differs from the
+// reference base regardless of case is converted to lowercase when mark is 'l'
+// and to uppercase otherwise. Matching bases and any bases beyond the shared
+// length are left as they are.
+static void mark_snv(char *ref, char *alt, char mark)
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    int n = nref < nalt ? nref : nalt;
+    for (i=0; i<n; i++)
+    {
+        // tolower/toupper are only defined for values representable as unsigned
+        // char (or EOF); plain char can be negative for bytes >= 0x80
+        unsigned char rbase = ref[i], abase = alt[i];
+        if ( tolower(rbase)==tolower(abase) ) continue;
+        alt[i] = mark=='l' ? tolower(abase) : toupper(abase);
+    }
+}
  static void apply_variant(args_t *args, bcf1_t *rec)
  {
      static int warned_haplotype = 0;
  
-    if ( rec->n_allele==1 && !args->missing_allele ) return;
+    if ( args->absent_allele ) apply_absent(args, rec->pos);
+    if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
  
+    int i,j;
      if ( args->mask )
      {
          char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
          int start = rec->pos;
          int end   = rec->pos + rec->rlen - 1;
-        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+        for (i=0; i<args->nmask; i++)
+        {
+            mask_t *mask = &args->mask[i];
+            if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
+        }
      }
  
-    int i, ialt = 1;    // the alternate allele
+    int ialt = 1;    // the alternate allele
      if ( args->isample >= 0 )
      {
          bcf_unpack(rec, BCF_UN_FMT);
@@ -403,6 +511,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          enum { use_hap, use_iupac, pick_one } action = use_hap;
          if ( args->allele==PICK_IUPAC )
          {
+            if ( !args->haplotype ) action = use_iupac;
              if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
          }
          else if ( args->output_iupac ) action = use_iupac;
@@ -441,41 +550,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          }
          else if ( action==use_iupac ) 
          {
-            ialt = ptr[0];
-            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
+            ialt = -1;
+            int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
+            for (i=0; i<fmt->n; i++)
              {
-                if ( !args->missing_allele ) return;
-                ialt = -1;
-            }
-            else
-                ialt = bcf_gt_allele(ialt);
+                if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
+                if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
+                int jalt = bcf_gt_allele(ptr[i]);
+                if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+                if ( fallback_alt <= 0 ) fallback_alt = jalt;
  
-            int jalt;
-            if ( fmt->n>1 )
-            {
-                jalt = ptr[1];
-                if ( bcf_gt_is_missing(jalt) )
+                int l = strlen(rec->d.allele[jalt]);
+                for (j=0; j<l; j++)
+                    if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
+                if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+                if ( l > mlen )
                  {
-                    if ( !args->missing_allele ) return;
-                    ialt = -1;
+                    hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+                    for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+                    mlen = l;
                  }
-                else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
-                else
-                    jalt = bcf_gt_allele(jalt);
-            }
-            else jalt = ialt;
-
-            if ( ialt>=0 )
-            {
-                if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-                if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+                if ( jalt>0 && l>alen )
                  {
-                    char ial = rec->d.allele[ialt][0];
-                    char jal = rec->d.allele[jalt][0];
-                    if ( !ialt ) ialt = jalt;   // only ialt is used, make sure 0/1 is not ignored
-                    rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+                    alen = l;
+                    ialt = jalt;
                  }
+                for (j=0; j<l; j++)
+                    args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
              }
+            if ( alen > 0 )
+                for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+            else if ( fallback_alt >= 0 )
+                ialt = fallback_alt;
+            else if ( is_missing && !args->missing_allele ) return;
          }
          else
          {
@@ -520,17 +628,50 @@ static void apply_variant(args_t *args, bcf1_t *rec)
                  }
              }
          }
-        if ( !ialt ) return;  // ref allele
+        if ( !ialt )
+        {
+            // ref allele
+            if ( args->absent_allele ) freeze_ref(args,rec);
+            return;
+        }
          if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
      }
-    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+    else if ( args->output_iupac && rec->n_allele>1 )
      {
-        char ial = rec->d.allele[0][0];
-        char jal = rec->d.allele[1][0];
-        rec->d.allele[1][0] = gt2iupac(ial,jal);
+        int ialt, alen = 0, mlen = 0;
+        for (i=0; i<rec->n_allele; i++)
+        {
+            int l = strlen(rec->d.allele[i]);
+            for (j=0; j<l; j++)
+                if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+            if ( j<l ) continue;    // symbolic allele, breakpoint or invalid character in the allele
+
+            if ( l > mlen )
+            {
+                hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+                for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+                mlen = l;
+            }
+            if ( i>0 && l>alen )
+            {
+                alen = l;
+                ialt = i;
+            }
+            for (j=0; j<l; j++)
+                args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+        }
+        if ( alen > 0 )
+            for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+        else
+            ialt = 1;
      }
  
-    if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
+    if ( rec->n_allele==1 && ialt!=-1 )
+    {
+        // non-missing reference
+        if ( args->absent_allele ) freeze_ref(args,rec);
+        return;
+    }
      if ( ialt==-1 )
      {
          char alleles[4];
@@ -542,15 +683,34 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          ialt = 1;
      }
  
+    // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
+    int trim_beg = 0;
+    int var_type = bcf_get_variant_type(rec,ialt);
+    int var_len  = rec->d.var[ialt].n;
+    if ( var_type & VCF_INDEL )
+    {
+        // normally indel starts one base after, but not if the first base of the fa reference is deleted
+        if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
+            trim_beg = 1;
+        else
+            trim_beg = 0;
+    }
+    else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
+    {
+        trim_beg = 1;
+        var_len  = 1 - rec->rlen;
+    }
+    else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
+
      // Overlapping variant?
      if ( rec->pos <= args->fa_frz_pos )
      {
          // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
          // This still may not be enough for more complicated cases with multiple duplicate positions
          // and other types in between. In such case let the user normalize the VCF and remove duplicates.
+
          int overlap = 0;
-        if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1;
-        else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1;
+        if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
  
          if ( overlap )
          {
@@ -560,6 +720,9 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          
      }
  
+    char *alt_allele = rec->d.allele[ialt];
+    int rmme_alt = 0;
+
      int len_diff = 0, alen = 0;
      int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
      if ( idx<0 )
@@ -570,10 +733,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
      if ( rec->rlen > args->fa_buf.l - idx )
      {
          rec->rlen = args->fa_buf.l - idx;
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          if ( alen > rec->rlen )
          {
-            rec->d.allele[ialt][rec->rlen] = 0;
+            alt_allele[rec->rlen] = 0;
              fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
          }
      }
@@ -581,14 +744,44 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
  
      // sanity check the reference base
-    if ( rec->d.allele[ialt][0]=='<' )
+    if ( alt_allele[0]=='<' )
      {
-        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
-            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
-        len_diff = 1-rec->rlen;
-        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
-        alen = strlen(rec->d.allele[ialt]);
+        // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
+
+        if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
+            error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
+                  "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
+                alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+        if ( !strcasecmp(alt_allele,"<DEL>") )
+        {
+            static int multibase_ref_del_warned = 0;
+            if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+            {
+                fprintf(stderr,
+                    "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
+                    "         (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+                multibase_ref_del_warned = 1;
+            }
+            if ( args->mark_del )   // insert dashes instead of delete sequence
+            {
+                alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+                alen = rec->rlen;
+                len_diff = 0;
+                rmme_alt = 1;
+            }
+            else
+            {
+                len_diff = 1-rec->rlen;
+                alt_allele = rec->d.allele[0];     // according to VCF spec, the first REF base must precede the event
+                alen = 1;
+            }
+        }
+        else
+        {
+            // <*>  or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
+            freeze_ref(args,rec);
+            return;
+        }
      }
      else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
      {
@@ -614,39 +807,63 @@ static void apply_variant(args_t *args, bcf1_t *rec)
              }
              error(
                      "The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
-                    "   .vcf: [%s] <- (REF)\n" 
-                    "   .vcf: [%s] <- (ALT)\n" 
-                    "   .fa:  [%s]%c%s\n",
-                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+                    "   REF .vcf: [%s]\n"
+                    "   ALT .vcf: [%s]\n"
+                    "   REF .fa : [%s]%c%s\n",
+                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
                      tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
                   );
          }
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          len_diff = alen - rec->rlen;
+
+        if ( args->mark_del && len_diff<0 ) 
+        {
+            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alen = rec->rlen;
+            len_diff = 0;
+            rmme_alt = 1;
+        }
      }
      else
      {
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          len_diff = alen - rec->rlen;
+
+        if ( args->mark_del && len_diff<0 ) 
+        {
+            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alen = rec->rlen;
+            len_diff = 0;
+            rmme_alt = 1;
+        }
      }
  
-    if ( args->fa_case )
-        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+    args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+    if ( args->fa_case==TO_UPPER )
+        for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
      else
-        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+        for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
+
+    if ( args->mark_ins && len_diff>0 )
+        mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+    if ( args->mark_snv )
+        mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
  
      if ( len_diff <= 0 )
      {
          // deletion or same size event
-        for (i=0; i<alen; i++)
-            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+        assert( args->fa_buf.l >= idx+rec->rlen );
+        args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
+        args->prev_base_pos = rec->pos + rec->rlen - 1;
+        args->prev_is_insert = 0;
+        args->fa_frz_mod = idx + alen;
+
+        for (i=trim_beg; i<alen; i++)
+            args->fa_buf.s[idx+i] = alt_allele[i];
  
          if ( len_diff )
              memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
-
-        args->prev_base = rec->d.allele[0][rec->rlen - 1];
-        args->prev_base_pos = rec->pos + rec->rlen - 1;
-        args->prev_is_insert = 0;
      }
      else
      {
@@ -663,14 +880,16 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          //      1   C   T
          //      1   C   CAA
          int ibeg = 0;
-        while ( ibeg<alen && rec->d.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
+        while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
          for (i=ibeg; i<alen; i++)
-            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+            args->fa_buf.s[idx+i] = alt_allele[i];
+
+        args->fa_frz_mod = idx + alen - ibeg + 1;
      }
      if (args->chain && len_diff != 0)
      {
          // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
-        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+        if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
          {
              // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
              push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
@@ -685,6 +904,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
      args->fa_mod_off += len_diff;
      args->fa_frz_pos  = rec->pos + rec->rlen - 1;
      args->napplied++;
+    if ( rmme_alt ) free(alt_allele);
  }
  
  
@@ -692,17 +912,27 @@ static void mask_region(args_t *args, char *seq, int len)
  {
      int start = args->fa_src_pos - len;
      int end   = args->fa_src_pos;
+    int i;
  
-    if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
-
-    int idx_start, idx_end, i;
-    while ( regitr_overlap(args->itr) )
+    for (i=0; i<args->nmask; i++)
      {
-        idx_start = args->itr->beg - start;
-        idx_end   = args->itr->end - start;
-        if ( idx_start < 0 ) idx_start = 0;
-        if ( idx_end >= len ) idx_end = len - 1;
-        for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+        mask_t *mask = &args->mask[i];
+        if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
+
+        int idx_start, idx_end, j;
+        while ( regitr_overlap(mask->itr) )
+        {
+            idx_start = mask->itr->beg - start;
+            idx_end   = mask->itr->end - start;
+            if ( idx_start < 0 ) idx_start = 0;
+            if ( idx_end >= len ) idx_end = len - 1;
+            if ( mask->with==MASK_UC )
+                for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
+            else if ( mask->with==MASK_LC )
+                for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
+            else
+                for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
+        }
      }
  }
  
@@ -720,13 +950,20 @@ static void consensus(args_t *args)
                  print_chain(args);
                  destroy_chain(args);
              }
-            // apply all cached variants
-            while ( args->vcf_rbuf.n )
+            // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
+            bcf1_t **rec_ptr = NULL;
+            while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
              {
-                bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+                bcf1_t *rec = *rec_ptr;
                  if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
-                int i = rbuf_shift(&args->vcf_rbuf);
-                apply_variant(args, args->vcf_buf[i]);
+                apply_variant(args, rec);
+            }
+            if ( args->absent_allele )
+            {
+                int pos = 0;
+                if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
+                    pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
+                apply_absent(args, pos);
              }
              flush_fa_buffer(args, 0);
              init_region(args, str.s+1);
@@ -771,7 +1008,11 @@ static void consensus(args_t *args)
              }
              apply_variant(args, rec);
          }
-        if ( !rec_ptr ) flush_fa_buffer(args, 60);
+        if ( !rec_ptr )
+        {
+            if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
+            flush_fa_buffer(args, 60);
+        }
      }
      bcf1_t **rec_ptr = NULL;
      while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
@@ -787,6 +1028,7 @@ static void consensus(args_t *args)
          print_chain(args);
          destroy_chain(args);
      }
+    if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
      flush_fa_buffer(args, 0);
      bgzf_close(fasta);
      free(str.s);
@@ -801,27 +1043,33 @@ static void usage(args_t *args)
      fprintf(stderr, "       --sample (and, optionally, --haplotype) option will apply genotype\n");
      fprintf(stderr, "       (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
      fprintf(stderr, "       information, such as INFO/AD or FORMAT/AD.\n");
-    fprintf(stderr, "Usage:   bcftools consensus [OPTIONS] <file.vcf.gz>\n");
+    fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -c, --chain <file>         write a chain file for liftover\n");
-    fprintf(stderr, "    -e, --exclude <expr>       exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "    -f, --fasta-ref <file>     reference sequence in fasta format\n");
-    fprintf(stderr, "    -H, --haplotype <which>    choose which allele to use from the FORMAT/GT field, note\n");
-    fprintf(stderr, "                               the codes are case-insensitive:\n");
-    fprintf(stderr, "                                   1: first allele from GT, regardless of phasing\n");
-    fprintf(stderr, "                                   2: second allele from GT, regardless of phasing\n");
-    fprintf(stderr, "                                   R: REF allele in het genotypes\n");
-    fprintf(stderr, "                                   A: ALT allele\n");
-    fprintf(stderr, "                                   LR,LA: longer allele and REF/ALT if equal length\n");
-    fprintf(stderr, "                                   SR,SA: shorter allele and REF/ALT if equal length\n");
-    fprintf(stderr, "                                   1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
-    fprintf(stderr, "    -i, --include <expr>       select sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "    -I, --iupac-codes          output variants in the form of IUPAC ambiguity codes\n");
-    fprintf(stderr, "    -m, --mask <file>          replace regions with N\n");
-    fprintf(stderr, "    -M, --missing <char>       output <char> instead of skipping the missing genotypes\n");
-    fprintf(stderr, "    -o, --output <file>        write output to a file [standard output]\n");
-    fprintf(stderr, "    -p, --prefix <string>      prefix to add to output sequence names\n");
-    fprintf(stderr, "    -s, --sample <name>        apply variants of the given sample\n");
+    fprintf(stderr, "    -c, --chain FILE               write a chain file for liftover\n");
+    fprintf(stderr, "    -a, --absent CHAR              replace positions absent from VCF with CHAR\n");
+    fprintf(stderr, "    -e, --exclude EXPR             exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "    -f, --fasta-ref FILE           reference sequence in fasta format\n");
+    fprintf(stderr, "    -H, --haplotype WHICH          choose which allele to use from the FORMAT/GT field, note\n");
+    fprintf(stderr, "                                   the codes are case-insensitive:\n");
+    fprintf(stderr, "                                       1: first allele from GT, regardless of phasing\n");
+    fprintf(stderr, "                                       2: second allele from GT, regardless of phasing\n");
+    fprintf(stderr, "                                       R: REF allele in het genotypes\n");
+    fprintf(stderr, "                                       A: ALT allele\n");
+    fprintf(stderr, "                                       I: IUPAC code for all genotypes\n");
+    fprintf(stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
+    fprintf(stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
+    fprintf(stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+    fprintf(stderr, "    -i, --include EXPR             select sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "    -I, --iupac-codes              output variants in the form of IUPAC ambiguity codes\n");
+    fprintf(stderr, "        --mark-del CHAR            instead of removing sequence, insert CHAR for deletions\n");
+    fprintf(stderr, "        --mark-ins uc|lc           highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(stderr, "        --mark-snv uc|lc           highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(stderr, "    -m, --mask FILE                replace regions according to the next --mask-with option. The default is --mask-with N\n");
+    fprintf(stderr, "        --mask-with CHAR|uc|lc     replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+    fprintf(stderr, "    -M, --missing CHAR             output CHAR instead of skipping a missing genotype \"./.\"\n");
+    fprintf(stderr, "    -o, --output FILE              write output to a file [standard output]\n");
+    fprintf(stderr, "    -p, --prefix STRING            prefix to add to output sequence names\n");
+    fprintf(stderr, "    -s, --sample NAME              apply variants of the given sample\n");
      fprintf(stderr, "Examples:\n");
      fprintf(stderr, "   # Get the consensus for one region. The fasta header lines are then expected\n");
      fprintf(stderr, "   # in the form \">chr:from-to\".\n");
@@ -837,6 +1085,10 @@ int main_consensus(int argc, char *argv[])
  
      static struct option loptions[] = 
      {
+        {"mark-del",required_argument,NULL,1},
+        {"mark-ins",required_argument,NULL,2},
+        {"mark-snv",required_argument,NULL,3},
+        {"mask-with",1,0,4},
          {"exclude",required_argument,NULL,'e'},
          {"include",required_argument,NULL,'i'},
          {"sample",1,0,'s'},
@@ -846,23 +1098,44 @@ int main_consensus(int argc, char *argv[])
          {"fasta-ref",1,0,'f'},
          {"mask",1,0,'m'},
          {"missing",1,0,'M'},
+        {"absent",1,0,'a'},
          {"chain",1,0,'c'},
          {"prefix",required_argument,0,'p'},
          {0,0,0,0}
      };
      int c;
-    while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) 
+    while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
      {
          switch (c) 
          {
+            case  1 : args->mark_del = optarg[0]; break;
+            case  2 :
+                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+                else error("The argument is not recognised: --mark-ins %s\n",optarg);
+                break;
+            case  3 :
+                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+                else error("The argument is not recognised: --mark-snv %s\n",optarg);
+                break;
              case 'p': args->chr_prefix = optarg; break;
              case 's': args->sample = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 'I': args->output_iupac = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e': 
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i': 
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'f': args->ref_fname = optarg; break;
-            case 'm': args->mask_fname = optarg; break;
+            case 'm': add_mask(args,optarg); break;
+            case  4 : add_mask_with(args,optarg); break;
+            case 'a':
+                args->absent_allele = optarg[0];
+                if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
+                break;
              case 'M': 
                  args->missing_allele = optarg[0]; 
                  if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
@@ -877,6 +1150,7 @@ int main_consensus(int argc, char *argv[])
                  else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
                  else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
                  else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+                else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
                  else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
                  else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
                  else
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c

index b1b186137f6c5050da1897b14e3fa5b2694d6833..5105a2e10bda0e4b63c7bf9e470db0a88126e951 100644 (file)
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* The MIT License
  
-   Copyright (c) 2014-2017 Genome Research Ltd.
+   Copyright (c) 2014-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -30,6 +30,7 @@
  #include <stdlib.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <getopt.h>
  #include <unistd.h>
@@ -54,6 +55,9 @@
  #define PICK_SHORT 8
  #define PICK_IUPAC 16
  
+#define TO_UPPER 0  // fa_case value: apply_variant upper-cases the allele before writing it into fa_buf
+#define TO_LOWER 1  // fa_case value: apply_variant lower-cases the allele before writing it into fa_buf
+
  typedef struct
  {
      int num;                // number of ungapped blocks in this chain
@@ -66,6 +70,16 @@ typedef struct
  }
  chain_t;
  
+#define MASK_LC 1  // mask_t.with code: masked regions are converted to lower case (--mask-with lc)
+#define MASK_UC 2  // mask_t.with code: masked regions are converted to upper case (--mask-with uc)
+#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0)  // 1 if the mask substitutes a character rather than changing case; apply_variant skips variants overlapping such masks
+typedef struct  // one --mask file together with its replacement mode
+{
+    char *fname, with;  // fname: mask file name as given on the command line; with: replacement character, or MASK_LC/MASK_UC
+    regidx_t *idx;      // region index, created from fname by regidx_init() in init_data()
+    regitr_t *itr;      // iterator over idx, created by regitr_init() in init_data()
+}
+mask_t;
  
  typedef struct
  {
@@ -73,9 +87,10 @@ typedef struct
      int fa_ori_pos;     // start position of the fa_buffer (wrt original sequence)
      int fa_frz_pos;     // protected position to avoid conflicting variants (last pos for SNPs/ins)
      int fa_mod_off;     // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
+    int fa_frz_mod;     // the fa_buf offset of the protected fa_frz_pos position, includes the modified sequence
      int fa_end_pos;     // region's end position in the original sequence
      int fa_length;      // region's length in the original sequence (in case end_pos not provided in the FASTA header)
-    int fa_case;        // output upper case or lower case?
+    int fa_case;        // output upper case or lower case: TO_UPPER|TO_LOWER
      int fa_src_pos;     // last genomic coordinate read from the input fasta (0-based)
      char prev_base;     // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
      int prev_base_pos;  // the position of prev_base
@@ -86,8 +101,8 @@ typedef struct
      int nvcf_buf, rid;
      char *chr, *chr_prefix;
  
-    regidx_t *mask;
-    regitr_t *itr;
+    mask_t *mask;
+    int nmask;
  
      int chain_id;       // chain_id, to provide a unique ID to each chain in the chain output
      chain_t *chain;     // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
@@ -103,7 +118,10 @@ typedef struct
      FILE *fp_chain;
      char **argv;
      int argc, output_iupac, haplotype, allele, isample, napplied;
-    char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
+    uint8_t *iupac_bitmask;
+    int miupac_bitmask;
+    char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+    char mark_del, mark_ins, mark_snv;
  }
  args_t;
  
@@ -184,7 +202,7 @@ static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_s
  //     fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
      int num = chain->num;
  
-    if (ref_start <= chain->ref_last_block_ori) {
+    if (num && ref_start <= chain->ref_last_block_ori) {
          // In case this variant is back-to-back with the previous one
          chain->ref_last_block_ori = ref_start + ref_len;
          chain->alt_last_block_ori = alt_start + alt_len;
@@ -224,11 +242,13 @@ static void init_data(args_t *args)
          if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
          args->isample = 0;
      }
-    if ( args->mask_fname )
+    int i;
+    for (i=0; i<args->nmask; i++)
      {
-        args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
-        if ( !args->mask ) error("Failed to initialize mask regions\n");
-        args->itr = regitr_init(args->mask);
+        mask_t *mask = &args->mask[i];
+        mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL);
+        if ( !mask->idx ) error("Failed to initialize mask regions\n");
+        mask->itr = regitr_init(mask->idx);
      }
      // In case we want to store the chains
      if ( args->chain_fname )
@@ -247,10 +267,28 @@ static void init_data(args_t *args)
      if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
      if ( args->filter_str )
          args->filter = filter_init(args->hdr, args->filter_str);
+    args->rid = -1;
+}
+static void add_mask(args_t *args, char *fname)  // append a new entry for mask file `fname` to args->mask (called for -m/--mask)
+{
+    args->nmask++;  // the new entry becomes the last element
+    args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask));  // NOTE(review): realloc result is not checked for NULL
+    mask_t *mask = &args->mask[args->nmask-1];  // NOTE(review): idx and itr stay unset here; they are assigned later in init_data()
+    mask->fname = fname;  // pointer is stored, the string is not copied
+    mask->with  = 'N';    // default replacement character; add_mask_with() can override it
+}
+static void add_mask_with(args_t *args, char *with)  // set the replacement mode of the most recently added mask (called for --mask-with)
+{
+    if ( !args->nmask ) error("The --mask-with option must follow --mask\n");  // there must be a preceding --mask to attach to
+    mask_t *mask = &args->mask[args->nmask-1];  // the last mask added by add_mask()
+    if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC;  // "uc" (any case): convert masked regions to upper case
+    else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC;  // "lc" (any case): convert masked regions to lower case
+    else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n");  // anything else must be exactly one character
+    else mask->with = *with;  // single character: used as the replacement character
  }
-
  static void destroy_data(args_t *args)
  {
+    free(args->iupac_bitmask);
      if (args->filter) filter_destroy(args->filter);
      bcf_sr_destroy(args->files);
      int i;
@@ -259,8 +297,13 @@ static void destroy_data(args_t *args)
      free(args->vcf_buf);
      free(args->fa_buf.s);
      free(args->chr);
-    if ( args->mask ) regidx_destroy(args->mask);
-    if ( args->itr ) regitr_destroy(args->itr);
+    for (i=0; i<args->nmask; i++)
+    {
+        mask_t *mask = &args->mask[i];
+        regidx_destroy(mask->idx);
+        regitr_destroy(mask->itr);
+    }
+    free(args->mask);
      if ( args->chain_fname )
          if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
      if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
@@ -299,6 +342,7 @@ static void init_region(args_t *args, char *line)
      args->fa_src_pos = from;
      args->fa_mod_off = 0;
      args->fa_frz_pos = -1;
+    args->fa_frz_mod = -1;
      args->fa_case    = -1;
      args->vcf_rbuf.n = 0;
      bcf_sr_seek(args->files,line,args->fa_ori_pos);
@@ -347,7 +391,6 @@ static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
  static void flush_fa_buffer(args_t *args, int len)
  {
      if ( !args->fa_buf.l ) return;
-
      int nwr = 0;
      while ( nwr + 60 <= args->fa_buf.l )
      {
@@ -358,6 +401,8 @@ static void flush_fa_buffer(args_t *args, int len)
      if ( nwr )
          args->fa_ori_pos += nwr;
  
+    args->fa_frz_mod -= nwr;
+
      if ( len )
      {
          // not finished on this chr yet and the buffer cannot be emptied completely
@@ -377,21 +422,84 @@ static void flush_fa_buffer(args_t *args, int len)
      args->fa_mod_off = 0;
      args->fa_buf.l = 0;
  }
+static void apply_absent(args_t *args, hts_pos_t pos)  // overwrite buffered bases between the frozen offset and `pos` with args->absent_allele
+{
+    if ( !args->fa_buf.l || pos <= args->fa_frz_pos + 1 || pos <= args->fa_ori_pos ) return;  // nothing buffered, or pos is not past the frozen position / the buffer start
+
+    int ie = pos && pos - args->fa_ori_pos + args->fa_mod_off < args->fa_buf.l ? pos - args->fa_ori_pos + args->fa_mod_off : args->fa_buf.l;  // exclusive end: buffer offset of pos, clamped to the buffer length. NOTE(review): pos==0 appears to be rejected by the guard above whenever fa_ori_pos>=0 - confirm
+    int ib = args->fa_frz_mod < 0 ? 0 : args->fa_frz_mod;  // start: the frozen buffer offset, or the buffer start when fa_frz_mod is negative
+    int i;
+    for (i=ib; i<ie; i++)  // no iterations when ib >= ie
+        args->fa_buf.s[i] = args->absent_allele;
+}
+static void freeze_ref(args_t *args, bcf1_t *rec)  // advance the frozen position past the reference span of `rec` without modifying the sequence
+{
+    if ( args->fa_frz_pos >= rec->pos + rec->rlen - 1 ) return;  // never move the frozen position backwards
+    args->fa_frz_pos = rec->pos + rec->rlen - 1;  // last original coordinate covered by the record
+    args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen;  // fa_buf offset one past the record; apply_absent() starts filling from here
+}
+static char *mark_del(char *ref, int rlen, char *alt, int mark)  // build a string of length rlen: the kept bases followed by `mark` padding; returns malloc'ed memory that the caller frees
+{
+    char *out = malloc(rlen+1);  // rlen characters plus the terminating NUL. NOTE(review): malloc result is not checked
+    int i;
+    if ( alt )
+    {
+        int nalt = strlen(alt);
+        for (i=0; i<nalt; i++) out[i] = alt[i];  // copy the ALT bases. NOTE(review): assumes strlen(alt) <= rlen; the visible callers only pass alt when len_diff<0
+    }
+    else    // symbolic <DEL>: the caller passes alt==NULL and the REF allele is kept instead
+    {
+        int nref = strlen(ref);
+        for (i=0; i<nref; i++) out[i] = ref[i];  // copy the REF bases. NOTE(review): assumes strlen(ref) <= rlen - confirm
+    }
+    for (; i<rlen; i++) out[i] = mark;  // pad the remainder with the marker character
+    out[rlen] = 0;
+    return out;
+}
+static void mark_ins(char *ref, char *alt, char mark)  // change, in place, the case of the ALT bases that extend beyond the length of REF
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    if ( mark=='l' )  // 'l' selects lower case
+        for (i=nref; i<nalt; i++) alt[i] = tolower(alt[i]);
+    else  // any other value selects upper case; main_consensus only ever stores 'u' or 'l'
+        for (i=nref; i<nalt; i++) alt[i] = toupper(alt[i]);
+}
+static void mark_snv(char *ref, char *alt, char mark)  // change, in place, the case of ALT bases that differ from REF at the same offset
+{
+    int i, nref = strlen(ref), nalt = strlen(alt);
+    int n = nref < nalt ? nref : nalt;  // only positions present in both alleles are compared
+    if ( mark=='l' )  // 'l' selects lower case
+    {
+        for (i=0; i<n; i++)
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = tolower(alt[i]);  // the comparison ignores case
+    }
+    else  // any other value selects upper case; main_consensus only ever stores 'u' or 'l'
+    {
+        for (i=0; i<n; i++)
+            if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);  // the comparison ignores case
+    }
+}
  static void apply_variant(args_t *args, bcf1_t *rec)
  {
      static int warned_haplotype = 0;
  
-    if ( rec->n_allele==1 && !args->missing_allele ) return;
+    if ( args->absent_allele ) apply_absent(args, rec->pos);
+    if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; }
  
+    int i,j;
      if ( args->mask )
      {
          char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
          int start = rec->pos;
          int end   = rec->pos + rec->rlen - 1;
-        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
+        for (i=0; i<args->nmask; i++)
+        {
+            mask_t *mask = &args->mask[i];
+            if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return;
+        }
      }
  
-    int i, ialt = 1;    // the alternate allele
+    int ialt = 1;    // the alternate allele
      if ( args->isample >= 0 )
      {
          bcf_unpack(rec, BCF_UN_FMT);
@@ -405,6 +513,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          enum { use_hap, use_iupac, pick_one } action = use_hap;
          if ( args->allele==PICK_IUPAC )
          {
+            if ( !args->haplotype ) action = use_iupac;
              if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac;
          }
          else if ( args->output_iupac ) action = use_iupac;
@@ -443,41 +552,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          }
          else if ( action==use_iupac ) 
          {
-            ialt = ptr[0];
-            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
+            ialt = -1;
+            int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
+            for (i=0; i<fmt->n; i++)
              {
-                if ( !args->missing_allele ) return;
-                ialt = -1;
-            }
-            else
-                ialt = bcf_gt_allele(ialt);
+                if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
+                if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
+                int jalt = bcf_gt_allele(ptr[i]);
+                if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+                if ( fallback_alt <= 0 ) fallback_alt = jalt;
  
-            int jalt;
-            if ( fmt->n>1 )
-            {
-                jalt = ptr[1];
-                if ( bcf_gt_is_missing(jalt) )
+                int l = strlen(rec->d.allele[jalt]);
+                for (j=0; j<l; j++)
+                    if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
+                if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
+
+                if ( l > mlen )
                  {
-                    if ( !args->missing_allele ) return;
-                    ialt = -1;
+                    hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+                    for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+                    mlen = l;
                  }
-                else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
-                else
-                    jalt = bcf_gt_allele(jalt);
-            }
-            else jalt = ialt;
-
-            if ( ialt>=0 )
-            {
-                if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-                if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
+                if ( jalt>0 && l>alen )
                  {
-                    char ial = rec->d.allele[ialt][0];
-                    char jal = rec->d.allele[jalt][0];
-                    if ( !ialt ) ialt = jalt;   // only ialt is used, make sure 0/1 is not ignored
-                    rec->d.allele[ialt][0] = gt2iupac(ial,jal);
+                    alen = l;
+                    ialt = jalt;
                  }
+                for (j=0; j<l; j++)
+                    args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
              }
+            if ( alen > 0 )
+                for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+            else if ( fallback_alt >= 0 )
+                ialt = fallback_alt;
+            else if ( is_missing && !args->missing_allele ) return;
          }
          else
          {
@@ -522,17 +630,50 @@ static void apply_variant(args_t *args, bcf1_t *rec)
                  }
              }
          }
-        if ( !ialt ) return;  // ref allele
+        if ( !ialt )
+        {
+            // ref allele
+            if ( args->absent_allele ) freeze_ref(args,rec);
+            return;
+        }
          if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
      }
-    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
+    else if ( args->output_iupac && rec->n_allele>1 )
      {
-        char ial = rec->d.allele[0][0];
-        char jal = rec->d.allele[1][0];
-        rec->d.allele[1][0] = gt2iupac(ial,jal);
+        int ialt, alen = 0, mlen = 0;
+        for (i=0; i<rec->n_allele; i++)
+        {
+            int l = strlen(rec->d.allele[i]);
+            for (j=0; j<l; j++)
+                if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+            if ( j<l ) continue;    // symbolic allele, breakpoint or invalid character in the allele
+
+            if ( l > mlen )
+            {
+                hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
+                for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
+                mlen = l;
+            }
+            if ( i>0 && l>alen )
+            {
+                alen = l;
+                ialt = i;
+            }
+            for (j=0; j<l; j++)
+                args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+        }
+        if ( alen > 0 )
+            for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+        else
+            ialt = 1;
      }
  
-    if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
+    if ( rec->n_allele==1 && ialt!=-1 )
+    {
+        // non-missing reference
+        if ( args->absent_allele ) freeze_ref(args,rec);
+        return;
+    }
      if ( ialt==-1 )
      {
          char alleles[4];
@@ -544,15 +685,34 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          ialt = 1;
      }
  
+    // For some variant types POS+REF refer to the base *before* the event; in such case set trim_beg
+    int trim_beg = 0;
+    int var_type = bcf_get_variant_type(rec,ialt);
+    int var_len  = rec->d.var[ialt].n;
+    if ( var_type & VCF_INDEL )
+    {
+        // normally indel starts one base after, but not if the first base of the fa reference is deleted
+        if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] )
+            trim_beg = 1;
+        else
+            trim_beg = 0;
+    }
+    else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"<DEL>") )
+    {
+        trim_beg = 1;
+        var_len  = 1 - rec->rlen;
+    }
+    else if ( (var_type & VCF_OTHER) && !strncasecmp(rec->d.allele[ialt],"<INS",4) ) trim_beg = 1;
+
      // Overlapping variant?
      if ( rec->pos <= args->fa_frz_pos )
      {
          // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
          // This still may not be enough for more complicated cases with multiple duplicate positions
          // and other types in between. In such case let the user normalize the VCF and remove duplicates.
+
          int overlap = 0;
-        if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1;
-        else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1;
+        if ( rec->pos < args->fa_frz_pos || !trim_beg || var_len==0 || args->prev_is_insert ) overlap = 1;
  
          if ( overlap )
          {
@@ -562,6 +722,9 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          
      }
  
+    char *alt_allele = rec->d.allele[ialt];
+    int rmme_alt = 0;
+
      int len_diff = 0, alen = 0;
      int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
      if ( idx<0 )
@@ -572,10 +735,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
      if ( rec->rlen > args->fa_buf.l - idx )
      {
          rec->rlen = args->fa_buf.l - idx;
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          if ( alen > rec->rlen )
          {
-            rec->d.allele[ialt][rec->rlen] = 0;
+            alt_allele[rec->rlen] = 0;
              fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
          }
      }
@@ -583,14 +746,44 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
  
      // sanity check the reference base
-    if ( rec->d.allele[ialt][0]=='<' )
+    if ( alt_allele[0]=='<' )
      {
-        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
-            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
-        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
-        len_diff = 1-rec->rlen;
-        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
-        alen = strlen(rec->d.allele[ialt]);
+        // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG
+
+        if ( strcasecmp(alt_allele,"<DEL>") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"<NON_REF>") )
+            error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n"
+                  "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n",
+                alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+        if ( !strcasecmp(alt_allele,"<DEL>") )
+        {
+            static int multibase_ref_del_warned = 0;
+            if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
+            {
+                fprintf(bcftools_stderr,
+                    "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
+                    "         (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+                multibase_ref_del_warned = 1;
+            }
+            if ( args->mark_del )   // insert dashes instead of delete sequence
+            {
+                alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
+                alen = rec->rlen;
+                len_diff = 0;
+                rmme_alt = 1;
+            }
+            else
+            {
+                len_diff = 1-rec->rlen;
+                alt_allele = rec->d.allele[0];     // according to VCF spec, the first REF base must precede the event
+                alen = 1;
+            }
+        }
+        else
+        {
+            // <*>  or <NON_REF> .. gVCF, evidence for the reference allele throughout the whole block
+            freeze_ref(args,rec);
+            return;
+        }
      }
      else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
      {
@@ -616,39 +809,63 @@ static void apply_variant(args_t *args, bcf1_t *rec)
              }
              error(
                      "The fasta sequence does not match the REF allele at %s:%"PRId64":\n"
-                    "   .vcf: [%s] <- (REF)\n" 
-                    "   .vcf: [%s] <- (ALT)\n" 
-                    "   .fa:  [%s]%c%s\n",
-                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
+                    "   REF .vcf: [%s]\n"
+                    "   ALT .vcf: [%s]\n"
+                    "   REF .fa : [%s]%c%s\n",
+                    bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
                      tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
                   );
          }
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          len_diff = alen - rec->rlen;
+
+        if ( args->mark_del && len_diff<0 ) 
+        {
+            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alen = rec->rlen;
+            len_diff = 0;
+            rmme_alt = 1;
+        }
      }
      else
      {
-        alen = strlen(rec->d.allele[ialt]);
+        alen = strlen(alt_allele);
          len_diff = alen - rec->rlen;
+
+        if ( args->mark_del && len_diff<0 ) 
+        {
+            alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
+            alen = rec->rlen;
+            len_diff = 0;
+            rmme_alt = 1;
+        }
      }
  
-    if ( args->fa_case )
-        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
+    args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
+    if ( args->fa_case==TO_UPPER )
+        for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
      else
-        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);
+        for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);
+
+    if ( args->mark_ins && len_diff>0 )
+        mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
+    if ( args->mark_snv )
+        mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
  
      if ( len_diff <= 0 )
      {
          // deletion or same size event
-        for (i=0; i<alen; i++)
-            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+        assert( args->fa_buf.l >= idx+rec->rlen );
+        args->prev_base = args->fa_buf.s[idx+rec->rlen-1];
+        args->prev_base_pos = rec->pos + rec->rlen - 1;
+        args->prev_is_insert = 0;
+        args->fa_frz_mod = idx + alen;
+
+        for (i=trim_beg; i<alen; i++)
+            args->fa_buf.s[idx+i] = alt_allele[i];
  
          if ( len_diff )
              memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
-
-        args->prev_base = rec->d.allele[0][rec->rlen - 1];
-        args->prev_base_pos = rec->pos + rec->rlen - 1;
-        args->prev_is_insert = 0;
      }
      else
      {
@@ -665,14 +882,16 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          //      1   C   T
          //      1   C   CAA
          int ibeg = 0;
-        while ( ibeg<alen && rec->d.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
+        while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos  ) ibeg++;
          for (i=ibeg; i<alen; i++)
-            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
+            args->fa_buf.s[idx+i] = alt_allele[i];
+
+        args->fa_frz_mod = idx + alen - ibeg + 1;
      }
      if (args->chain && len_diff != 0)
      {
          // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
-        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
+        if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
          {
              // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
              push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
@@ -687,6 +906,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
      args->fa_mod_off += len_diff;
      args->fa_frz_pos  = rec->pos + rec->rlen - 1;
      args->napplied++;
+    if ( rmme_alt ) free(alt_allele);
  }
  
  
@@ -694,17 +914,27 @@ static void mask_region(args_t *args, char *seq, int len)
  {
      int start = args->fa_src_pos - len;
      int end   = args->fa_src_pos;
+    int i;
  
-    if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
-
-    int idx_start, idx_end, i;
-    while ( regitr_overlap(args->itr) )
+    for (i=0; i<args->nmask; i++)
      {
-        idx_start = args->itr->beg - start;
-        idx_end   = args->itr->end - start;
-        if ( idx_start < 0 ) idx_start = 0;
-        if ( idx_end >= len ) idx_end = len - 1;
-        for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
+        mask_t *mask = &args->mask[i];
+        if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue;
+
+        int idx_start, idx_end, j;
+        while ( regitr_overlap(mask->itr) )
+        {
+            idx_start = mask->itr->beg - start;
+            idx_end   = mask->itr->end - start;
+            if ( idx_start < 0 ) idx_start = 0;
+            if ( idx_end >= len ) idx_end = len - 1;
+            if ( mask->with==MASK_UC )
+                for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]);
+            else if ( mask->with==MASK_LC )
+                for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]);
+            else
+                for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with;
+        }
      }
  }
  
@@ -722,13 +952,20 @@ static void consensus(args_t *args)
                  print_chain(args);
                  destroy_chain(args);
              }
-            // apply all cached variants
-            while ( args->vcf_rbuf.n )
+            // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
+            bcf1_t **rec_ptr = NULL;
+            while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
              {
-                bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
+                bcf1_t *rec = *rec_ptr;
                  if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
-                int i = rbuf_shift(&args->vcf_rbuf);
-                apply_variant(args, args->vcf_buf[i]);
+                apply_variant(args, rec);
+            }
+            if ( args->absent_allele )
+            {
+                int pos = 0;
+                if ( args->vcf_rbuf.n && args->vcf_buf[args->vcf_rbuf.f]->rid==args->rid )
+                    pos = args->vcf_buf[args->vcf_rbuf.f]->pos;
+                apply_absent(args, pos);
              }
              flush_fa_buffer(args, 0);
              init_region(args, str.s+1);
@@ -773,7 +1010,11 @@ static void consensus(args_t *args)
              }
              apply_variant(args, rec);
          }
-        if ( !rec_ptr ) flush_fa_buffer(args, 60);
+        if ( !rec_ptr )
+        {
+            if ( args->absent_allele ) apply_absent(args, args->fa_ori_pos - args->fa_mod_off + args->fa_buf.l);
+            flush_fa_buffer(args, 60);
+        }
      }
      bcf1_t **rec_ptr = NULL;
      while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
@@ -789,6 +1030,7 @@ static void consensus(args_t *args)
          print_chain(args);
          destroy_chain(args);
      }
+    if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
      flush_fa_buffer(args, 0);
      bgzf_close(fasta);
      free(str.s);
@@ -803,33 +1045,39 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "       --sample (and, optionally, --haplotype) option will apply genotype\n");
      fprintf(bcftools_stderr, "       (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
      fprintf(bcftools_stderr, "       information, such as INFO/AD or FORMAT/AD.\n");
-    fprintf(bcftools_stderr, "Usage:   bcftools consensus [OPTIONS] <file.vcf.gz>\n");
+    fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -c, --chain <file>         write a chain file for liftover\n");
-    fprintf(bcftools_stderr, "    -e, --exclude <expr>       exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "    -f, --fasta-ref <file>     reference sequence in fasta format\n");
-    fprintf(bcftools_stderr, "    -H, --haplotype <which>    choose which allele to use from the FORMAT/GT field, note\n");
-    fprintf(bcftools_stderr, "                               the codes are case-insensitive:\n");
-    fprintf(bcftools_stderr, "                                   1: first allele from GT, regardless of phasing\n");
-    fprintf(bcftools_stderr, "                                   2: second allele from GT, regardless of phasing\n");
-    fprintf(bcftools_stderr, "                                   R: REF allele in het genotypes\n");
-    fprintf(bcftools_stderr, "                                   A: ALT allele\n");
-    fprintf(bcftools_stderr, "                                   LR,LA: longer allele and REF/ALT if equal length\n");
-    fprintf(bcftools_stderr, "                                   SR,SA: shorter allele and REF/ALT if equal length\n");
-    fprintf(bcftools_stderr, "                                   1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
-    fprintf(bcftools_stderr, "    -i, --include <expr>       select sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "    -I, --iupac-codes          output variants in the form of IUPAC ambiguity codes\n");
-    fprintf(bcftools_stderr, "    -m, --mask <file>          replace regions with N\n");
-    fprintf(bcftools_stderr, "    -M, --missing <char>       output <char> instead of skipping the missing genotypes\n");
-    fprintf(bcftools_stderr, "    -o, --output <file>        write output to a file [standard output]\n");
-    fprintf(bcftools_stderr, "    -p, --prefix <string>      prefix to add to output sequence names\n");
-    fprintf(bcftools_stderr, "    -s, --sample <name>        apply variants of the given sample\n");
+    fprintf(bcftools_stderr, "    -c, --chain FILE               write a chain file for liftover\n");
+    fprintf(bcftools_stderr, "    -a, --absent CHAR              replace positions absent from VCF with CHAR\n");
+    fprintf(bcftools_stderr, "    -e, --exclude EXPR             exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "    -f, --fasta-ref FILE           reference sequence in fasta format\n");
+    fprintf(bcftools_stderr, "    -H, --haplotype WHICH          choose which allele to use from the FORMAT/GT field, note\n");
+    fprintf(bcftools_stderr, "                                   the codes are case-insensitive:\n");
+    fprintf(bcftools_stderr, "                                       1: first allele from GT, regardless of phasing\n");
+    fprintf(bcftools_stderr, "                                       2: second allele from GT, regardless of phasing\n");
+    fprintf(bcftools_stderr, "                                       R: REF allele in het genotypes\n");
+    fprintf(bcftools_stderr, "                                       A: ALT allele\n");
+    fprintf(bcftools_stderr, "                                       I: IUPAC code for all genotypes\n");
+    fprintf(bcftools_stderr, "                                       LR,LA: longer allele and REF/ALT if equal length\n");
+    fprintf(bcftools_stderr, "                                       SR,SA: shorter allele and REF/ALT if equal length\n");
+    fprintf(bcftools_stderr, "                                       1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
+    fprintf(bcftools_stderr, "    -i, --include EXPR             select sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "    -I, --iupac-codes              output variants in the form of IUPAC ambiguity codes\n");
+    fprintf(bcftools_stderr, "        --mark-del CHAR            instead of removing sequence, insert CHAR for deletions\n");
+    fprintf(bcftools_stderr, "        --mark-ins uc|lc           highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(bcftools_stderr, "        --mark-snv uc|lc           highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+    fprintf(bcftools_stderr, "    -m, --mask FILE                replace regions according to the next --mask-with option. The default is --mask-with N\n");
+    fprintf(bcftools_stderr, "        --mask-with CHAR|uc|lc     replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+    fprintf(bcftools_stderr, "    -M, --missing CHAR             output CHAR instead of skipping a missing genotype \"./.\"\n");
+    fprintf(bcftools_stderr, "    -o, --output FILE              write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "    -p, --prefix STRING            prefix to add to output sequence names\n");
+    fprintf(bcftools_stderr, "    -s, --sample NAME              apply variants of the given sample\n");
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "   # Get the consensus for one region. The fasta header lines are then expected\n");
      fprintf(bcftools_stderr, "   # in the form \">chr:from-to\".\n");
      fprintf(bcftools_stderr, "   samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_consensus(int argc, char *argv[])
@@ -839,6 +1087,10 @@ int main_consensus(int argc, char *argv[])
  
      static struct option loptions[] = 
      {
+        {"mark-del",required_argument,NULL,1},
+        {"mark-ins",required_argument,NULL,2},
+        {"mark-snv",required_argument,NULL,3},
+        {"mask-with",1,0,4},
          {"exclude",required_argument,NULL,'e'},
          {"include",required_argument,NULL,'i'},
          {"sample",1,0,'s'},
@@ -848,23 +1100,44 @@ int main_consensus(int argc, char *argv[])
          {"fasta-ref",1,0,'f'},
          {"mask",1,0,'m'},
          {"missing",1,0,'M'},
+        {"absent",1,0,'a'},
          {"chain",1,0,'c'},
          {"prefix",required_argument,0,'p'},
          {0,0,0,0}
      };
      int c;
-    while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) 
+    while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
      {
          switch (c) 
          {
+            case  1 : args->mark_del = optarg[0]; break;
+            case  2 :
+                if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u';
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l';
+                else error("The argument is not recognised: --mark-ins %s\n",optarg);
+                break;
+            case  3 :
+                if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u';
+                else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l';
+                else error("The argument is not recognised: --mark-snv %s\n",optarg);
+                break;
              case 'p': args->chr_prefix = optarg; break;
              case 's': args->sample = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 'I': args->output_iupac = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e': 
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i': 
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'f': args->ref_fname = optarg; break;
-            case 'm': args->mask_fname = optarg; break;
+            case 'm': add_mask(args,optarg); break;
+            case  4 : add_mask_with(args,optarg); break;
+            case 'a':
+                args->absent_allele = optarg[0];
+                if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
+                break;
              case 'M': 
                  args->missing_allele = optarg[0]; 
                  if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
@@ -879,6 +1152,7 @@ int main_consensus(int argc, char *argv[])
                  else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
                  else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
                  else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
+                else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC;
                  else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1;
                  else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2;
                  else
diff --git a/bcftools/convert.c b/bcftools/convert.c

index fbf98e0ad93add24c2be7724116399089cc92323..71dfb51b455d9679e4059bb853645da6b16af875 100644 (file)
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -1,6 +1,6 @@
  /*  convert.c -- functions for converting between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -40,6 +41,7 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "variantkey.h"
  #include "convert.h"
+#include "filter.h"
  
  #define T_CHROM   1
  #define T_POS     2
@@ -73,6 +75,7 @@ THE SOFTWARE.  */
  #define T_RSX          30   // RSID HEX
  #define T_VKX          31   // VARIANTKEY HEX
  #define T_PBINOM       32
+#define T_NPASS        33
  
  typedef struct _fmt_t
  {
@@ -503,7 +506,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
                  type_t val = x[j]; \
                  if ( !val ) continue; \
                  for (i=0; i<nbits; i+=2) \
-                    if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+                    if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
              } \
          } \
          if ( fmt->subscript<0 || fmt->subscript==2 ) \
@@ -513,7 +516,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
                  type_t val = x[j]; \
                  if ( !val ) continue; \
                  for (i=1; i<nbits; i+=2) \
-                    if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+                    if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
              } \
          } \
      }
@@ -521,7 +524,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
      {
          case BCF_BT_INT8:  BRANCH(uint8_t, 8); break;
          case BCF_BT_INT16: BRANCH(uint16_t,16); break;
-        case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+        case BCF_BT_INT32: BRANCH(uint32_t,30); break;  // 2 bits unused to account for the reserved BCF values
          default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
      }
      #undef BRANCH
@@ -782,8 +785,8 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in
          int j;
          for (j=0; j<n; j++)
          {
-            if ( ptr[j]==bcf_int32_vector_end ) break;
-            if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; }
+            if ( bcf_float_is_vector_end(ptr[j]) ) break;
+            if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
              if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
              sum+=ptr[j];
          }
@@ -1122,6 +1125,21 @@ static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt,
      ksprintf(str, "%016" PRIx64 "", vk);
  }
  
+static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)   // N_PASS(expr) handler: appends to str the number of samples flagged by the filter kept in fmt->usr; isample is not used
+{
+    int i, nsmpl = 0;                       // nsmpl: running count of samples with a non-zero flag
+    filter_t *flt = (filter_t*) fmt->usr;   // filter created by filter_init() in register_tag() for T_NPASS
+    const uint8_t *smpl;                    // per-sample flags, filled in by filter_test() below
+    filter_test(flt,line,&smpl);            // the site-level return value is ignored, only the per-sample flags are read
+    for (i=0; i<convert->nsamples; i++)
+        if ( smpl[i] ) nsmpl++;             // NOTE(review): assumes filter_test() always sets smpl to an array of at least convert->nsamples entries - confirm for expressions without FORMAT fields
+    kputd(nsmpl, str);                      // NOTE(review): kputd is called with an int count; presumably prints a plain integer - confirm for very large counts
+}
+static void destroy_npass(void *usr)    // destroy callback registered for T_NPASS tags; usr is expected to be the filter_t stored in fmt->usr
+{
+    filter_destroy((filter_t*)usr);     // release the filter allocated by filter_init() in register_tag()
+}
+
  static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      int i;
@@ -1225,11 +1243,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
              else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
              else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
          }
-        if ( fmt->type==T_PBINOM )
+        else if ( fmt->type==T_PBINOM )
          {
              fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
              if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id)  ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
          }
+        else if ( fmt->type==T_NPASS )
+        {
+            filter_t *flt = filter_init(convert->header,key);
+            convert->max_unpack |= filter_max_unpack(flt);
+            fmt->usr = (void*) flt;
+        }
      }
  
      switch (fmt->type)
@@ -1266,6 +1290,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
          case T_RSX: fmt->handler = &process_rsid_hex; break;
          case T_VKX: fmt->handler = &process_variantkey_hex; break;
          case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
+        case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
          default: error("TODO: handler for type %d\n", fmt->type);
      }
      if ( key && fmt->type==T_INFO )
@@ -1344,6 +1369,8 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
              register_tag(convert, T_PBINOM, str.s, is_gtf);
              q++;
          }
+        else if ( !strcmp(str.s,"N_PASS") )
+            error("N_PASS() must be placed outside the square brackets\n");
          else
          {
              fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
@@ -1380,7 +1407,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
          else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
          else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
          else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
-        else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n");
+        else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
          else if ( !strcmp(str.s, "INFO") )
          {
              if ( *q=='/' )
@@ -1398,6 +1425,22 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
          }
          else if ( !strcmp(str.s, "FORMAT") )
               register_tag(convert, T_FORMAT, NULL, 0);
+        else if ( !strcmp(str.s,"N_PASS") )
+        {
+            if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
+            p = ++q;
+            str.l = 0;
+            int nopen = 1;
+            while ( *q && nopen )
+            {
+                if ( *q=='(' ) nopen++;
+                else if ( *q==')' ) nopen--;
+                q++;
+            }
+            if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
+            kputsn(p, q-p-1, &str);
+            register_tag(convert, T_NPASS, str.s, is_gtf);
+        }
          else
          {
              fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
@@ -1565,7 +1608,8 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
              for (js=0; js<convert->nsamples; js++)
              {
                  // Skip samples when filtering was requested
-                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+                int ks = convert->samples[js];
+                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
  
                  // Here comes a hack designed for TBCSQ. When running on large files,
                  // such as 1000GP, there are too many empty fields in the output and
@@ -1574,7 +1618,6 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
                  // brackets here. This may be changed in future, time will show...
                  size_t l_start = str->l;
              
-                int ks = convert->samples[js];
                  for (k=i; k<j; k++)
                  {
                      if ( convert->fmt[k].type == T_MASK )
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c

index 8f049114a0f7ba148b5d53b20645328b5104322e..e3c995f54f9b7e471752ec4e2f35829979330d45 100644 (file)
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  convert.c -- functions for converting between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -42,6 +43,7 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "variantkey.h"
  #include "convert.h"
+#include "filter.h"
  
  #define T_CHROM   1
  #define T_POS     2
@@ -75,6 +77,7 @@ THE SOFTWARE.  */
  #define T_RSX          30   // RSID HEX
  #define T_VKX          31   // VARIANTKEY HEX
  #define T_PBINOM       32
+#define T_NPASS        33
  
  typedef struct _fmt_t
  {
@@ -270,7 +273,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
              case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
              case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
              case BCF_BT_CHAR:  kputc(info->v1.i, str); break;
-            default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+            default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
          }
      }
      else if ( fmt->subscript >=0 )
@@ -292,7 +295,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
              case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
              case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
              case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
-            default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+            default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
          }
          #undef BRANCH
      }
@@ -505,7 +508,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
                  type_t val = x[j]; \
                  if ( !val ) continue; \
                  for (i=0; i<nbits; i+=2) \
-                    if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+                    if ( val & (mask<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
              } \
          } \
          if ( fmt->subscript<0 || fmt->subscript==2 ) \
@@ -515,7 +518,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
                  type_t val = x[j]; \
                  if ( !val ) continue; \
                  for (i=1; i<nbits; i+=2) \
-                    if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+                    if ( val & (1<<i) ) { kputs(csq->str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
              } \
          } \
      }
@@ -523,8 +526,8 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam
      {
          case BCF_BT_INT8:  BRANCH(uint8_t, 8); break;
          case BCF_BT_INT16: BRANCH(uint16_t,16); break;
-        case BCF_BT_INT32: BRANCH(uint32_t,32); break;
-        default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+        case BCF_BT_INT32: BRANCH(uint32_t,30); break;  // 2 bits unused to account for the reserved BCF values
+        default: error("Unexpected type: %d\n", fmt->fmt->type); bcftools_exit(1); break;
      }
      #undef BRANCH
  
@@ -784,8 +787,8 @@ static void process_gp_to_prob3(convert_t *convert, bcf1_t *line, fmt_t *fmt, in
          int j;
          for (j=0; j<n; j++)
          {
-            if ( ptr[j]==bcf_int32_vector_end ) break;
-            if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; }
+            if ( bcf_float_is_vector_end(ptr[j]) ) break;
+            if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
              if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
              sum+=ptr[j];
          }
@@ -1124,6 +1127,21 @@ static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt,
      ksprintf(str, "%016" PRIx64 "", vk);
  }
  
+static void process_npass(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)   // N_PASS(expr) handler: appends to str the number of samples flagged by the filter kept in fmt->usr; isample is not used
+{
+    int i, nsmpl = 0;                       // nsmpl: running count of samples with a non-zero flag
+    filter_t *flt = (filter_t*) fmt->usr;   // filter created by filter_init() in register_tag() for T_NPASS
+    const uint8_t *smpl;                    // per-sample flags, filled in by filter_test() below
+    filter_test(flt,line,&smpl);            // the site-level return value is ignored, only the per-sample flags are read
+    for (i=0; i<convert->nsamples; i++)
+        if ( smpl[i] ) nsmpl++;             // NOTE(review): assumes filter_test() always sets smpl to an array of at least convert->nsamples entries - confirm for expressions without FORMAT fields
+    kputd(nsmpl, str);                      // NOTE(review): kputd is called with an int count; presumably prints a plain integer - confirm for very large counts
+}
+static void destroy_npass(void *usr)    // destroy callback registered for T_NPASS tags; usr is expected to be the filter_t stored in fmt->usr
+{
+    filter_destroy((filter_t*)usr);     // release the filter allocated by filter_init() in register_tag()
+}
+
  static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      int i;
@@ -1227,11 +1245,17 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
              else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
              else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
          }
-        if ( fmt->type==T_PBINOM )
+        else if ( fmt->type==T_PBINOM )
          {
              fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
              if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id)  ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
          }
+        else if ( fmt->type==T_NPASS )
+        {
+            filter_t *flt = filter_init(convert->header,key);
+            convert->max_unpack |= filter_max_unpack(flt);
+            fmt->usr = (void*) flt;
+        }
      }
  
      switch (fmt->type)
@@ -1268,6 +1292,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
          case T_RSX: fmt->handler = &process_rsid_hex; break;
          case T_VKX: fmt->handler = &process_variantkey_hex; break;
          case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
+        case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
          default: error("TODO: handler for type %d\n", fmt->type);
      }
      if ( key && fmt->type==T_INFO )
@@ -1346,6 +1371,8 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
              register_tag(convert, T_PBINOM, str.s, is_gtf);
              q++;
          }
+        else if ( !strcmp(str.s,"N_PASS") )
+            error("N_PASS() must be placed outside the square brackets\n");
          else
          {
              fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
@@ -1382,7 +1409,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
          else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
          else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
          else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
-        else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. (todo)\n");
+        else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
          else if ( !strcmp(str.s, "INFO") )
          {
              if ( *q=='/' )
@@ -1400,6 +1427,22 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
          }
          else if ( !strcmp(str.s, "FORMAT") )
               register_tag(convert, T_FORMAT, NULL, 0);
+        else if ( !strcmp(str.s,"N_PASS") )
+        {
+            if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
+            p = ++q;
+            str.l = 0;
+            int nopen = 1;
+            while ( *q && nopen )
+            {
+                if ( *q=='(' ) nopen++;
+                else if ( *q==')' ) nopen--;
+                q++;
+            }
+            if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
+            kputsn(p, q-p-1, &str);
+            register_tag(convert, T_NPASS, str.s, is_gtf);
+        }
          else
          {
              fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
@@ -1567,7 +1610,8 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
              for (js=0; js<convert->nsamples; js++)
              {
                  // Skip samples when filtering was requested
-                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+                int ks = convert->samples[js];
+                if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue;
  
                  // Here comes a hack designed for TBCSQ. When running on large files,
                  // such as 1000GP, there are too many empty fields in the output and
@@ -1576,7 +1620,6 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
                  // brackets here. This may be changed in future, time will show...
                  size_t l_start = str->l;
              
-                int ks = convert->samples[js];
                  for (k=i; k<j; k++)
                  {
                      if ( convert->fmt[k].type == T_MASK )
diff --git a/bcftools/csq.c b/bcftools/csq.c

index c9a0132d3bf179e7dc66d1eb846f4fee9cc08ed0..8e3ee3b7eec2b592636ac21f855921a16cdea82e 100644 (file)
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -1,9 +1,6 @@
-//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
-
-
  /* The MIT License
  
-   Copyright (c) 2016-2018 Genome Research Ltd.
+   Copyright (c) 2016-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -136,6 +133,7 @@
   
  #include <stdio.h>
  #include <stdlib.h>
+#include <assert.h>
  #include <getopt.h>
  #include <math.h>
  #include <inttypes.h>
@@ -592,8 +590,8 @@ typedef struct _args_t
      char *bcsq_tag;
      int argc, output_type;
      int phase, verbosity, local_csq, record_cmd_line;
-    int ncsq_max, nfmt_bcsq;    // maximum number of csq per site that can be accessed from FORMAT/BCSQ
-    int ncsq_small_warned;
+    int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
+    int ncsq2_small_warned;
      int brief_predictions;
      
      int rid;                    // current chromosome
@@ -680,11 +678,42 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
      int iseq;
      if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
      {
-        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
-        aux->seq[aux->nseq] = strdup(chr_beg);
-        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
-        aux->nseq++;
-        assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        // check for possible mismatch in chromosome naming convention such as chrX vs X
+        char *new_chr = NULL;
+        if ( faidx_has_seq(args->fai,chr_beg) )
+            new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
+        else
+        {
+            int len = strlen(chr_beg);
+            if ( !strncmp("chr",chr_beg,3) && len>3 )
+                new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
+            else
+            {
+                new_chr = malloc(len+3);                // gff does not have the prefix, faidx has
+                memcpy(new_chr,"chr",3);
+                memcpy(new_chr+3,chr_beg,len);
+                new_chr[len+3] = 0;
+            }
+            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
+            {
+                static int unkwn_chr_warned = 0;
+                if ( !unkwn_chr_warned && args->verbosity>0 )
+                    fprintf(stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
+                unkwn_chr_warned = 1;
+                free(new_chr);
+                new_chr = strdup(chr_beg);              // use the original sequence name
+            }
+        }
+        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
+        {
+            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+            aux->seq[aux->nseq] = new_chr;
+            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+            aux->nseq++;
+            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        }
+        else
+            free(new_chr);
      }
      chr_end[1] = c;
      return iseq;
@@ -1140,7 +1169,8 @@ void tscript_init_cds(args_t *args)
                          tscript_ok = 0;
                          break;
                      }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                  }
                  len += tr->cds[i]->len; 
              }
@@ -1178,7 +1208,8 @@ void tscript_init_cds(args_t *args)
                          tscript_ok = 0;
                          break;
                      }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                  }
                  len += tr->cds[i]->len;
              }
@@ -1196,8 +1227,17 @@ void tscript_init_cds(args_t *args)
              gf_cds_t *a = tr->cds[i-1];
              gf_cds_t *b = tr->cds[i];
              if ( a->beg + a->len - 1 >= b->beg ) 
-                error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", 
-                    kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+            {
+                if ( args->force )
+                {
+                    fprintf(stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
+                        args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                }
+                else
+                    error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
+                          "       Use the --force option to override (at your own risk).\n", 
+                            args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+            }
          }
          if ( len%3 != 0 )
          {
@@ -1337,9 +1377,22 @@ void init_gff(args_t *args)
      khash_str2int_destroy_free(aux->ignored_biotypes);
  }
  
+static inline int ncsq2_to_nfmt(int ncsq2)
+{
+    return 1 + (ncsq2 - 1) / 30;
+}
+static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
+{
+    *ival = icsq2 / 30;
+    *ibit = icsq2 % 30;
+}
+
  void init_data(args_t *args)
  {
-    args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; 
+    args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
+
+    args->fai = fai_load(args->fa_fname);
+    if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
  
      if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
      init_gff(args);
@@ -1349,9 +1402,6 @@ void init_data(args_t *args)
      if ( args->filter_str )
          args->filter = filter_init(args->hdr, args->filter_str);
  
-    args->fai = fai_load(args->fa_fname);
-    if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
-
      args->pos2vbuf  = kh_init(pos2vbuf);
      args->active_tr = khp_init(trhp);
      args->hap = (hap_t*) calloc(1,sizeof(hap_t));
@@ -1395,7 +1445,7 @@ void init_data(args_t *args)
      }
      else
      {
-        args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
          if ( args->n_threads > 0)
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
@@ -1410,6 +1460,11 @@ void init_data(args_t *args)
  
  void destroy_data(args_t *args)
  {
+    if ( args->ncsq2_small_warned )     // non-zero: the largest consequence index that did not fit
+        fprintf(stderr,
+            "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
+            "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)*4,1+args->ncsq2_small_warned/2);   // 4 bytes per 32-bit FORMAT value
+
      regidx_destroy(args->idx_cds);
      regidx_destroy(args->idx_utr);
      regidx_destroy(args->idx_exon);
@@ -2683,13 +2738,13 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
  
  void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
  {
-    if ( !args->brief_predictions )
+    if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
          kputs(aa->s, str);
      else
      {
-        int len = aa->l;
+        int i, len = aa->l;
          if ( aa->s[len-1]=='*' ) len--;
-        kputc(aa->s[0], str);
+        for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
          kputs("..", str);
          kputw(beg+len, str);
      }
@@ -3083,22 +3138,24 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha
      {
          csq_t *csq = node->csq_list + i;
          vrec_t *vrec = csq->vrec;
-        int icsq = 2*csq->idx + ihap;
-        if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+        int icsq2 = 2*csq->idx + ihap;
+        if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
          {
-            if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+            if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
              {
                  fprintf(stderr,
                      "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
                      args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
-                if ( !args->ncsq_small_warned )
+                if ( !args->ncsq2_small_warned )
                      fprintf(stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
-                args->ncsq_small_warned = 1;
              }
+            if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
              break;
          }
-        if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
-        vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+        int ival, ibit;
+        icsq2_to_bit(icsq2, &ival,&ibit);
+        if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+        vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
      }
  }
  
@@ -3727,22 +3784,26 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
          {
              if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
  
-            int icsq = 2*csq->idx + j;
-            if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+            int icsq2 = 2*csq->idx + j;
+            if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
              {
                  int ismpl = args->smpl->idx[i];
-                if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+                if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
                  {
                      fprintf(stderr,
                              "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
-                            args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1);
-                    if ( !args->ncsq_small_warned )
+                            args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
+                    if ( !args->ncsq2_small_warned )
                          fprintf(stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
-                    args->ncsq_small_warned = 1;
+                    // no reset to 1 here: the largest overflowing index is recorded just below
                  }
+                if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
+                break;
              }
-            if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
-            vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+            int ival, ibit;
+            icsq2_to_bit(icsq2, &ival,&ibit);
+            if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+            vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
          }
      }
  }
@@ -4041,39 +4102,39 @@ static const char *usage(void)
      return 
          "\n"
          "About: Haplotype-aware consequence caller.\n"
-        "Usage: bcftools csq [options] in.vcf\n"
+        "Usage: bcftools csq [OPTIONS] in.vcf\n"
          "\n"
          "Required options:\n"
-        "   -f, --fasta-ref <file>          reference file in fasta format\n"
-        "   -g, --gff-annot <file>          gff3 annotation file\n"
+        "   -f, --fasta-ref FILE            reference file in fasta format\n"
+        "   -g, --gff-annot FILE            gff3 annotation file\n"
          "\n"
          "CSQ options:\n"
-        "   -b, --brief-predictions         annotate with abbreviated protein-changing predictions\n"
-        "   -c, --custom-tag <string>       use this tag instead of the default BCSQ\n"
+        "   -B, --trim-protein-seq INT      abbreviate protein-changing predictions to max INT aminoacids\n" 
+        "   -c, --custom-tag STRING         use this tag instead of the default BCSQ\n"
          "   -l, --local-csq                 localized predictions, consider only one VCF record at a time\n"
-        "   -n, --ncsq <int>                maximum number of consequences to consider per site [16]\n"
-        "   -p, --phase <a|m|r|R|s>         how to handle unphased heterozygous genotypes: [r]\n"
+        "   -n, --ncsq INT                  maximum number of per-haplotype consequences to consider for each site [15]\n"
+        "   -p, --phase a|m|r|R|s           how to handle unphased heterozygous genotypes: [r]\n"
          "                                     a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
          "                                     m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
          "                                     r: require phased GTs, throw an error on unphased het GTs\n"
          "                                     R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
          "                                     s: skip unphased hets\n"
          "Options:\n"
-        "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
+        "   -e, --exclude EXPR              exclude sites for which the expression is true\n"
          "       --force                     run even if some sanity checks fail\n"
-        "   -i, --include <expr>            select sites for which the expression is true\n"
+        "   -i, --include EXPR              select sites for which the expression is true\n"
          "       --no-version                do not append version and command line to the header\n"
-        "   -o, --output <file>             write output to a file [standard output]\n"
-        "   -O, --output-type <b|u|z|v|t>   b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+        "   -o, --output FILE               write output to a file [standard output]\n"
+        "   -O, --output-type b|u|z|v|t     b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
          "                                   v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
-        "   -r, --regions <region>          restrict to comma-separated list of regions\n"
-        "   -R, --regions-file <file>       restrict to regions listed in a file\n"
-        "   -s, --samples <-|list>          samples to include or \"-\" to apply all variants and ignore samples\n"
-        "   -S, --samples-file <file>       samples to include\n"
-        "   -t, --targets <region>          similar to -r but streams rather than index-jumps\n"
-        "   -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n"
-        "       --threads <int>             use multithreading with <int> worker threads [0]\n"
-        "   -v, --verbose <int>             verbosity level 0-2 [1]\n"
+        "   -r, --regions REGION            restrict to comma-separated list of regions\n"
+        "   -R, --regions-file FILE         restrict to regions listed in a file\n"
+        "   -s, --samples -|LIST            samples to include or \"-\" to apply all variants and ignore samples\n"
+        "   -S, --samples-file FILE         samples to include\n"
+        "   -t, --targets REGION            similar to -r but streams rather than index-jumps\n"
+        "   -T, --targets-file FILE         similar to -R but streams rather than index-jumps\n"
+        "       --threads INT               use multithreading with <int> worker threads [0]\n"
+        "   -v, --verbose INT               verbosity level 0-2 [1]\n"
          "\n"
          "Example:\n"
          "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4090,7 +4151,7 @@ int main_csq(int argc, char *argv[])
      args->argc = argc; args->argv = argv;
      args->output_type = FT_VCF;
      args->bcsq_tag = "BCSQ";
-    args->ncsq_max = 2*16;
+    args->ncsq2_max = 2*(16-1);      // 1 bit is reserved for BCF missing values
      args->verbosity = 1;
      args->record_cmd_line = 1;
  
@@ -4100,7 +4161,8 @@ int main_csq(int argc, char *argv[])
          {"threads",required_argument,NULL,2},
          {"help",0,0,'h'},
          {"ncsq",1,0,'n'},
-        {"brief-predictions",0,0,'b'},
+        {"brief-predictions",no_argument,0,'b'},
+        {"trim-protein-seq",required_argument,0,'B'},
          {"custom-tag",1,0,'c'},
          {"local-csq",0,0,'l'},
          {"gff-annot",1,0,'g'},
@@ -4123,7 +4185,7 @@ int main_csq(int argc, char *argv[])
      };
      int c, targets_is_file = 0, regions_is_file = 0; 
      char *targets_list = NULL, *regions_list = NULL, *tmp;
-    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
      {
          switch (c) 
          {
@@ -4133,7 +4195,14 @@ int main_csq(int argc, char *argv[])
                  if ( *tmp ) error("Could not parse argument: --threads  %s\n", optarg);
                  break;
              case  3 : args->record_cmd_line = 0; break;
-            case 'b': args->brief_predictions = 1; break;
+            case 'b':
+                    args->brief_predictions = 1;
+                    fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+                    break;
+            case 'B': 
+                    args->brief_predictions = strtol(optarg,&tmp,10);
+                    if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
+                    break;
              case 'l': args->local_csq = 1; break;
              case 'c': args->bcsq_tag = optarg; break;
              case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
@@ -4155,8 +4224,8 @@ int main_csq(int argc, char *argv[])
              case 'f': args->fa_fname = optarg; break;
              case 'g': args->gff_fname = optarg; break;
              case 'n': 
-                args->ncsq_max = 2 * atoi(optarg);
-                if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+                args->ncsq2_max = 2 * atoi(optarg);
+                if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
                  break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
@@ -4169,8 +4238,12 @@ int main_csq(int argc, char *argv[])
                            default: error("The output type \"%s\" not recognised\n", optarg);
                        }
                        break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': regions_list = optarg; break;
              case 'R': regions_list = optarg; regions_is_file = 1; break;
              case 's': args->sample_list = optarg; break;
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c

index e0c3001852a8c32a97cfbf8067209f879a3e5c76..e7f6a704fde445ab20aad1eb8c269b5da0628f6e 100644 (file)
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -1,11 +1,8 @@
  #include "bcftools.pysam.h"
  
-//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
-
-
  /* The MIT License
  
-   Copyright (c) 2016-2018 Genome Research Ltd.
+   Copyright (c) 2016-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -138,6 +135,7 @@
   
  #include <stdio.h>
  #include <stdlib.h>
+#include <assert.h>
  #include <getopt.h>
  #include <math.h>
  #include <inttypes.h>
@@ -594,8 +592,8 @@ typedef struct _args_t
      char *bcsq_tag;
      int argc, output_type;
      int phase, verbosity, local_csq, record_cmd_line;
-    int ncsq_max, nfmt_bcsq;    // maximum number of csq per site that can be accessed from FORMAT/BCSQ
-    int ncsq_small_warned;
+    int ncsq2_max, nfmt_bcsq;   // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
+    int ncsq2_small_warned;
      int brief_predictions;
      
      int rid;                    // current chromosome
@@ -682,11 +680,42 @@ static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
      int iseq;
      if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
      {
-        hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
-        aux->seq[aux->nseq] = strdup(chr_beg);
-        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
-        aux->nseq++;
-        assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        // check for possible mismatch in chromosome naming convention such as chrX vs X
+        char *new_chr = NULL;
+        if ( faidx_has_seq(args->fai,chr_beg) )
+            new_chr = strdup(chr_beg);                  // valid chr name, the same in gff and faidx
+        else
+        {
+            int len = strlen(chr_beg);
+            if ( !strncmp("chr",chr_beg,3) && len>3 )
+                new_chr = strdup(chr_beg+3);            // gff has the prefix, faidx does not
+            else
+            {
+                new_chr = malloc(len+4);                // gff lacks the "chr" prefix, faidx has it: 3 + len + 1 (NUL) bytes
+                memcpy(new_chr,"chr",3);
+                memcpy(new_chr+3,chr_beg,len);
+                new_chr[len+3] = 0;                     // terminator now lands on the last allocated byte
+            }
+            if ( !faidx_has_seq(args->fai,new_chr) )    // modification did not help, this sequence is not in fai
+            {
+                static int unkwn_chr_warned = 0;
+                if ( !unkwn_chr_warned && args->verbosity>0 )
+                    fprintf(bcftools_stderr,"Warning: GFF chromosome \"%s\" not part of the reference genome\n",chr_beg);
+                unkwn_chr_warned = 1;
+                free(new_chr);
+                new_chr = strdup(chr_beg);              // use the original sequence name
+            }
+        }
+        if ( khash_str2int_get(aux->seq2int, new_chr, &iseq)!=0 )
+        {
+            hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+            aux->seq[aux->nseq] = new_chr;
+            iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+            aux->nseq++;
+            assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
+        }
+        else
+            free(new_chr);
      }
      chr_end[1] = c;
      return iseq;
@@ -1142,7 +1171,8 @@ void tscript_init_cds(args_t *args)
                          tscript_ok = 0;
                          break;
                      }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                  }
                  len += tr->cds[i]->len; 
              }
@@ -1180,7 +1210,8 @@ void tscript_init_cds(args_t *args)
                          tscript_ok = 0;
                          break;
                      }
-                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
+                    error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
+                        args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
                  }
                  len += tr->cds[i]->len;
              }
@@ -1198,8 +1229,17 @@ void tscript_init_cds(args_t *args)
              gf_cds_t *a = tr->cds[i-1];
              gf_cds_t *b = tr->cds[i];
              if ( a->beg + a->len - 1 >= b->beg ) 
-                error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n", 
-                    kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+            {
+                if ( args->force )
+                {
+                    fprintf(bcftools_stderr,"Warning: GFF contains overlapping CDS %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32".\n",
+                        args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+                }
+                else
+                    error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
+                          "       Use the --force option to override (at your own risk).\n", 
+                            args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+            }
          }
          if ( len%3 != 0 )
          {
@@ -1339,9 +1379,22 @@ void init_gff(args_t *args)
      khash_str2int_destroy_free(aux->ignored_biotypes);
  }
  
+static inline int ncsq2_to_nfmt(int ncsq2)
+{
+    return 1 + (ncsq2 - 1) / 30;
+}
+static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit)
+{
+    *ival = icsq2 / 30;
+    *ibit = icsq2 % 30;
+}
+
  void init_data(args_t *args)
  {
-    args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; 
+    args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max);
+
+    args->fai = fai_load(args->fa_fname);
+    if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
  
      if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname);
      init_gff(args);
@@ -1351,9 +1404,6 @@ void init_data(args_t *args)
      if ( args->filter_str )
          args->filter = filter_init(args->hdr, args->filter_str);
  
-    args->fai = fai_load(args->fa_fname);
-    if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
-
      args->pos2vbuf  = kh_init(pos2vbuf);
      args->active_tr = khp_init(trhp);
      args->hap = (hap_t*) calloc(1,sizeof(hap_t));
@@ -1397,7 +1447,7 @@ void init_data(args_t *args)
      }
      else
      {
-        args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
          if ( args->n_threads > 0)
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
@@ -1412,6 +1462,11 @@ void init_data(args_t *args)
  
  void destroy_data(args_t *args)
  {
+    if ( args->ncsq2_small_warned )     // non-zero: the largest consequence index that did not fit
+        fprintf(bcftools_stderr,
+            "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n"
+            "      the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)*4,1+args->ncsq2_small_warned/2);   // 4 bytes per 32-bit FORMAT value
+
      regidx_destroy(args->idx_cds);
      regidx_destroy(args->idx_utr);
      regidx_destroy(args->idx_exon);
@@ -2685,13 +2740,13 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
  
  void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
  {
-    if ( !args->brief_predictions )
+    if ( !args->brief_predictions || (int)aa->l - args->brief_predictions < 3 )
          kputs(aa->s, str);
      else
      {
-        int len = aa->l;
+        int i, len = aa->l;
          if ( aa->s[len-1]=='*' ) len--;
-        kputc(aa->s[0], str);
+        for (i=0; i<len && i<args->brief_predictions; i++) kputc(aa->s[i], str);
          kputs("..", str);
          kputw(beg+len, str);
      }
@@ -3085,22 +3140,24 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha
      {
          csq_t *csq = node->csq_list + i;
          vrec_t *vrec = csq->vrec;
-        int icsq = 2*csq->idx + ihap;
-        if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+        int icsq2 = 2*csq->idx + ihap;
+        if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
          {
-            if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+            if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
              {
                  fprintf(bcftools_stderr,
                      "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
                      args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx);
-                if ( !args->ncsq_small_warned )
+                if ( !args->ncsq2_small_warned )
                      fprintf(bcftools_stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
-                args->ncsq_small_warned = 1;
              }
+            if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
              break;
          }
-        if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
-        vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+        int ival, ibit;
+        icsq2_to_bit(icsq2, &ival,&ibit);
+        if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+        vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
      }
  }
  
@@ -3729,22 +3786,26 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
          {
              if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
  
-            int icsq = 2*csq->idx + j;
-            if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+            int icsq2 = 2*csq->idx + j;
+            if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT
              {
                  int ismpl = args->smpl->idx[i];
-                if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) )
+                if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) )
                  {
                      fprintf(bcftools_stderr,
                              "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n",
-                            args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1);
-                    if ( !args->ncsq_small_warned )
+                            args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1);
+                    if ( !args->ncsq2_small_warned )
                          fprintf(bcftools_stderr,"         The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n");
-                    args->ncsq_small_warned = 1;
+                    // no reset to 1 here: the largest overflowing index is recorded just below
                  }
+                if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2;
+                break;
              }
-            if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
-            vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+            int ival, ibit;
+            icsq2_to_bit(icsq2, &ival,&ibit);
+            if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
+            vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
          }
      }
  }
@@ -4043,39 +4104,39 @@ static const char *usage(void)
      return 
          "\n"
          "About: Haplotype-aware consequence caller.\n"
-        "Usage: bcftools csq [options] in.vcf\n"
+        "Usage: bcftools csq [OPTIONS] in.vcf\n"
          "\n"
          "Required options:\n"
-        "   -f, --fasta-ref <file>          reference file in fasta format\n"
-        "   -g, --gff-annot <file>          gff3 annotation file\n"
+        "   -f, --fasta-ref FILE            reference file in fasta format\n"
+        "   -g, --gff-annot FILE            gff3 annotation file\n"
          "\n"
          "CSQ options:\n"
-        "   -b, --brief-predictions         annotate with abbreviated protein-changing predictions\n"
-        "   -c, --custom-tag <string>       use this tag instead of the default BCSQ\n"
+        "   -B, --trim-protein-seq INT      abbreviate protein-changing predictions to max INT aminoacids\n" 
+        "   -c, --custom-tag STRING         use this tag instead of the default BCSQ\n"
          "   -l, --local-csq                 localized predictions, consider only one VCF record at a time\n"
-        "   -n, --ncsq <int>                maximum number of consequences to consider per site [16]\n"
-        "   -p, --phase <a|m|r|R|s>         how to handle unphased heterozygous genotypes: [r]\n"
+        "   -n, --ncsq INT                  maximum number of per-haplotype consequences to consider for each site [15]\n"
+        "   -p, --phase a|m|r|R|s           how to handle unphased heterozygous genotypes: [r]\n"
          "                                     a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
          "                                     m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
          "                                     r: require phased GTs, throw an error on unphased het GTs\n"
          "                                     R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
          "                                     s: skip unphased hets\n"
          "Options:\n"
-        "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
+        "   -e, --exclude EXPR              exclude sites for which the expression is true\n"
          "       --force                     run even if some sanity checks fail\n"
-        "   -i, --include <expr>            select sites for which the expression is true\n"
+        "   -i, --include EXPR              select sites for which the expression is true\n"
          "       --no-version                do not append version and command line to the header\n"
-        "   -o, --output <file>             write output to a file [standard output]\n"
-        "   -O, --output-type <b|u|z|v|t>   b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+        "   -o, --output FILE               write output to a file [standard output]\n"
+        "   -O, --output-type b|u|z|v|t     b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
          "                                   v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
-        "   -r, --regions <region>          restrict to comma-separated list of regions\n"
-        "   -R, --regions-file <file>       restrict to regions listed in a file\n"
-        "   -s, --samples <-|list>          samples to include or \"-\" to apply all variants and ignore samples\n"
-        "   -S, --samples-file <file>       samples to include\n"
-        "   -t, --targets <region>          similar to -r but streams rather than index-jumps\n"
-        "   -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n"
-        "       --threads <int>             use multithreading with <int> worker threads [0]\n"
-        "   -v, --verbose <int>             verbosity level 0-2 [1]\n"
+        "   -r, --regions REGION            restrict to comma-separated list of regions\n"
+        "   -R, --regions-file FILE         restrict to regions listed in a file\n"
+        "   -s, --samples -|LIST            samples to include or \"-\" to apply all variants and ignore samples\n"
+        "   -S, --samples-file FILE         samples to include\n"
+        "   -t, --targets REGION            similar to -r but streams rather than index-jumps\n"
+        "   -T, --targets-file FILE         similar to -R but streams rather than index-jumps\n"
+        "       --threads INT               use multithreading with <int> worker threads [0]\n"
+        "   -v, --verbose INT               verbosity level 0-2 [1]\n"
          "\n"
          "Example:\n"
          "   bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4092,7 +4153,7 @@ int main_csq(int argc, char *argv[])
      args->argc = argc; args->argv = argv;
      args->output_type = FT_VCF;
      args->bcsq_tag = "BCSQ";
-    args->ncsq_max = 2*16;
+    args->ncsq2_max = 2*(16-1);      // 1 bit is reserved for BCF missing values
      args->verbosity = 1;
      args->record_cmd_line = 1;
  
@@ -4102,7 +4163,8 @@ int main_csq(int argc, char *argv[])
          {"threads",required_argument,NULL,2},
          {"help",0,0,'h'},
          {"ncsq",1,0,'n'},
-        {"brief-predictions",0,0,'b'},
+        {"brief-predictions",no_argument,0,'b'},
+        {"trim-protein-seq",required_argument,0,'B'},
          {"custom-tag",1,0,'c'},
          {"local-csq",0,0,'l'},
          {"gff-annot",1,0,'g'},
@@ -4125,7 +4187,7 @@ int main_csq(int argc, char *argv[])
      };
      int c, targets_is_file = 0, regions_is_file = 0; 
      char *targets_list = NULL, *regions_list = NULL, *tmp;
-    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
      {
          switch (c) 
          {
@@ -4135,7 +4197,14 @@ int main_csq(int argc, char *argv[])
                  if ( *tmp ) error("Could not parse argument: --threads  %s\n", optarg);
                  break;
              case  3 : args->record_cmd_line = 0; break;
-            case 'b': args->brief_predictions = 1; break;
+            case 'b':
+                    args->brief_predictions = 1;
+                    fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
+                    break;
+            case 'B': 
+                    args->brief_predictions = strtol(optarg,&tmp,10);
+                    if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
+                    break;
              case 'l': args->local_csq = 1; break;
              case 'c': args->bcsq_tag = optarg; break;
              case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
@@ -4157,8 +4226,8 @@ int main_csq(int argc, char *argv[])
              case 'f': args->fa_fname = optarg; break;
              case 'g': args->gff_fname = optarg; break;
              case 'n': 
-                args->ncsq_max = 2 * atoi(optarg);
-                if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+                args->ncsq2_max = 2 * atoi(optarg);
+                if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
                  break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
@@ -4171,8 +4240,12 @@ int main_csq(int argc, char *argv[])
                            default: error("The output type \"%s\" not recognised\n", optarg);
                        }
                        break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': regions_list = optarg; break;
              case 'R': regions_list = optarg; regions_is_file = 1; break;
              case 's': args->sample_list = optarg; break;
diff --git a/bcftools/dist.c b/bcftools/dist.c

new file mode 100644 (file)

index 0000000..094fc73
--- /dev/null
+++ b/bcftools/dist.c
@@ -0,0 +1,124 @@
+/* The MIT License
+
+   Copyright (c) 2016-2020 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "dist.h"
+
+extern void error(const char *format, ...);
+
+// State of one logarithmically binned histogram, see dist_insert() for how values map to bins
+struct _dist_t
+{
+    uint64_t *bins, nvalues;    // bins: per-bin counters, grown on demand; nvalues: running total updated on insert
+    int nbins;                  // number of elements currently allocated in bins
+    int npow;   // the number of orders of magnitude to represent exactly
+    int nexact; // pow(10,npow)
+    int nlevel; // nexact - pow(10,npow-1), the number of bins taken by each log-step level
+};
+
+/*
+ *  dist_init() - create an empty distribution
+ *  @npow:  the number of orders of magnitude to bin exactly; values up to
+ *          10^npow each get a bin of their own
+ *
+ *  No bins are allocated here, dist_insert() grows the array on demand.
+ *  The returned handle is released with dist_destroy().
+ */
+dist_t *dist_init(int npow)
+{
+    dist_t *dist = (dist_t*) calloc(1,sizeof(dist_t));
+    // report the failure instead of dereferencing NULL below; error() is
+    // declared extern in this file and is presumably fatal - TODO confirm
+    if ( !dist ) error("Error: could not allocate %zu bytes\n", sizeof(dist_t));
+    dist->npow   = npow;
+    dist->nexact = pow(10,npow);
+    dist->nlevel = dist->nexact - pow(10,npow-1);   // bins per log-step level
+    return dist;
+}
+
+/*
+ *  dist_destroy() - release the bins and the handle itself.
+ *  Passing NULL is allowed and does nothing.
+ */
+void dist_destroy(dist_t *dist)
+{
+    if ( dist )
+    {
+        free(dist->bins);
+        free(dist);
+    }
+}
+
+// Number of bins currently allocated, i.e. one more than the highest bin index touched by dist_insert()
+int dist_nbins(dist_t *dist)
+{
+    return dist->nbins;
+}
+
+// Running total of inserted values. NOTE(review): the counter is stored as uint64_t
+// but returned as int, so very large totals are narrowed by the conversion.
+int dist_nvalues(dist_t *dist)
+{
+    return dist->nvalues;
+}
+
+/*
+ *  dist_insert() - count one occurrence of value
+ *
+ *  Values up to and including dist->nexact are stored in a bin of their own
+ *  (the bin index equals the value). Bigger values share bins whose width is
+ *  a power of ten which grows with the order of magnitude of the value.
+ *
+ *  Returns the index of the bin that was incremented.
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value)
+{
+    int ibin;
+
+    if ( value <= dist->nexact ) 
+        ibin = value;
+    else
+    {
+        int npow  = (int) log10(value);         // order of magnitude of the value
+        int level = npow - dist->npow + 1;      // 1 for the first decade past the exact range, 2 for the next, ...
+        uint32_t step = pow(10, level);         // width of the bins at this level
+        ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step;
+    }
+
+    if ( ibin >= dist->nbins )
+    {
+        // Grow via a temporary: assigning realloc's result directly would lose the
+        // old array on failure and the memset below would then write through NULL
+        uint64_t *bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1));
+        if ( !bins ) error("Error: could not allocate %d histogram bins\n", ibin+1);
+        dist->bins = bins;
+        // realloc does not initialise the newly added counters
+        memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins));
+        dist->nbins = ibin+1;
+    }
+    dist->bins[ibin]++;
+    dist->nvalues++;
+    return ibin;
+}
+/*
+ *  dist_insert_n() - count cnt occurrences of value
+ *
+ *  Returns the index of the bin that was incremented. Nothing is inserted
+ *  when cnt is zero and 0 is returned in that case.
+ */
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt)
+{
+    if ( !cnt ) return 0;
+    int ibin = dist_insert(dist, value);
+    // dist_insert() has already counted one occurrence, both in the bin and
+    // in the running total, so only the remaining cnt-1 are added here
+    dist->bins[ibin] += cnt - 1;
+    dist->nvalues += cnt - 1;
+    return ibin;
+}
+
+/*
+ *  dist_get() - read the count stored in one bin
+ *  @idx:       index of the bin
+ *  @beg,end:   if not NULL, filled with the half-open interval [beg,end) of
+ *              values which the bin covers
+ *
+ *  Returns the raw count of the bin. Bins past the highest one filled by
+ *  dist_insert() were never allocated and are reported as empty.
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end)
+{
+    if ( idx < dist->nexact )
+    {
+        // exact range, the bin holds a single value
+        if ( beg ) *beg = idx;
+        if ( end ) *end = idx + 1;
+    }
+    else
+    {
+        // invert the mapping used by dist_insert()
+        int level = (idx - dist->nexact) / dist->nlevel + 1;
+        int bin   = idx - dist->nexact - dist->nlevel*(level-1);
+
+        uint32_t step  = pow(10, level);
+        uint32_t value = pow(10, level + dist->npow - 1) + step*bin;
+
+        if ( beg ) *beg = value;
+        if ( end ) *end = value + step;
+    }
+    // guard against reading past the end of the array (or through NULL before the first insert)
+    if ( idx >= (uint32_t)dist->nbins ) return 0;
+    return dist->bins[idx];
+}
+
diff --git a/bcftools/dist.c.pysam.c b/bcftools/dist.c.pysam.c

new file mode 100644 (file)

index 0000000..f3f0915
--- /dev/null
+++ b/bcftools/dist.c.pysam.c
@@ -0,0 +1,126 @@
+#include "bcftools.pysam.h"
+
+/* The MIT License
+
+   Copyright (c) 2016-2020 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "dist.h"
+
+extern void error(const char *format, ...);
+
+// State of one logarithmically binned histogram, see dist_insert() for how values map to bins
+struct _dist_t
+{
+    uint64_t *bins, nvalues;    // bins: per-bin counters, grown on demand; nvalues: running total updated on insert
+    int nbins;                  // number of elements currently allocated in bins
+    int npow;   // the number of orders of magnitude to represent exactly
+    int nexact; // pow(10,npow)
+    int nlevel; // nexact - pow(10,npow-1), the number of bins taken by each log-step level
+};
+
+/*
+ *  dist_init() - create an empty distribution
+ *  @npow:  the number of orders of magnitude to bin exactly; values up to
+ *          10^npow each get a bin of their own
+ *
+ *  No bins are allocated here, dist_insert() grows the array on demand.
+ *  The returned handle is released with dist_destroy().
+ */
+dist_t *dist_init(int npow)
+{
+    dist_t *dist = (dist_t*) calloc(1,sizeof(dist_t));
+    // report the failure instead of dereferencing NULL below; error() is
+    // declared extern in this file and is presumably fatal - TODO confirm
+    if ( !dist ) error("Error: could not allocate %zu bytes\n", sizeof(dist_t));
+    dist->npow   = npow;
+    dist->nexact = pow(10,npow);
+    dist->nlevel = dist->nexact - pow(10,npow-1);   // bins per log-step level
+    return dist;
+}
+
+/*
+ *  dist_destroy() - release the bins and the handle itself.
+ *  Passing NULL is allowed and does nothing.
+ */
+void dist_destroy(dist_t *dist)
+{
+    if ( dist )
+    {
+        free(dist->bins);
+        free(dist);
+    }
+}
+
+// Number of bins currently allocated, i.e. one more than the highest bin index touched by dist_insert()
+int dist_nbins(dist_t *dist)
+{
+    return dist->nbins;
+}
+
+// Running total of inserted values. NOTE(review): the counter is stored as uint64_t
+// but returned as int, so very large totals are narrowed by the conversion.
+int dist_nvalues(dist_t *dist)
+{
+    return dist->nvalues;
+}
+
+/*
+ *  dist_insert() - count one occurrence of value
+ *
+ *  Values up to and including dist->nexact are stored in a bin of their own
+ *  (the bin index equals the value). Bigger values share bins whose width is
+ *  a power of ten which grows with the order of magnitude of the value.
+ *
+ *  Returns the index of the bin that was incremented.
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value)
+{
+    int ibin;
+
+    if ( value <= dist->nexact ) 
+        ibin = value;
+    else
+    {
+        int npow  = (int) log10(value);         // order of magnitude of the value
+        int level = npow - dist->npow + 1;      // 1 for the first decade past the exact range, 2 for the next, ...
+        uint32_t step = pow(10, level);         // width of the bins at this level
+        ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step;
+    }
+
+    if ( ibin >= dist->nbins )
+    {
+        // Grow via a temporary: assigning realloc's result directly would lose the
+        // old array on failure and the memset below would then write through NULL
+        uint64_t *bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1));
+        if ( !bins ) error("Error: could not allocate %d histogram bins\n", ibin+1);
+        dist->bins = bins;
+        // realloc does not initialise the newly added counters
+        memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins));
+        dist->nbins = ibin+1;
+    }
+    dist->bins[ibin]++;
+    dist->nvalues++;
+    return ibin;
+}
+/*
+ *  dist_insert_n() - count cnt occurrences of value
+ *
+ *  Returns the index of the bin that was incremented. Nothing is inserted
+ *  when cnt is zero and 0 is returned in that case.
+ */
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt)
+{
+    if ( !cnt ) return 0;
+    int ibin = dist_insert(dist, value);
+    // dist_insert() has already counted one occurrence, both in the bin and
+    // in the running total, so only the remaining cnt-1 are added here
+    dist->bins[ibin] += cnt - 1;
+    dist->nvalues += cnt - 1;
+    return ibin;
+}
+
+/*
+ *  dist_get() - read the count stored in one bin
+ *  @idx:       index of the bin
+ *  @beg,end:   if not NULL, filled with the half-open interval [beg,end) of
+ *              values which the bin covers
+ *
+ *  Returns the raw count of the bin. Bins past the highest one filled by
+ *  dist_insert() were never allocated and are reported as empty.
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end)
+{
+    if ( idx < dist->nexact )
+    {
+        // exact range, the bin holds a single value
+        if ( beg ) *beg = idx;
+        if ( end ) *end = idx + 1;
+    }
+    else
+    {
+        // invert the mapping used by dist_insert()
+        int level = (idx - dist->nexact) / dist->nlevel + 1;
+        int bin   = idx - dist->nexact - dist->nlevel*(level-1);
+
+        uint32_t step  = pow(10, level);
+        uint32_t value = pow(10, level + dist->npow - 1) + step*bin;
+
+        if ( beg ) *beg = value;
+        if ( end ) *end = value + step;
+    }
+    // guard against reading past the end of the array (or through NULL before the first insert)
+    if ( idx >= (uint32_t)dist->nbins ) return 0;
+    return dist->bins[idx];
+}
+
diff --git a/bcftools/dist.h b/bcftools/dist.h

new file mode 100644 (file)

index 0000000..5c9c571
--- /dev/null
+++ b/bcftools/dist.h
@@ -0,0 +1,98 @@
+/* The MIT License
+
+   Copyright (c) 2016-2020 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+/*
+    Logarithmic binning
+
+    Example of usage:
+
+        // Initialize, make the binning exact up to 10^4, then add a log-step
+        dist_t *dist = dist_init(4);
+
+        // Insert values
+        int i;
+        for (i=0; i<1e6; i++)
+            dist_insert(dist, i);
+
+        // Number of bins used
+        int n = dist_nbins(dist);
+
+        // Now print the distribution
+        uint32_t beg, end;
+        for (i=0; i<n; i++)
+        {
+            // Raw count in the bin. The boundaries beg,end are optional, 
+            // and can be used to plot correctly the density
+            uint64_t cnt = dist_get(dist, i, &beg, &end);
+            if ( !cnt ) continue;
+
+            // Print the interval, count and density
+            printf("%u\t%u\t%"PRIu64"\t%f\n", beg, end, cnt, (double)cnt/(end-beg));
+        }
+
+        // Clean up
+        dist_destroy(dist);
+ */
+
+#ifndef __DIST_H__
+#define __DIST_H__
+
+#include <stdio.h>
+#include <inttypes.h>
+
+typedef struct _dist_t dist_t;
+
+/*
+ *  dist_init() - init bins
+ */
+dist_t *dist_init(int npow);
+void dist_destroy(dist_t *dist);
+
+/*
+    dist_nbins() - get the number of bins
+ */
+int dist_nbins(dist_t *dist);
+
+/*
+    dist_nvalues() - get the total number of values inserted
+ */
+int dist_nvalues(dist_t *dist);
+
+/*
+    dist_insert()   - insert new value
+    dist_insert_n() - insert new value n times
+ */
+uint32_t dist_insert(dist_t *dist, uint32_t value);
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt);
+
+/*
+   dist_get() 
+   @idx:        from the interval [0,dist_nbins()-1]
+   @beg,end:    [beg,end)
+ */
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end);
+
+#endif
+
diff --git a/bcftools/em.c b/bcftools/em.c

index a976f22abdbd3a0654b4689ed0c9f42e21466abe..baa34907d0b94e3fc4d3e0e0e7d7662fe4872bbb 100644 (file)
--- a/bcftools/em.c
+++ b/bcftools/em.c
@@ -1,7 +1,7 @@
  /*  em.c -- mathematical functions.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Portions copyright (C) 2013 Genome Research Ltd.
+    Portions copyright (C) 2013-2014 Genome Research Ltd.
  
      Author: Heng Li <lh3@live.co.uk>
  
diff --git a/bcftools/em.c.pysam.c b/bcftools/em.c.pysam.c

index db27d065ed7196b57cb0f093a110ae451d8f9956..37a3dea2e24a5d1ccb62f4b4531f03bddc75c96b 100644 (file)
--- a/bcftools/em.c.pysam.c
+++ b/bcftools/em.c.pysam.c
@@ -3,7 +3,7 @@
  /*  em.c -- mathematical functions.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Portions copyright (C) 2013 Genome Research Ltd.
+    Portions copyright (C) 2013-2014 Genome Research Ltd.
  
      Author: Heng Li <lh3@live.co.uk>
  
diff --git a/bcftools/extsort.c b/bcftools/extsort.c

new file mode 100644 (file)

index 0000000..014e03b
--- /dev/null
+++ b/bcftools/extsort.c
@@ -0,0 +1,250 @@
+/*  extsort.c -- sort on disk
+
+   Copyright (C) 2020-2021 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>     // for unlink()
+#include <sys/stat.h>   // for chmod()
+#include <assert.h>
+#include <fcntl.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+#include "bcftools.h"
+#include "extsort.h"
+#include "kheap.h"
+
+// One sorted run of records that was flushed to a temporary file
+typedef struct
+{
+    extsort_t *es;  // this is to get access to extsort_cmp_f from kheap
+    int fd;         // descriptor of the temporary file; set to -1 once the file was read to the end and closed
+    char *fname;    // name of the temporary file, used in error messages
+    void *dat;      // buffer of es->dat_size bytes holding the record most recently read from the file
+}
+blk_t;
+
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr);
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)     /* defines khp_blk_t */
+
+// State of the external sorter
+struct _extsort_t
+{
+    // dat_size: size in bytes of one record, as written to and read from the temporary files
+    // mem:      bytes accounted for the records buffered since the last flush
+    // max_mem:  buffered records are flushed to disk before mem would exceed this limit
+    size_t dat_size, mem, max_mem;
+    char *tmp_prefix;       // template for the temporary file names, freed in extsort_destroy()
+    extsort_cmp_f cmp;      // user comparator, used both by qsort() and by the heap
+
+    // nbuf: number of records in buf; mbuf: capacity counter maintained by hts_expand(); nblk: number of blocks in blk
+    size_t nbuf, mbuf, nblk;
+    blk_t **blk;            // sorted blocks flushed to disk so far
+    // buf: records pushed but not flushed yet; tmp_dat: holds the record returned by the last extsort_shift()
+    void **buf, *tmp_dat;
+    khp_blk_t *bhp;         // heap of blocks which still have records, created in extsort_sort()
+};
+
+// Heap ordering: returns 1 if the current record of block *aptr sorts strictly before
+// the current record of block *bptr, 0 otherwise
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+    blk_t *lhs = *aptr, *rhs = *bptr;
+
+    // the comparator receives pointers to the record pointers, the same way qsort() calls it in _buf_flush()
+    return lhs->es->cmp(&lhs->dat,&rhs->dat) < 0 ? 1 : 0;
+}
+
+size_t parse_mem_string(const char *str);
+
+/*
+ *  extsort_set() - set one option of the sorter
+ *  @key:    the option to set, unrecognised keys are silently ignored
+ *  @value:  pointer to the new setting, its type depends on the key:
+ *              DAT_SIZE    .. size_t*
+ *              MAX_MEM     .. const char**, the string is parsed by parse_mem_string()
+ *              TMP_PREFIX  .. const char**, the string is passed through init_tmp_prefix()
+ *              FUNC_CMP    .. extsort_cmp_f*
+ */
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value)
+{
+    if ( key==DAT_SIZE ) { es->dat_size = *((size_t*)value); return; }
+    if ( key==MAX_MEM )
+    {
+        es->max_mem = parse_mem_string(*((const char**)value));
+        if ( es->max_mem <=0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value));
+        return;
+    }
+    if ( key==TMP_PREFIX )
+    {
+        // The prefix is owned by the sorter (extsort_destroy() frees it), so a
+        // previously set one must be released or it would leak
+        char *prefix = init_tmp_prefix(*((const char**)value));
+        free(es->tmp_prefix);
+        es->tmp_prefix = prefix;
+        return;
+    }
+    if ( key==FUNC_CMP ) { es->cmp = *((extsort_cmp_f*)value); return; }
+}
+
+/*
+ *  extsort_alloc() - allocate a zeroed sorter with the default memory limit
+ *  of 100e6 bytes. Options are then set with extsort_set() and the sorter is
+ *  finalised with extsort_init().
+ */
+extsort_t *extsort_alloc(void)
+{
+    extsort_t *sorter = (extsort_t*) calloc(1,sizeof(*sorter));
+    sorter->max_mem = 100e6;
+    return sorter;
+}
+/*
+ *  extsort_init() - finish the set up after the options were set. The
+ *  comparator and the record size are mandatory, the temporary file prefix
+ *  falls back to the default chosen by init_tmp_prefix(NULL).
+ */
+void extsort_init(extsort_t *es)
+{
+    // both must have been provided via extsort_set()
+    assert( es->cmp );
+    assert( es->dat_size );
+
+    if ( es->tmp_prefix==NULL ) es->tmp_prefix = init_tmp_prefix(NULL);
+
+    // spare record buffer which extsort_shift() swaps with the block buffers
+    es->tmp_dat = malloc(es->dat_size);
+}
+
+/*
+ *  extsort_destroy() - close the temporary files which are still open and
+ *  free all memory held by the sorter, including the sorter itself
+ */
+void extsort_destroy(extsort_t *es)
+{
+    size_t i;
+
+    // Records which were pushed but never flushed are owned by the sorter
+    // (_buf_flush() frees them after writing). This only matters when the
+    // sorter is destroyed without calling extsort_sort(), which otherwise
+    // flushes the records and releases the buffer
+    for (i=0; i<es->nbuf; i++) free(es->buf[i]);
+    free(es->buf);
+
+    for (i=0; i<es->nblk; i++)
+    {
+        blk_t *blk = es->blk[i];
+        // fd is -1 if the block was read to the end and closed by _blk_read()
+        if ( blk->fd!=-1 )
+#ifdef _WIN32
+            _close(blk->fd);
+#else
+            close(blk->fd);
+#endif
+        free(blk->fname);
+        free(blk->dat);
+        free(blk);
+    }
+    free(es->tmp_dat);
+    free(es->tmp_prefix);
+    free(es->blk);
+    khp_destroy(blk, es->bhp);
+    free(es);
+}
+
+/*
+ *  _buf_flush() - sort the buffered records and write them to a new
+ *  temporary file. The records are freed after writing, the file is rewound
+ *  and kept open for reading, and the memory accounting is reset. Does
+ *  nothing when the buffer is empty.
+ */
+static void _buf_flush(extsort_t *es)
+{
+    int i;
+    if ( !es->nbuf ) return;
+
+    qsort(es->buf, es->nbuf, sizeof(void*), es->cmp);
+
+    // register a new block for this run
+    es->nblk++;
+    es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk);
+    es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t));
+    blk_t *blk = es->blk[es->nblk-1];
+    blk->es    = es;
+    blk->dat   = malloc(es->dat_size);
+    blk->fname = strdup(es->tmp_prefix);
+    #ifdef _WIN32
+        for (i=0; i<100000; i++)
+        {
+            // restore the template, mktemp() modified it in the previous iteration
+            memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix));
+            mktemp(blk->fname);
+            blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600);
+            if ( blk->fd==-1 )
+            {
+                if ( errno==EEXIST ) continue; 
+                error("Error: failed to open a temporary file %s\n",blk->fname);
+            }
+            break;
+        }
+        // _open() signals failure with -1, not 0: when all attempts ended with EEXIST the descriptor is -1 here
+        if ( blk->fd==-1 ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix);
+        if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+    #else
+        if ( (blk->fd = mkstemp(blk->fname))==-1 )
+            error("Error: failed to open a temporary file %s\n",blk->fname);
+        if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+        unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open
+    #endif
+
+    // write the records in sorted order, they are not needed in memory afterwards
+    for (i=0; i<es->nbuf; i++)
+    {
+        #ifdef _WIN32
+            if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #else
+            if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #endif
+        free(es->buf[i]);
+    }
+    // rewind so that _blk_read() starts from the first record
+#ifdef _WIN32
+    if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+    if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+
+    es->nbuf = 0;
+    es->mem  = 0;
+}
+
+/*
+ *  extsort_push() - add one record to the sorter
+ *  @dat:  the record; the pointer itself is stored and later freed by the
+ *         sorter when the buffer is flushed to disk
+ *
+ *  The buffer is flushed first if adding the record would exceed max_mem.
+ */
+void extsort_push(extsort_t *es, void *dat)
+{
+    // memory accounted per record: the payload plus its slot in the pointer array
+    int rec_mem = sizeof(void*) + es->dat_size;
+
+    if ( es->nbuf && es->mem + rec_mem > es->max_mem ) _buf_flush(es);
+
+    es->mem += rec_mem;
+    es->nbuf++;
+    hts_expand(void*, es->nbuf, es->mbuf, es->buf);
+    es->buf[es->nbuf-1] = dat;
+}
+
+// Read the next record of the block into blk->dat. Returns the number of bytes read,
+// or 0 if the block has no more records; in the latter case the file is closed and
+// blk->fd set to -1.
+static ssize_t _blk_read(extsort_t *es, blk_t *blk)
+{
+    // already read to the end and closed
+    if ( blk->fd==-1 ) return 0;
+
+    ssize_t nread;
+#ifdef _WIN32
+    nread = _read(blk->fd, blk->dat, es->dat_size);
+#else
+    nread = read(blk->fd, blk->dat, es->dat_size);
+#endif
+    if ( nread < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname);
+    if ( nread == 0 )
+    {
+        // end of file, the descriptor is no longer needed
+#ifdef _WIN32
+        if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#else
+        if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#endif
+        blk->fd = -1;
+        return nread;
+    }
+    // a partial record is treated as an error
+    if ( nread < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname);
+    return nread;
+}
+
+/*
+ *  extsort_sort() - finish the input phase: flush the remaining records,
+ *  release the in-memory buffer and prepare the heap from which
+ *  extsort_shift() draws the records in sorted order
+ */
+void extsort_sort(extsort_t *es)
+{
+    _buf_flush(es);
+    free(es->buf);
+    es->buf = NULL;
+    es->bhp = khp_init(blk);
+
+    // rewind every block and load its first record; blocks which yielded a record enter the heap
+    int iblk = 0;
+    while ( iblk < es->nblk )
+    {
+        blk_t *blk = es->blk[iblk++];
+#ifdef _WIN32
+        if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+        if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+        int nread = _blk_read(es, blk);
+        if ( !nread ) continue;
+        khp_insert(blk, es->bhp, &blk);
+    }
+}
+
+/*
+ *  extsort_shift() - return the next record in sorted order, or NULL when
+ *  all records were returned. The returned memory belongs to the sorter and
+ *  is reused by a later call.
+ */
+void *extsort_shift(extsort_t *es)
+{
+    if ( es->bhp->ndat == 0 ) return NULL;
+
+    // the block at the top of the heap holds the smallest record
+    blk_t *blk = es->bhp->dat[0];
+
+    // Hand the record over by exchanging the buffers, so that the read below
+    // fills the spare buffer and does not overwrite the record being returned
+    void *record = blk->dat;
+    blk->dat     = es->tmp_dat;
+    es->tmp_dat  = record;
+    khp_delete(blk, es->bhp);
+
+    // put the block back on the heap if it still had a record to give
+    int nread = _blk_read(es, blk);
+    if ( nread ) khp_insert(blk, es->bhp, &blk);
+
+    return es->tmp_dat;
+}
+
diff --git a/bcftools/extsort.c.pysam.c b/bcftools/extsort.c.pysam.c

new file mode 100644 (file)

index 0000000..1b410a7
--- /dev/null
+++ b/bcftools/extsort.c.pysam.c
@@ -0,0 +1,252 @@
+#include "bcftools.pysam.h"
+
+/*  extsort.c -- sort on disk
+
+   Copyright (C) 2020-2021 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <unistd.h>     // for unlink()
+#include <sys/stat.h>   // for chmod()
+#include <assert.h>
+#include <fcntl.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+#include "bcftools.h"
+#include "extsort.h"
+#include "kheap.h"
+
+// One sorted run written to a temporary file, together with the record most
+// recently read back from it.
+typedef struct
+{
+    extsort_t *es;  // this is to get access to extsort_cmp_f from kheap
+    int fd;         // descriptor of the temporary file; -1 once the file was read to its end and closed
+    char *fname;    // name of the temporary file, built from es->tmp_prefix
+    void *dat;      // buffer of es->dat_size bytes holding the current record
+}
+blk_t;
+
+// heap ordering function, defined below; declared here because KHEAP_INIT refers to it
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr);
+KHEAP_INIT(blk, blk_t*, blk_is_smaller)     /* defines khp_blk_t */
+
+// State of the sorter
+struct _extsort_t
+{
+    size_t dat_size, mem, max_mem;  // record size in bytes; memory accounted to buffered records; limit which triggers a flush to disk
+    char *tmp_prefix;               // template for the names of temporary files
+    extsort_cmp_f cmp;              // user-supplied comparison function
+
+    size_t nbuf, mbuf, nblk;        // number of records in buf; size of buf as tracked by hts_expand(); number of blocks
+    blk_t **blk;                    // the temporary blocks, one is added by each flush
+    void **buf, *tmp_dat;           // pointers to buffered records; spare record buffer exchanged with blk->dat in extsort_shift()
+    khp_blk_t *bhp;                 // heap of blocks, created by extsort_sort()
+};
+
+// Heap ordering: returns 1 if the current record of block *aptr sorts before the
+// current record of block *bptr according to the user comparison function, 0 otherwise.
+// The comparison function is given the addresses of the record pointers, the same
+// way qsort() calls it on the array of buffered record pointers.
+static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
+{
+    blk_t *lhs = *aptr, *rhs = *bptr;
+    return lhs->es->cmp(&lhs->dat,&rhs->dat) < 0 ? 1 : 0;
+}
+
+size_t parse_mem_string(const char *str);
+
+// Sets one option of the sorter.
+//   es    .. sorter obtained from extsort_alloc()
+//   key   .. which option to set
+//   value .. pointer to the value, its type depends on the key:
+//              DAT_SIZE   .. size_t*
+//              MAX_MEM    .. const char**, parsed by parse_mem_string()
+//              TMP_PREFIX .. const char**, processed by init_tmp_prefix()
+//              FUNC_CMP   .. extsort_cmp_f*
+// Unknown keys are silently ignored.
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value)
+{
+    switch (key)
+    {
+        case DAT_SIZE:
+            es->dat_size = *((size_t*)value);
+            break;
+
+        case MAX_MEM:
+            es->max_mem = parse_mem_string(*((const char**)value));
+            // max_mem is a size_t, zero is the only non-positive value it can hold
+            if ( es->max_mem==0 ) error("Could not parse the memory string, expected positive number: %s\n",*((const char**)value));
+            break;
+
+        case TMP_PREFIX:
+        {
+            // build the new prefix first, then release the previous one so that
+            // setting the option repeatedly does not leak memory
+            char *prefix = init_tmp_prefix(*((const char**)value));
+            free(es->tmp_prefix);
+            es->tmp_prefix = prefix;
+            break;
+        }
+
+        case FUNC_CMP:
+            es->cmp = *((extsort_cmp_f*)value);
+            break;
+
+        default:
+            break;
+    }
+}
+
+// Allocates a zero-initialized sorter and sets the default memory limit.
+// Options are then set with extsort_set() and the setup finished by extsort_init().
+extsort_t *extsort_alloc(void)
+{
+    extsort_t *es = (extsort_t*) calloc(1,sizeof(*es));
+    // do not dereference a failed allocation; error() is used as the fatal exit throughout this file
+    if ( !es ) error("Error: could not allocate %zu bytes\n", sizeof(*es));
+    es->max_mem = 100e6;    // default limit, replaced when MAX_MEM is set
+    return es;
+}
+// Finishes the setup once the options have been set. The comparison function and
+// the record size are mandatory, which is checked by assert() only. If no prefix
+// for temporary files was set, one is requested from init_tmp_prefix(NULL).
+// Also allocates the spare record buffer es->tmp_dat.
+void extsort_init(extsort_t *es)
+{
+    assert( es->cmp );
+    assert( es->dat_size );
+    if ( !es->tmp_prefix ) es->tmp_prefix = init_tmp_prefix(NULL);
+    es->tmp_dat = malloc(es->dat_size);
+}
+
+// Releases everything owned by the sorter: closes the temporary files which are
+// still open and frees the blocks, the spare record buffer, the prefix, the heap
+// and the sorter itself.
+void extsort_destroy(extsort_t *es)
+{
+    int i;
+    for (i=0; i<es->nblk; i++)
+    {
+        blk_t *blk = es->blk[i];
+        // fd is -1 if _blk_read() reached the end of the file and closed it already
+        if ( blk->fd!=-1 )
+#ifdef _WIN32
+            _close(blk->fd);
+#else
+            close(blk->fd);
+#endif
+        free(blk->fname);
+        free(blk->dat);
+        free(blk);
+    }
+    free(es->tmp_dat);
+    free(es->tmp_prefix);
+    free(es->blk);
+    // NOTE(review): es->bhp is set only in extsort_sort(); confirm that khp_destroy
+    // tolerates NULL if the sorter can be destroyed without having been sorted
+    khp_destroy(blk, es->bhp);
+    free(es);
+}
+
+static void _buf_flush(extsort_t *es)
+{
+    int i;
+    if ( !es->nbuf ) return;
+
+    qsort(es->buf, es->nbuf, sizeof(void*), es->cmp);
+
+    es->nblk++;
+    es->blk = (blk_t**) realloc(es->blk, sizeof(blk_t*)*es->nblk);
+    es->blk[es->nblk-1] = (blk_t*) calloc(1,sizeof(blk_t));
+    blk_t *blk = es->blk[es->nblk-1];
+    blk->es    = es;
+    blk->dat   = malloc(es->dat_size);
+    blk->fname = strdup(es->tmp_prefix);
+    #ifdef _WIN32
+        for (i=0; i<100000; i++)
+        {
+            memcpy(blk->fname,es->tmp_prefix,strlen(es->tmp_prefix));
+            mktemp(blk->fname);
+            blk->fd = _open(blk->fname, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600);
+            if ( blk->fd==-1 )
+            {
+                if ( errno==EEXIST ) continue; 
+                error("Error: failed to open a temporary file %s\n",blk->fname);
+            }
+            break;
+        }
+        if ( !blk->fd ) error("Error: failed to create a unique temporary file name from %s\n",es->tmp_prefix);
+        if ( _chmod(blk->fname, S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+    #else
+        if ( (blk->fd = mkstemp(blk->fname))==-1 )
+            error("Error: failed to open a temporary file %s\n",blk->fname);
+        if ( fchmod(blk->fd,S_IRUSR|S_IWUSR)!=0 ) error("Error: failed to set permissions of the temporary file %s\n",blk->fname);
+        unlink(blk->fname); // should auto delete when closed on linux, the descriptor remains open
+    #endif
+
+    for (i=0; i<es->nbuf; i++)
+    {
+        #ifdef _WIN32
+            if ( _write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #else
+            if ( write(blk->fd, es->buf[i], es->dat_size)!=es->dat_size ) error("Error: failed to write %zu bytes to the temporary file %s\n",es->dat_size,blk->fname);
+        #endif
+        free(es->buf[i]);
+    }
+#ifdef _WIN32
+    if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+    if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+
+    es->nbuf = 0;
+    es->mem  = 0;
+}
+
+// Adds one record to the in-memory buffer. If the buffer is not empty and the
+// new record would push the accounted memory over es->max_mem, the buffered
+// records are flushed to a temporary block first. The sorter keeps the pointer
+// dat and frees it when the buffer is flushed.
+void extsort_push(extsort_t *es, void *dat)
+{
+    // memory accounted to one record: the record itself and its slot in es->buf
+    int delta = sizeof(void*) + es->dat_size;
+
+    int over_limit = es->nbuf && es->mem + delta > es->max_mem;
+    if ( over_limit ) _buf_flush(es);
+
+    es->mem += delta;
+    es->nbuf++;
+    hts_expand(void*, es->nbuf, es->mbuf, es->buf);
+    es->buf[es->nbuf-1] = dat;
+}
+
+// Reads the next record of the block into blk->dat.
+// Returns the number of bytes read, which is es->dat_size for a complete record,
+// or 0 when the block has no more data. The first time the end of the file is
+// reached the file is closed and blk->fd set to -1; later calls return 0
+// immediately. Read errors and incomplete records are reported through error().
+static ssize_t _blk_read(extsort_t *es, blk_t *blk)
+{
+    ssize_t ret = 0;
+    if ( blk->fd==-1 ) return ret;
+#ifdef _WIN32
+    ret = _read(blk->fd, blk->dat, es->dat_size);
+#else
+    ret = read(blk->fd, blk->dat, es->dat_size);
+#endif
+    if ( ret < 0 ) error("Error: failed to read from the temporary file %s\n", blk->fname);
+    if ( ret == 0 )
+    {
+        // end of file, the block is exhausted
+#ifdef _WIN32
+        if ( _close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#else
+        if ( close(blk->fd)!=0 ) error("Error: failed to close the temporary file %s\n", blk->fname);
+#endif
+        blk->fd = -1;
+        return ret;
+    }
+    // ret is positive here; anything shorter than a whole record is treated as an error
+    if ( ret < es->dat_size ) error("Error: failed to read %zu bytes from the temporary file %s\n",es->dat_size,blk->fname);
+    return ret;
+}
+
+// Ends the input phase and prepares the merge. Records still buffered in memory
+// are written out as one more temporary block by _buf_flush(), the buffer is
+// released, and the heap es->bhp is filled with every block from which a first
+// record could be read. After this call records are retrieved with extsort_shift().
+void extsort_sort(extsort_t *es)
+{
+    _buf_flush(es);
+    free(es->buf);
+    es->buf = NULL;
+    es->bhp = khp_init(blk);
+
+    // open all blocks, read one record from each, create a heap
+    int i;
+    for (i=0; i<es->nblk; i++)
+    {
+        blk_t *blk = es->blk[i];
+        // rewind the block so that reading starts with its first record
+#ifdef _WIN32
+        if ( _lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#else
+        if ( lseek(blk->fd,0,SEEK_SET)!=0 ) error("Error: failed to lseek() to the start of the temporary file %s\n", blk->fname);
+#endif
+        int ret = _blk_read(es, blk);
+        // in the khp_* calls the first argument "blk" is the heap type name given to
+        // KHEAP_INIT, the local variable blk is passed by address as the element to insert
+        if ( ret ) khp_insert(blk, es->bhp, &blk);
+    }
+}
+
+// Returns the record at the top of the heap, or NULL when the heap is empty.
+// The returned buffer stays owned by the sorter: it becomes the spare buffer
+// and is handed to a block for reading on a later call.
+void *extsort_shift(extsort_t *es)
+{
+    if ( es->bhp->ndat==0 ) return NULL;
+
+    blk_t *top = es->bhp->dat[0];
+
+    // exchange the block's record buffer with the spare one so that the refill
+    // below cannot overwrite the record which is being returned
+    void *record = top->dat;
+    top->dat     = es->tmp_dat;
+    es->tmp_dat  = record;
+
+    // take the block off the heap and put it back only if it yields another record
+    khp_delete(blk, es->bhp);
+    int nread = _blk_read(es, top);
+    if ( nread ) khp_insert(blk, es->bhp, &top);
+
+    return record;
+}
+
diff --git a/bcftools/extsort.h b/bcftools/extsort.h

new file mode 100644 (file)

index 0000000..ba6282e
--- /dev/null
+++ b/bcftools/extsort.h
@@ -0,0 +1,56 @@
+/*  extsort.h -- sort on disk
+
+   Copyright (C) 2020 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#ifndef __EXTSORT_H__
+#define __EXTSORT_H__
+
+//todo: return status to all functions
+
+// Opaque handle of the sorter, the structure is defined in the implementation file
+typedef struct _extsort_t extsort_t;
+
+// Comparison function with the qsort() calling convention. The sorter keeps an
+// array of pointers to records and sorts that array, therefore aptr and bptr
+// are addresses of record pointers, not of the records themselves.
+typedef int (*extsort_cmp_f) (const void *aptr, const void *bptr);
+
+// Option keys for extsort_set(); each comment starts with the type of the value
+typedef enum
+{
+    DAT_SIZE,       // size_t        .. assuming constant size records for now
+    TMP_PREFIX,     // const char*   .. prefix of temporary files, XXXXXX will be appended
+    MAX_MEM,        // const char*   .. maximum memory to use, e.g. 100MB
+    FUNC_CMP,       // extsort_cmp_f .. sort function
+}
+extsort_opt_t;
+
+// Convenience wrapper around extsort_set(): stores value in a temporary of the
+// given type and passes the address of the temporary. Note that it expands to a
+// brace-enclosed block rather than a single statement, so it should not be used
+// as the unbraced body of an if/else.
+#define extsort_set_opt(es,type,key,value) { type tmp = value; extsort_set(es, key, (void*)&tmp); }
+
+// Typical order of calls: extsort_alloc, extsort_set (repeatedly), extsort_init,
+// extsort_push (repeatedly), extsort_sort, extsort_shift (until NULL), extsort_destroy.
+extsort_t *extsort_alloc(void);                                     // new sorter with default settings
+void extsort_set(extsort_t *es, extsort_opt_t key, void *value);    // value points to a variable of the type listed for the key
+void extsort_init(extsort_t *es);                                   // DAT_SIZE and FUNC_CMP must have been set
+void extsort_push(extsort_t *es, void *dat);    // dat will be freed by extsort later
+void extsort_sort(extsort_t *es);                                   // call after the last push and before the first shift
+void *extsort_shift(extsort_t *es);                                 // next record or NULL; the buffer is owned and later reused by the sorter
+void extsort_destroy(extsort_t *es);                                // closes temporary files and frees all memory
+
+#endif
diff --git a/bcftools/filter.c b/bcftools/filter.c

index ea60036d6c3f43cfb655519f3e60ea6a6944c563..3c451950f2113600222f8d280250cacf55759c1e 100644 (file)
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -1,6 +1,6 @@
  /*  filter.c -- filter expressions.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ THE SOFTWARE.  */
  #include <ctype.h>
  #include <stdlib.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <math.h>
  #include <sys/types.h>
@@ -56,27 +57,6 @@ static int filter_ninit = 0;
  #  define __FUNCTION__ __func__
  #endif
  
-static const uint64_t bcf_double_missing    = 0x7ff0000000000001;
-static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
-static inline void bcf_double_set(double *ptr, uint64_t value)
-{
-    union { uint64_t i; double d; } u;
-    u.i = value;
-    *ptr = u.d;
-}
-static inline int bcf_double_test(double d, uint64_t value)
-{
-    union { uint64_t i; double d; } u;
-    u.d = d;
-    return u.i==value ? 1 : 0;
-}
-#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
-#define bcf_double_set_missing(x)    bcf_double_set(&(x),bcf_double_missing)
-#define bcf_double_is_vector_end(x)  bcf_double_test((x),bcf_double_vector_end)
-#define bcf_double_is_missing(x)     bcf_double_test((x),bcf_double_missing)
-#define bcf_double_is_missing_or_vector_end(x)     (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
-
-
  typedef struct _token_t
  {
      // read-only values, same for all VCF lines
@@ -89,9 +69,9 @@ typedef struct _token_t
      int hdr_id, tag_type;   // BCF header lookup ID and one of BCF_HL_* types
      int idx;            // 0-based index to VCF vectors,
                          //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
-    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
      int nidxs, nuidxs;  // size of idxs array and the number of elements set to 1
-    uint8_t *usmpl;     // bitmask of used samples as set by idx
+    uint8_t *usmpl;     // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
      int nsamples;       // number of samples for format fields, 0 for info and other fields
      void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
      int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
@@ -158,11 +138,19 @@ struct _filter_t
  #define TOK_PHRED   29
  #define TOK_MEDIAN  30
  #define TOK_STDEV   31
-
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"
+#define TOK_sMAX    32
+#define TOK_sMIN    33
+#define TOK_sAVG    34
+#define TOK_sMEDIAN 35
+#define TOK_sSTDEV  36
+#define TOK_sSUM    37
+#define TOK_IN      38      // contains, e.g. FILTER~"A" 
+#define TOK_NOT_IN  39      // does not contain, e.g. FILTER!~"A" 
+
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
+//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s 
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"       // this is only for debugging, not maintained diligently
  
  // Return negative values if it is a function with variable number of arguments
  static int filters_next_token(char **str, int *len)
@@ -184,6 +172,20 @@ static int filters_next_token(char **str, int *len)
          tmp = *str;
      }
  
+    if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
+    if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
+    if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
+    if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
+    if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
+    if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
+    if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
+    if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
+    if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
+    if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
      if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
      if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
      if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
@@ -417,7 +419,7 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf
  static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int i;
-    if ( rtok->tok_type==TOK_NE )  // AND logic: none of the filters can match
+    if ( rtok->tok_type==TOK_NOT_IN )
      {
          if ( !line->d.n_flt )
          {
@@ -430,7 +432,7 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
          rtok->pass_site = 1;
          return;
      }
-    else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
+    else if ( rtok->tok_type==TOK_IN )
      {
          if ( !line->d.n_flt )
          {
@@ -441,8 +443,30 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
              if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
          return;
      }
+    else if ( rtok->tok_type==TOK_NE )  // exact match
+    {
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) return;   // missing value
+            rtok->pass_site = 1;
+            return; // no filter present, eval to true
+        }
+        if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return;    // exact match, fail iff a single matching value is present
+        rtok->pass_site = 1;
+        return;
+    }
+    else if ( rtok->tok_type==TOK_EQ )  // exact match, pass iff a single matching value is present
+    {
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+            return; // no filter present, eval to false
+        }
+        if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+        return;
+    }
      else 
-        error("Only == and != operators are supported for FILTER\n");
+        error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
      return;
  }
  static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
@@ -1036,54 +1060,46 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
          tok->nvalues = 0;
          return;
      }
-    if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
-
+    
      int j,nmissing = 0;
-    for (i=0; i<line->n_sample; i++)
-    {
-        int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
-        for (j=0; j<fmt->n; j++)
-        {
-            if ( ptr[j]==bcf_int8_vector_end ) break;
-            if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
-        }
+    #define BRANCH(type_t, is_vector_end) { \
+        for (i=0; i<line->n_sample; i++) \
+        { \
+            type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+            for (j=0; j<fmt->n; j++) \
+            { \
+                if ( ptr[j]==is_vector_end ) break; \
+                if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+            } \
+        } \
+    }
+    switch (fmt->type) {
+        case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break;
      }
+    #undef BRANCH
      tok->nvalues = 1;
      tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
  }
  static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    if ( nstack==0 ) error("Error parsing the expresion\n");
+    if ( nstack==0 ) error("Error parsing the expression\n");
      token_t *tok = stack[nstack - 1];
      if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
-
-    rtok->nsamples = tok->nsamples;
-    memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
-
      assert(tok->usmpl);
-    if ( !rtok->usmpl )
-    {
-        rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
-        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
-    }
  
      int i, npass = 0;
-    for (i=0; i<rtok->nsamples; i++)
+    for (i=0; i<tok->nsamples; i++)
      {
-        if ( !rtok->usmpl[i] ) continue;
-        if ( rtok->pass_samples[i] ) npass++;
+        if ( !tok->usmpl[i] ) continue;
+        if ( tok->pass_samples[i] ) npass++;
      }
-
-    hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values);
-    double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
-    rtok->nval1 = 1;
-    rtok->nvalues = rtok->nsamples;
-
-    // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats
-    // consider only the passing site AND samples. The values for failed samples is set to -1 so
-    // that it can never conflict with valid expressions.
-    for (i=0; i<rtok->nsamples; i++)
-        rtok->values[i] = rtok->pass_samples[i] ? value : -1;
+    hts_expand(double,1,rtok->mvalues,rtok->values);
+    rtok->nsamples = 0;
+    rtok->nvalues = 1;
+    rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
  
      return 1;
  }
@@ -1165,13 +1181,30 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = -HUGE_VAL;
-    int i, has_value = 0;
-    for (i=0; i<tok->nvalues; i++)
+    double *ptr, val = -HUGE_VAL;
+    int i,j, has_value = 0;
+    if ( tok->nsamples )
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        has_value = 1;
-        if ( val < tok->values[i] ) val = tok->values[i];
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                has_value = 1;
+                if ( val < ptr[j] ) val = ptr[j];
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            has_value = 1;
+            if ( val < tok->values[i] ) val = tok->values[i];
+        }
      }
      if ( has_value )
      {
@@ -1180,18 +1213,65 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample maximum: for the token on top of the stack computes one value per
+// sample, the largest of the sample's non-missing values, or missing if the
+// sample has none. Tokens without samples are passed on to func_max().
+static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_max(flt,line,rtok,stack,nstack);
+
+    // the result holds exactly one value per sample
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // the result uses the same subset of samples as the argument
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+
+        // values of one sample occupy nval1 consecutive elements
+        double *vals = tok->values + ismpl*tok->nval1;
+        double best  = -HUGE_VAL;
+        int k, nused = 0;
+        for (k=0; k<tok->nval1; k++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[k]) ) continue;
+            if ( best < vals[k] ) best = vals[k];
+            nused++;
+        }
+        if ( nused ) rtok->values[ismpl] = best;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
  static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = HUGE_VAL;
-    int i, has_value = 0;
-    for (i=0; i<tok->nvalues; i++)
+    double *ptr, val = HUGE_VAL;
+    int i,j, has_value = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                has_value = 1;
+                if ( val > ptr[j] ) val = ptr[j];
+            }
+        }
+    }
+    else
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        has_value = 1;
-        if ( val > tok->values[i] ) val = tok->values[i];
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            has_value = 1;
+            if ( val > tok->values[i] ) val = tok->values[i];
+        }
      }
      if ( has_value )
      {
@@ -1200,15 +1280,62 @@ static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample minimum: for the token on top of the stack computes one value per
+// sample, the smallest of the sample's non-missing values, or missing if the
+// sample has none. Tokens without samples are passed on to func_min().
+static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_min(flt,line,rtok,stack,nstack);
+
+    // the result holds exactly one value per sample
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // the result uses the same subset of samples as the argument
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+
+        // values of one sample occupy nval1 consecutive elements
+        double *vals = tok->values + ismpl*tok->nval1;
+        double best  = HUGE_VAL;
+        int k, nused = 0;
+        for (k=0; k<tok->nval1; k++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[k]) ) continue;
+            if ( best > vals[k] ) best = vals[k];
+            nused++;
+        }
+        if ( nused ) rtok->values[ismpl] = best;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
  static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = 0;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    double *ptr, val = 0;
+    int i,j, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                val += ptr[j];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+            if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    }
      if ( n )
      {
          rtok->values[0] = val / n;
@@ -1216,6 +1343,34 @@ static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample mean: for the token on top of the stack computes one value per
+// sample, the average of the sample's non-missing values, or missing if the
+// sample has none. Tokens without samples are passed on to func_avg().
+static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) return func_avg(flt,line,rtok,stack,nstack);
+
+    // the result holds exactly one value per sample
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1    = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // the result uses the same subset of samples as the argument
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<tok->nsamples; ismpl++)
+    {
+        if ( !rtok->usmpl[ismpl] ) continue;
+
+        // values of one sample occupy nval1 consecutive elements
+        double *vals = tok->values + ismpl*tok->nval1;
+        double sum   = 0;
+        int k, nused = 0;
+        for (k=0; k<tok->nval1; k++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(vals[k]) ) continue;
+            sum += vals[k];
+            nused++;
+        }
+        if ( nused ) rtok->values[ismpl] = sum / nused;
+        else bcf_double_set_missing(rtok->values[ismpl]);
+    }
+    return 1;
+}
  static int compare_doubles(const void *lhs, const void *rhs)
  {
      double arg1 = *(const double*) lhs;
@@ -1229,12 +1384,29 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
+    // sweep through all tok->values and while excluding all missing values reuse the very same array
+    int i,j,k = 0, n = 0;
+    if ( tok->nsamples )
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        if ( n < i ) tok->values[n] = tok->values[i];
-        n++;
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+                if ( n < k ) tok->values[n] = tok->values[k];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            if ( n < i ) tok->values[n] = tok->values[i];
+            n++;
+        }
      }
      if ( !n ) return 1;
      if ( n==1 ) rtok->values[0] = tok->values[0];
@@ -1246,40 +1418,149 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
      rtok->nvalues = 1;
      return 1;
  }
+/*
+ *  SMPL_MEDIAN() / sMEDIAN(): median computed separately for each sample.
+ *
+ *  The argument is the token on top of the stack, stack[nstack-1].
+ *    - If it has no per-sample layout (tok->nsamples==0), the work is handed
+ *      over to func_median(), which produces a single median over all values.
+ *    - Otherwise rtok receives one value per sample (nval1=1) and a copy of
+ *      the argument's usmpl mask. For every sample with usmpl[i] set, the
+ *      result is the median of its non-missing values, or the missing marker
+ *      if it has none. Entries of samples with usmpl[i]==0 are not written.
+ *
+ *  Side effect: each processed sample's slice of tok->values is compacted
+ *  and sorted in place.
+ *
+ *  Returns 1, like the other func_* callbacks in this part of the file.
+ */
+static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the plain median (not the average)
+    if ( !tok->nsamples ) return func_median(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;   // this sample's slice of nval1 values
+        // Move the non-missing values to the front of the slice; n counts them
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = ptr[0];
+        else
+        {
+            qsort(ptr, n, sizeof(double), compare_doubles);
+            // odd count: middle element; even count: mean of the two middle elements
+            rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
+        }
+    }
+    return 1;
+}
  static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
+    // Compact tok->values in place: drop missing and vector-end entries (and, for per-sample data, whole samples with usmpl[i]==0) so that the n remaining values sit at the front of the array
+    int i,j,k = 0, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }   // unused sample: step over its nval1 values
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+                if ( n < k ) tok->values[n] = tok->values[k];
+                n++;
+            }
+        }
+    }
+    else
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        if ( n < i ) tok->values[n] = tok->values[i];
-        n++;
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            if ( n < i ) tok->values[n] = tok->values[i];
+            n++;
+        }
      }
      if ( !n ) return 1;    // nothing left to work with: rtok->nvalues stays 0
      if ( n==1 ) rtok->values[0] = 0;    // a single value has no spread
      else
      {
          double sdev = 0, avg = 0;
-        for (i=0; i<n; i++) avg += tok->values[n];
+        for (i=0; i<n; i++) avg += tok->values[i];
          avg /= n;
-        for (i=0; i<n; i++) sdev += (tok->values[n] - avg) * (tok->values[n] - avg);
+        for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
          rtok->values[0] = sqrt(sdev/n);    // divides by n rather than n-1
      }
      rtok->nvalues = 1;
      return 1;
  }
+/*
+ *  SMPL_STDEV() / sSTDEV(): standard deviation computed separately for each
+ *  sample.
+ *
+ *  The argument is the token on top of the stack, stack[nstack-1].
+ *    - If it has no per-sample layout (tok->nsamples==0), the work is handed
+ *      over to func_stddev(), which produces a single value over all values.
+ *    - Otherwise rtok receives one value per sample (nval1=1) and a copy of
+ *      the argument's usmpl mask. For every sample with usmpl[i] set, the
+ *      result is the missing marker when the sample has no usable value, 0
+ *      when it has exactly one, and sqrt(sum((x-avg)^2)/n) otherwise.
+ *      Entries of samples with usmpl[i]==0 are not written.
+ *
+ *  Side effect: each processed sample's slice of tok->values is compacted
+ *  in place.
+ *
+ *  Returns 1, like the other func_* callbacks in this part of the file.
+ */
+static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the plain standard deviation (not the average)
+    if ( !tok->nsamples ) return func_stddev(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;   // this sample's slice of nval1 values
+        // Move the non-missing values to the front of the slice; n counts them
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = 0;            // a single value has no spread
+        else
+        {
+            double sdev = 0, avg = 0;
+            for (j=0; j<n; j++) avg += ptr[j];
+            avg /= n;
+            for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
+            rtok->values[i] = sqrt(sdev/n); // divides by n rather than n-1
+        }
+    }
+    return 1;
+}
  static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      rtok->nvalues = 0;
      token_t *tok = stack[nstack - 1];
      if ( !tok->nvalues ) return 1;
-    double val = 0;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    double *ptr, val = 0;
+    int i,j, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                val += ptr[j];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+            if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    }
      if ( n )
      {
          rtok->values[0] = val;
@@ -1287,39 +1568,104 @@ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+/*
+ *  SMPL_SUM() / sSUM(): sum computed separately for each sample.
+ *
+ *  The argument is the token on top of the stack, stack[nstack-1].
+ *    - If it has no per-sample layout (tok->nsamples==0), the work is handed
+ *      over to func_sum(), which produces a single sum over all values.
+ *    - Otherwise rtok receives one value per sample (nval1=1) and a copy of
+ *      the argument's usmpl mask. For every sample with usmpl[i] set, the
+ *      result is the sum of its non-missing values, or the missing marker
+ *      if it has none. Entries of samples with usmpl[i]==0 are not written.
+ *
+ *  Returns 1, like the other func_* callbacks in this part of the file.
+ */
+static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // No per-sample data: fall back to the plain sum (not the average)
+    if ( !tok->nsamples ) return func_sum(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, has_value;
+    double val, *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        val = 0;
+        has_value = 0;
+        ptr = tok->values + i*tok->nval1;   // this sample's slice of nval1 values
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            has_value = 1;
+            val += ptr[j];
+        }
+        // A sample without any usable value yields missing rather than 0
+        if ( has_value ) rtok->values[i] = val;
+        else bcf_double_set_missing(rtok->values[i]);
+    }
+    return 1;
+}
  static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
-
+    rtok->nsamples = tok->nsamples;    // the result mirrors the argument's layout: nsamples, nvalues, nval1
      rtok->nvalues = tok->nvalues;
+    rtok->nval1 = tok->nval1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    if ( tok->usmpl )
+    {
+        if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);    // carry the used-samples mask over to the result
+    }
      if ( !tok->nvalues ) return 1;
      hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);    // NOTE(review): same hts_expand call as a few lines above; looks redundant
-    int i;
-    for (i=0; i<tok->nvalues; i++)
-        if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
-        else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+    int i,j,k = 0;
+    if ( tok->usmpl )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }    // unused sample: step over its slice, rtok->values is not written
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);    // missing and vector-end both become missing
+                else rtok->values[k] = fabs(tok->values[k]);
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( tok->usmpl && !tok->usmpl[i] ) continue;    // NOTE(review): tok->usmpl is NULL in this else-branch, so this test never skips
+            if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+            else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);    // vector-end entries are left unwritten in rtok->values
+        }
+    }
      return 1;
  }
  static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
-    int i, cnt = 0;
-    if ( !tok->nsamples )
+    int i,j, cnt = 0;
+    if ( tok->tag && tok->nsamples )
      {
-        if ( tok->is_str )
+        // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
+        if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( tok->str_value.l ) cnt = 1;
-            for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+            if ( !tok->usmpl[i] ) continue;
+            double *ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+                if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
          }
-        else
-            cnt = tok->nvalues;
      }
-    else
+    else if ( tok->nsamples )
      {
+        // number of samples that pass a processed FMT tag
          for (i=0; i<tok->nsamples; i++)
              if ( tok->pass_samples[i] ) cnt++;
      }
+    else if ( tok->is_str )
+    {
+        if ( tok->str_value.l ) cnt = 1;
+        for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+    }
+    else
+        cnt = tok->nvalues;
  
      rtok->nvalues = 1;
      rtok->values[0] = cnt;
@@ -1531,11 +1877,27 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
      if ( !tok->nvalues ) return 1;
  
      hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
-    int i;
-    for (i=0; i<tok->nvalues; i++)
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
-        else rtok->values[i] = -4.34294481903*log(tok->values[i]);
-
+    int i,j,k = 0;
+    if ( tok->usmpl )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+                else rtok->values[k] = -4.34294481903*log(tok->values[k]);
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+            else rtok->values[i] = -4.34294481903*log(tok->values[i]);
+        }
+    }
      return 1;
  }
  inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
@@ -1555,7 +1917,8 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
          for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
          for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
      }
-    memset(rtok->pass_samples, 0, rtok->nsamples);
+    if (rtok->nsamples)
+        memset(rtok->pass_samples, 0, rtok->nsamples);
  }
  
  #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
@@ -1580,22 +1943,37 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
                  rtok->values[i] = atok->values[i] AOP btok->values[i]; \
              } \
          } \
+        else if ( atok->nsamples ) \
+        { \
+            assert( btok->nvalues==1 ); \
+            if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
+            { \
+                for (i=0; i<atok->nvalues; i++) \
+                { \
+                    if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
+                    { \
+                        bcf_double_set_missing(rtok->values[i]); \
+                        continue; \
+                    } \
+                    has_values = 1; \
+                    rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+                } \
+            } \
+        } \
          else \
          { \
-            token_t *xtok = atok->nsamples ? atok : btok; \
-            token_t *ytok = atok->nsamples ? btok : atok; \
-            assert( ytok->nvalues==1 ); \
-            if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \
+            assert( atok->nvalues==1 ); \
+            if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
              { \
-                for (i=0; i<xtok->nvalues; i++) \
+                for (i=0; i<btok->nvalues; i++) \
                  { \
-                    if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \
+                    if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
                      { \
                          bcf_double_set_missing(rtok->values[i]); \
                          continue; \
                      } \
                      has_values = 1; \
-                    rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
+                    rtok->values[i] = atok->values[0] AOP btok->values[i]; \
                  } \
              } \
          } \
@@ -1711,14 +2089,6 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
      return 2;
  }
  
-#define CMP_MISSING(atok,btok,CMP_OP,ret) \
-{ \
-    if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
-    token_t *tok = (atok)->is_missing ? (btok) : (atok); \
-    (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
-    tok->nvalues = 1; \
-}
-
  #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
  { \
      token_t *rtok = _rtok; \
@@ -1821,31 +2191,56 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
                  } \
              } \
          } \
-        else \
+        else if ( atok->nsamples )\
+        { \
+            for (i=0; i<atok->nsamples; i++) \
+            { \
+                if ( !rtok->usmpl[i] ) continue; \
+                double *aptr = atok->values + i*atok->nval1; \
+                double *bptr = btok->values + i*btok->nval1; \
+                for (j=0; j<atok->nval1; j++) \
+                { \
+                    int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
+                    if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+                    for (k=0; k<btok->nvalues; k++) \
+                    { \
+                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
+                        if ( nmiss ) \
+                        { \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                        } \
+                        else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
+                        { \
+                            if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                        } \
+                        else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                    } \
+                } \
+            } \
+        } \
+        else /* btok->nsamples */ \
          { \
-            token_t *xtok = atok->nsamples ? atok : btok; \
-            token_t *ytok = atok->nsamples ? btok : atok; \
-            for (i=0; i<xtok->nsamples; i++) \
+            for (i=0; i<btok->nsamples; i++) \
              { \
                  if ( !rtok->usmpl[i] ) continue; \
-                double *xptr = xtok->values + i*xtok->nval1; \
-                double *yptr = ytok->values + i*ytok->nval1; \
-                for (j=0; j<xtok->nval1; j++) \
+                double *aptr = atok->values + i*atok->nval1; \
+                double *bptr = btok->values + i*btok->nval1; \
+                for (j=0; j<btok->nval1; j++) \
                  { \
-                    int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \
+                    int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
                      if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
-                    for (k=0; k<ytok->nvalues; k++) \
+                    for (k=0; k<atok->nvalues; k++) \
                      { \
-                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \
+                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
                          if ( nmiss ) \
                          { \
-                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                          } \
-                        else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \
+                        else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
                          { \
-                            if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                            if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                          } \
-                        else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                        else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                      } \
                  } \
              } \
@@ -2344,7 +2739,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          {
              int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
              is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
-            if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s);
+            if ( is_info && is_fmt )
+                error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
          }
          if ( is_fmt==-1 ) is_fmt = 0;
      }
@@ -2833,6 +3229,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      // Additionally, treat "." as missing value rather than a string in numeric equalities; that
      // @file is only used with ID; etc.
      // This code is fragile: improve me.
+    static int comma_separator_warned = 0;
      int i;
      for (i=0; i<nout; i++)
      {
@@ -2883,6 +3280,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              if ( regcomp(out[j].regex, out[j].key, cflags) )
                  error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
          }
+        if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
+        {
+            int print_note = 0;
+            if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
+            else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
+            if ( print_note && !comma_separator_warned )
+            {
+                comma_separator_warned = 1;
+                fprintf(stderr,
+                    "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
+                    "         (Search the manual for \"Comma in strings\" to learn more.)\n");
+            }
+        }
          if ( out[i].tok_type!=TOK_VAL ) continue;
          if ( !out[i].tag ) continue;
          if ( out[i].setter==filters_set_type )
@@ -2939,11 +3349,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
              int itok = i, ival;
              if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
-            else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
-            else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+            else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
+            else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
              else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
-            else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
-            else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+            else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
+            else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
              else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
              if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
                  error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
@@ -2976,6 +3386,12 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
          else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
          else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
          else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
          hts_expand0(double,1,out[i].mvalues,out[i].values);
          if ( filter->nsamples )
          {
@@ -3151,3 +3567,32 @@ int filter_max_unpack(filter_t *flt)
  {
      return flt->max_unpack;
  }
+
+// Expose the numeric values held by the first token of filter->flt_stack.
+//   nval  .. set to the number of values in the returned array
+//   nval1 .. set to the token's nval1, or to 1 when the token holds no values
+// When the token holds no values, its pass_site flag is stored as the only
+// value and returned instead. The returned array belongs to the token.
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
+{
+    token_t *top = filter->flt_stack[0];
+
+    // The token carries values: hand them out unchanged
+    if ( top->nvalues )
+    {
+        *nval  = top->nvalues;
+        *nval1 = top->nval1;
+        return top->values;
+    }
+
+    // No values: report the pass_site flag as a single value
+    if ( !top->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
+    *nval  = 1;
+    *nval1 = 1;
+    top->values[0] = top->pass_site;
+    return top->values;
+}
+
+// Overwrite the usmpl mask of every entry of filter->filters that has a
+// non-zero nsamples with the first nsamples bytes of `samples`.
+void filter_set_samples(filter_t *filter, const uint8_t *samples)
+{
+    int itok, ismpl;
+    for (itok = 0; itok < filter->nfilters; itok++)
+    {
+        // An entry with nsamples==0 simply yields an empty inner loop
+        for (ismpl = 0; ismpl < filter->filters[itok].nsamples; ismpl++)
+            filter->filters[itok].usmpl[ismpl] = samples[ismpl];
+    }
+}
+
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c

index 2d1987ab5cbc86863a1be8d08017442bf4165b19..8832633280c94f0de1f327a1b238357d2271f819 100644 (file)
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  filter.c -- filter expressions.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ THE SOFTWARE.  */
  #include <ctype.h>
  #include <stdlib.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <math.h>
  #include <sys/types.h>
@@ -58,27 +59,6 @@ static int filter_ninit = 0;
  #  define __FUNCTION__ __func__
  #endif
  
-static const uint64_t bcf_double_missing    = 0x7ff0000000000001;
-static const uint64_t bcf_double_vector_end = 0x7ff0000000000002;
-static inline void bcf_double_set(double *ptr, uint64_t value)
-{
-    union { uint64_t i; double d; } u;
-    u.i = value;
-    *ptr = u.d;
-}
-static inline int bcf_double_test(double d, uint64_t value)
-{
-    union { uint64_t i; double d; } u;
-    u.d = d;
-    return u.i==value ? 1 : 0;
-}
-#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
-#define bcf_double_set_missing(x)    bcf_double_set(&(x),bcf_double_missing)
-#define bcf_double_is_vector_end(x)  bcf_double_test((x),bcf_double_vector_end)
-#define bcf_double_is_missing(x)     bcf_double_test((x),bcf_double_missing)
-#define bcf_double_is_missing_or_vector_end(x)     (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end))
-
-
  typedef struct _token_t
  {
      // read-only values, same for all VCF lines
@@ -91,9 +71,9 @@ typedef struct _token_t
      int hdr_id, tag_type;   // BCF header lookup ID and one of BCF_HL_* types
      int idx;            // 0-based index to VCF vectors,
                          //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
-    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
      int nidxs, nuidxs;  // size of idxs array and the number of elements set to 1
-    uint8_t *usmpl;     // bitmask of used samples as set by idx
+    uint8_t *usmpl;     // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
      int nsamples;       // number of samples for format fields, 0 for info and other fields
      void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
      int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
@@ -160,11 +140,19 @@ struct _filter_t
  #define TOK_PHRED   29
  #define TOK_MEDIAN  30
  #define TOK_STDEV   31
-
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"
+#define TOK_sMAX    32
+#define TOK_sMIN    33
+#define TOK_sAVG    34
+#define TOK_sMEDIAN 35
+#define TOK_sSTDEV  36
+#define TOK_sSUM    37
+#define TOK_IN      38      // contains, e.g. FILTER~"A" 
+#define TOK_NOT_IN  39      // does not contain, e.g. FILTER!~"A" 
+
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
+//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p  b  P  i  s 
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis"       // this is only for debugging, not maintained diligently
  
  // Return negative values if it is a function with variable number of arguments
  static int filters_next_token(char **str, int *len)
@@ -186,6 +174,20 @@ static int filters_next_token(char **str, int *len)
          tmp = *str;
      }
  
+    if ( !strncasecmp(tmp,"SMPL_MAX(",9) ) { (*str) += 8; return TOK_sMAX; }
+    if ( !strncasecmp(tmp,"SMPL_MIN(",9) ) { (*str) += 8; return TOK_sMIN; }
+    if ( !strncasecmp(tmp,"SMPL_MEAN(",10) ) { (*str) += 9; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"SMPL_MEDIAN(",12) ) { (*str) += 11; return TOK_sMEDIAN; }
+    if ( !strncasecmp(tmp,"SMPL_AVG(",9) ) { (*str) += 8; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"SMPL_STDEV(",11) ) { (*str) += 10; return TOK_sSTDEV; }
+    if ( !strncasecmp(tmp,"SMPL_SUM(",9) ) { (*str) += 8; return TOK_sSUM; }
+    if ( !strncasecmp(tmp,"sMAX(",5) ) { (*str) += 4; return TOK_sMAX; }
+    if ( !strncasecmp(tmp,"sMIN(",5) ) { (*str) += 4; return TOK_sMIN; }
+    if ( !strncasecmp(tmp,"sMEAN(",6) ) { (*str) += 5; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"sMEDIAN(",8) ) { (*str) += 7; return TOK_sMEDIAN; }
+    if ( !strncasecmp(tmp,"sAVG(",5) ) { (*str) += 4; return TOK_sAVG; }
+    if ( !strncasecmp(tmp,"sSTDEV(",7) ) { (*str) += 6; return TOK_sSTDEV; }
+    if ( !strncasecmp(tmp,"sSUM(",5) ) { (*str) += 4; return TOK_sSUM; }
      if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; }
      if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; }
      if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; }
@@ -419,7 +421,7 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf
  static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int i;
-    if ( rtok->tok_type==TOK_NE )  // AND logic: none of the filters can match
+    if ( rtok->tok_type==TOK_NOT_IN )
      {
          if ( !line->d.n_flt )
          {
@@ -432,7 +434,7 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
          rtok->pass_site = 1;
          return;
      }
-    else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
+    else if ( rtok->tok_type==TOK_IN )
      {
          if ( !line->d.n_flt )
          {
@@ -443,8 +445,30 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1
              if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
          return;
      }
+    else if ( rtok->tok_type==TOK_NE )  // exact match
+    {
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) return;   // missing value
+            rtok->pass_site = 1;
+            return; // no filter present, eval to true
+        }
+        if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return;    // exact match, fail iff a single matching value is present
+        rtok->pass_site = 1;
+        return;
+    }
+    else if ( rtok->tok_type==TOK_EQ )  // exact match, pass iff a single matching value is present
+    {
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+            return; // no filter present, eval to false
+        }
+        if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+        return;
+    }
      else 
-        error("Only == and != operators are supported for FILTER\n");
+        error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
      return;
  }
  static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
@@ -516,7 +540,7 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value)
          case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
          case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
          case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
-        default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
+        default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); bcftools_exit(1); break;
      }
      #undef BRANCH
      return -1;  // this shouldn't happen
@@ -1038,54 +1062,46 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok)
          tok->nvalues = 0;
          return;
      }
-    if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n");
-
+    
      int j,nmissing = 0;
-    for (i=0; i<line->n_sample; i++)
-    {
-        int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size);
-        for (j=0; j<fmt->n; j++)
-        {
-            if ( ptr[j]==bcf_int8_vector_end ) break;
-            if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; }
-        }
+    #define BRANCH(type_t, is_vector_end) { \
+        for (i=0; i<line->n_sample; i++) \
+        { \
+            type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \
+            for (j=0; j<fmt->n; j++) \
+            { \
+                if ( ptr[j]==is_vector_end ) break; \
+                if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \
+            } \
+        } \
+    }
+    switch (fmt->type) {
+        case BCF_BT_INT8:  BRANCH(int8_t,  bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        default: fprintf(bcftools_stderr,"todo: type %d\n", fmt->type); bcftools_exit(1); break;
      }
+    #undef BRANCH
      tok->nvalues = 1;
      tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample;
  }
  static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    if ( nstack==0 ) error("Error parsing the expresion\n");
+    if ( nstack==0 ) error("Error parsing the expression\n");
      token_t *tok = stack[nstack - 1];
      if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag);
-
-    rtok->nsamples = tok->nsamples;
-    memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples));
-
      assert(tok->usmpl);
-    if ( !rtok->usmpl )
-    {
-        rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl));
-        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl));
-    }
  
      int i, npass = 0;
-    for (i=0; i<rtok->nsamples; i++)
+    for (i=0; i<tok->nsamples; i++)
      {
-        if ( !rtok->usmpl[i] ) continue;
-        if ( rtok->pass_samples[i] ) npass++;
+        if ( !tok->usmpl[i] ) continue;
+        if ( tok->pass_samples[i] ) npass++;
      }
-
-    hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values);
-    double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
-    rtok->nval1 = 1;
-    rtok->nvalues = rtok->nsamples;
-
-    // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats
-    // consider only the passing site AND samples. The values for failed samples is set to -1 so
-    // that it can never conflict with valid expressions.
-    for (i=0; i<rtok->nsamples; i++)
-        rtok->values[i] = rtok->pass_samples[i] ? value : -1;
+    hts_expand(double,1,rtok->mvalues,rtok->values);
+    rtok->nsamples = 0;
+    rtok->nvalues = 1;
+    rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0);
  
      return 1;
  }
@@ -1167,13 +1183,30 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = -HUGE_VAL;
-    int i, has_value = 0;
-    for (i=0; i<tok->nvalues; i++)
+    double *ptr, val = -HUGE_VAL;
+    int i,j, has_value = 0;
+    if ( tok->nsamples )
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        has_value = 1;
-        if ( val < tok->values[i] ) val = tok->values[i];
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                has_value = 1;
+                if ( val < ptr[j] ) val = ptr[j];
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            has_value = 1;
+            if ( val < tok->values[i] ) val = tok->values[i];
+        }
      }
      if ( has_value )
      {
@@ -1182,18 +1215,65 @@ static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample MAX. If the argument token (top of the stack) carries no per-sample
+// data, the call is forwarded to func_max. Otherwise rtok receives one value per
+// sample (nval1==1): the largest non-missing value of that sample, or the missing
+// marker if the sample has none. Samples with usmpl[i]==0 are skipped and their
+// slot in rtok->values is left as it was. Always returns 1.
+static int func_smpl_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( src->nsamples==0 ) return func_max(flt,line,rtok,stack,nstack);
+
+    // the result has exactly one value per sample
+    rtok->nval1    = 1;
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues  = src->nsamples;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the sample mask from the argument
+    assert(src->usmpl);
+    if ( rtok->usmpl==NULL ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( rtok->usmpl[ismpl] )
+        {
+            double *vals = src->values + ismpl*src->nval1;
+            double best  = -HUGE_VAL;
+            int ival, nfound = 0;
+            for (ival=0; ival<src->nval1; ival++)
+            {
+                if ( !bcf_double_is_missing_or_vector_end(vals[ival]) )
+                {
+                    nfound = 1;
+                    if ( best < vals[ival] ) best = vals[ival];
+                }
+            }
+            if ( !nfound ) bcf_double_set_missing(rtok->values[ismpl]);
+            else rtok->values[ismpl] = best;
+        }
+    }
+    return 1;
+}
  static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = HUGE_VAL;
-    int i, has_value = 0;
-    for (i=0; i<tok->nvalues; i++)
+    double *ptr, val = HUGE_VAL;
+    int i,j, has_value = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                has_value = 1;
+                if ( val > ptr[j] ) val = ptr[j];
+            }
+        }
+    }
+    else
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        has_value = 1;
-        if ( val > tok->values[i] ) val = tok->values[i];
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            has_value = 1;
+            if ( val > tok->values[i] ) val = tok->values[i];
+        }
      }
      if ( has_value )
      {
@@ -1202,15 +1282,62 @@ static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample MIN. If the argument token (top of the stack) carries no per-sample
+// data, the call is forwarded to func_min. Otherwise rtok receives one value per
+// sample (nval1==1): the smallest non-missing value of that sample, or the missing
+// marker if the sample has none. Samples with usmpl[i]==0 are skipped and their
+// slot in rtok->values is left as it was. Always returns 1.
+static int func_smpl_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( src->nsamples==0 ) return func_min(flt,line,rtok,stack,nstack);
+
+    // the result has exactly one value per sample
+    rtok->nval1    = 1;
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues  = src->nsamples;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the sample mask from the argument
+    assert(src->usmpl);
+    if ( rtok->usmpl==NULL ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( rtok->usmpl[ismpl] )
+        {
+            double *vals  = src->values + ismpl*src->nval1;
+            double lowest = HUGE_VAL;
+            int ival, nfound = 0;
+            for (ival=0; ival<src->nval1; ival++)
+            {
+                if ( !bcf_double_is_missing_or_vector_end(vals[ival]) )
+                {
+                    nfound = 1;
+                    if ( lowest > vals[ival] ) lowest = vals[ival];
+                }
+            }
+            if ( !nfound ) bcf_double_set_missing(rtok->values[ismpl]);
+            else rtok->values[ismpl] = lowest;
+        }
+    }
+    return 1;
+}
  static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    double val = 0;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    double *ptr, val = 0;
+    int i,j, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                val += ptr[j];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+            if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    }
      if ( n )
      {
          rtok->values[0] = val / n;
@@ -1218,6 +1345,34 @@ static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample AVG. If the argument token (top of the stack) carries no per-sample
+// data, the call is forwarded to func_avg. Otherwise rtok receives one value per
+// sample (nval1==1): the arithmetic mean of that sample's non-missing values, or
+// the missing marker if the sample has none. Samples with usmpl[i]==0 are skipped
+// and their slot in rtok->values is left as it was. Always returns 1.
+static int func_smpl_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *src = stack[nstack - 1];
+    if ( src->nsamples==0 ) return func_avg(flt,line,rtok,stack,nstack);
+
+    // the result has exactly one value per sample
+    rtok->nval1    = 1;
+    rtok->nsamples = src->nsamples;
+    rtok->nvalues  = src->nsamples;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+
+    // inherit the sample mask from the argument
+    assert(src->usmpl);
+    if ( rtok->usmpl==NULL ) rtok->usmpl = (uint8_t*) malloc(src->nsamples);
+    memcpy(rtok->usmpl, src->usmpl, src->nsamples);
+
+    int ismpl;
+    for (ismpl=0; ismpl<src->nsamples; ismpl++)
+    {
+        if ( rtok->usmpl[ismpl] )
+        {
+            double *vals = src->values + ismpl*src->nval1;
+            double total = 0;
+            int ival, nused = 0;
+            for (ival=0; ival<src->nval1; ival++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(vals[ival]) ) continue;
+                total += vals[ival];
+                nused++;
+            }
+            if ( nused==0 ) bcf_double_set_missing(rtok->values[ismpl]);
+            else rtok->values[ismpl] = total / nused;
+        }
+    }
+    return 1;
+}
  static int compare_doubles(const void *lhs, const void *rhs)
  {
      double arg1 = *(const double*) lhs;
@@ -1231,12 +1386,29 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
+    // sweep through all tok->values and while excluding all missing values reuse the very same array
+    int i,j,k = 0, n = 0;
+    if ( tok->nsamples )
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        if ( n < i ) tok->values[n] = tok->values[i];
-        n++;
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+                if ( n < k ) tok->values[n] = tok->values[k];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            if ( n < i ) tok->values[n] = tok->values[i];
+            n++;
+        }
      }
      if ( !n ) return 1;
      if ( n==1 ) rtok->values[0] = tok->values[0];
@@ -1248,40 +1420,149 @@ static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
      rtok->nvalues = 1;
      return 1;
  }
+// Per-sample MEDIAN. The argument is the token on top of the stack.
+//  - If the argument carries no per-sample data (tok->nsamples==0), the call is
+//    forwarded to func_median so that a single site-level median is produced.
+//  - Otherwise rtok receives one value per sample (nval1==1): the median of that
+//    sample's non-missing values, or the missing marker if the sample has none.
+// Samples with usmpl[i]==0 are skipped; their slot in rtok->values is not written.
+// Side effect: the non-missing values of each used sample are compacted to the
+// front of that sample's slice of tok->values and sorted there, i.e. the argument
+// token's values are modified.
+// Always returns 1 (presumably the number of consumed stack tokens - confirm
+// against the caller).
+static int func_smpl_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // no per-sample dimension: fall back to the site-level median, not the average
+    if ( !tok->nsamples ) return func_median(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // drop missing values by shifting the remaining ones to the front of the slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = ptr[0];
+        else
+        {
+            qsort(ptr, n, sizeof(double), compare_doubles);
+            // odd count: the middle element; even count: mean of the two middle elements
+            rtok->values[i] = n % 2 ? ptr[n/2] : (ptr[n/2-1] + ptr[n/2]) * 0.5;
+        }
+    }
+    return 1;
+}
  static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      rtok->nvalues = 0;
      if ( !tok->nvalues ) return 1;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
+    // sweep through all tok->values and while excluding all missing values reuse the very same array
+    int i,j,k = 0, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k += tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) continue;
+                if ( n < k ) tok->values[n] = tok->values[k];
+                n++;
+            }
+        }
+    }
+    else
      {
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
-        if ( n < i ) tok->values[n] = tok->values[i];
-        n++;
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue;
+            if ( n < i ) tok->values[n] = tok->values[i];
+            n++;
+        }
      }
      if ( !n ) return 1;
      if ( n==1 ) rtok->values[0] = 0;
      else
      {
          double sdev = 0, avg = 0;
-        for (i=0; i<n; i++) avg += tok->values[n];
+        for (i=0; i<n; i++) avg += tok->values[i];
          avg /= n;
-        for (i=0; i<n; i++) sdev += (tok->values[n] - avg) * (tok->values[n] - avg);
+        for (i=0; i<n; i++) sdev += (tok->values[i] - avg) * (tok->values[i] - avg);
          rtok->values[0] = sqrt(sdev/n);
      }
      rtok->nvalues = 1;
      return 1;
  }
+// Per-sample STDEV. The argument is the token on top of the stack.
+//  - If the argument carries no per-sample data (tok->nsamples==0), the call is
+//    forwarded to func_stddev so that a single site-level value is produced.
+//  - Otherwise rtok receives one value per sample (nval1==1): sqrt(sum((x-avg)^2)/n)
+//    over that sample's n non-missing values, 0 when n==1, or the missing marker
+//    when n==0.
+// Samples with usmpl[i]==0 are skipped; their slot in rtok->values is not written.
+// Side effect: the non-missing values of each used sample are compacted to the
+// front of that sample's slice of tok->values, i.e. the argument token's values
+// are modified.
+// Always returns 1 (presumably the number of consumed stack tokens - confirm
+// against the caller).
+static int func_smpl_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // no per-sample dimension: fall back to the site-level standard deviation, not the average
+    if ( !tok->nsamples ) return func_stddev(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, n;
+    double *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        n = 0;
+        ptr = tok->values + i*tok->nval1;
+        // drop missing values by shifting the remaining ones to the front of the slice
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            if ( n < j ) ptr[n] = ptr[j];
+            n++;
+        }
+        if ( n==0 )
+            bcf_double_set_missing(rtok->values[i]);
+        else if ( n==1 )
+            rtok->values[i] = 0;
+        else
+        {
+            double sdev = 0, avg = 0;
+            for (j=0; j<n; j++) avg += ptr[j];
+            avg /= n;
+            for (j=0; j<n; j++) sdev += (ptr[j] - avg) * (ptr[j] - avg);
+            rtok->values[i] = sqrt(sdev/n);
+        }
+    }
+    return 1;
+}
  static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      rtok->nvalues = 0;
      token_t *tok = stack[nstack - 1];
      if ( !tok->nvalues ) return 1;
-    double val = 0;
-    int i, n = 0;
-    for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    double *ptr, val = 0;
+    int i,j, n = 0;
+    if ( tok->nsamples )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) continue;
+            ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+                val += ptr[j];
+                n++;
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+            if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; }
+    }
      if ( n )
      {
          rtok->values[0] = val;
@@ -1289,39 +1570,104 @@ static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack,
      }
      return 1;
  }
+// Per-sample SUM. The argument is the token on top of the stack.
+//  - If the argument carries no per-sample data (tok->nsamples==0), the call is
+//    forwarded to func_sum so that a single site-level sum is produced.
+//  - Otherwise rtok receives one value per sample (nval1==1): the sum of that
+//    sample's non-missing values, or the missing marker if the sample has none.
+// Samples with usmpl[i]==0 are skipped; their slot in rtok->values is not written.
+// Always returns 1 (presumably the number of consumed stack tokens - confirm
+// against the caller).
+static int func_smpl_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    // no per-sample dimension: fall back to the site-level sum, not the average
+    if ( !tok->nsamples ) return func_sum(flt,line,rtok,stack,nstack);
+    rtok->nsamples = tok->nsamples;
+    rtok->nvalues  = tok->nsamples;
+    rtok->nval1 = 1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    assert(tok->usmpl);
+    if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+    memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    int i, j, has_value;
+    double val, *ptr;
+    for (i=0; i<tok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        val = 0;
+        has_value = 0;   // distinguishes "all values missing" from a genuine sum of 0
+        ptr = tok->values + i*tok->nval1;
+        for (j=0; j<tok->nval1; j++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(ptr[j]) ) continue;
+            has_value = 1;
+            val += ptr[j];
+        }
+        if ( has_value ) rtok->values[i] = val;
+        else bcf_double_set_missing(rtok->values[i]);
+    }
+    return 1;
+}
  static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
      if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
-
+    rtok->nsamples = tok->nsamples;
      rtok->nvalues = tok->nvalues;
+    rtok->nval1 = tok->nval1;
+    hts_expand(double,rtok->nvalues,rtok->mvalues,rtok->values);
+    if ( tok->usmpl )
+    {
+        if ( !rtok->usmpl ) rtok->usmpl = (uint8_t*) malloc(tok->nsamples);
+        memcpy(rtok->usmpl, tok->usmpl, tok->nsamples);
+    }
      if ( !tok->nvalues ) return 1;
      hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
-    int i;
-    for (i=0; i<tok->nvalues; i++)
-        if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
-        else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+    int i,j,k = 0;
+    if ( tok->usmpl )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+                else rtok->values[k] = fabs(tok->values[k]);
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( tok->usmpl && !tok->usmpl[i] ) continue;
+            if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+            else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]);
+        }
+    }
      return 1;
  }
  static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
      token_t *tok = stack[nstack - 1];
-    int i, cnt = 0;
-    if ( !tok->nsamples )
+    int i,j, cnt = 0;
+    if ( tok->tag && tok->nsamples )
      {
-        if ( tok->is_str )
+        // raw number of values in a FMT tag, e.g. COUNT(FMT/TAG)
+        if ( tok->is_str ) error("todo: Type=String for COUNT on FORMAT fields?\n");
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( tok->str_value.l ) cnt = 1;
-            for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+            if ( !tok->usmpl[i] ) continue;
+            double *ptr = tok->values + i*tok->nval1;
+            for (j=0; j<tok->nval1; j++)
+                if ( !bcf_double_is_missing_or_vector_end(ptr[j]) ) cnt++;
          }
-        else
-            cnt = tok->nvalues;
      }
-    else
+    else if ( tok->nsamples )
      {
+        // number of samples that pass a processed FMT tag
          for (i=0; i<tok->nsamples; i++)
              if ( tok->pass_samples[i] ) cnt++;
      }
+    else if ( tok->is_str )
+    {
+        if ( tok->str_value.l ) cnt = 1;
+        for (i=0; i<tok->str_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++;
+    }
+    else
+        cnt = tok->nvalues;
  
      rtok->nvalues = 1;
      rtok->values[0] = cnt;
@@ -1533,11 +1879,27 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac
      if ( !tok->nvalues ) return 1;
  
      hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
-    int i;
-    for (i=0; i<tok->nvalues; i++)
-        if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
-        else rtok->values[i] = -4.34294481903*log(tok->values[i]);
-
+    int i,j,k = 0;
+    if ( tok->usmpl )
+    {
+        for (i=0; i<tok->nsamples; i++)
+        {
+            if ( !tok->usmpl[i] ) { k+= tok->nval1; continue; }
+            for (j=0; j<tok->nval1; k++,j++)
+            {
+                if ( bcf_double_is_missing_or_vector_end(tok->values[k]) ) bcf_double_set_missing(rtok->values[k]);
+                else rtok->values[k] = -4.34294481903*log(tok->values[k]);
+            }
+        }
+    }
+    else
+    {
+        for (i=0; i<tok->nvalues; i++)
+        {
+            if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+            else rtok->values[i] = -4.34294481903*log(tok->values[i]);
+        }
+    }
      return 1;
  }
  inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
@@ -1557,7 +1919,8 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
          for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
          for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
      }
-    memset(rtok->pass_samples, 0, rtok->nsamples);
+    if (rtok->nsamples)
+        memset(rtok->pass_samples, 0, rtok->nsamples);
  }
  
  #define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
@@ -1582,22 +1945,37 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
                  rtok->values[i] = atok->values[i] AOP btok->values[i]; \
              } \
          } \
+        else if ( atok->nsamples ) \
+        { \
+            assert( btok->nvalues==1 ); \
+            if ( !bcf_double_is_missing_or_vector_end(btok->values[0]) ) \
+            { \
+                for (i=0; i<atok->nvalues; i++) \
+                { \
+                    if ( bcf_double_is_missing_or_vector_end(atok->values[i]) ) \
+                    { \
+                        bcf_double_set_missing(rtok->values[i]); \
+                        continue; \
+                    } \
+                    has_values = 1; \
+                    rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+                } \
+            } \
+        } \
          else \
          { \
-            token_t *xtok = atok->nsamples ? atok : btok; \
-            token_t *ytok = atok->nsamples ? btok : atok; \
-            assert( ytok->nvalues==1 ); \
-            if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \
+            assert( atok->nvalues==1 ); \
+            if ( !bcf_double_is_missing_or_vector_end(atok->values[0]) ) \
              { \
-                for (i=0; i<xtok->nvalues; i++) \
+                for (i=0; i<btok->nvalues; i++) \
                  { \
-                    if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \
+                    if ( bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
                      { \
                          bcf_double_set_missing(rtok->values[i]); \
                          continue; \
                      } \
                      has_values = 1; \
-                    rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
+                    rtok->values[i] = atok->values[0] AOP btok->values[i]; \
                  } \
              } \
          } \
@@ -1713,14 +2091,6 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
      return 2;
  }
  
-#define CMP_MISSING(atok,btok,CMP_OP,ret) \
-{ \
-    if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
-    token_t *tok = (atok)->is_missing ? (btok) : (atok); \
-    (ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
-    tok->nvalues = 1; \
-}
-
  #define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
  { \
      token_t *rtok = _rtok; \
@@ -1823,31 +2193,56 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
                  } \
              } \
          } \
-        else \
+        else if ( atok->nsamples )\
+        { \
+            for (i=0; i<atok->nsamples; i++) \
+            { \
+                if ( !rtok->usmpl[i] ) continue; \
+                double *aptr = atok->values + i*atok->nval1; \
+                double *bptr = btok->values + i*btok->nval1; \
+                for (j=0; j<atok->nval1; j++) \
+                { \
+                    int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
+                    if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+                    for (k=0; k<btok->nvalues; k++) \
+                    { \
+                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(bptr[k]) ? 1 : 0); \
+                        if ( nmiss ) \
+                        { \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                        } \
+                        else if ( aptr[j] > 16777216 || bptr[k] > 16777216 ) /* Ugly, see #871 */ \
+                        { \
+                            if ( aptr[j] CMP_OP bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                        } \
+                        else if ( (float)aptr[j] CMP_OP (float)bptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = atok->nval1; break; } \
+                    } \
+                } \
+            } \
+        } \
+        else /* btok->nsamples */ \
          { \
-            token_t *xtok = atok->nsamples ? atok : btok; \
-            token_t *ytok = atok->nsamples ? btok : atok; \
-            for (i=0; i<xtok->nsamples; i++) \
+            for (i=0; i<btok->nsamples; i++) \
              { \
                  if ( !rtok->usmpl[i] ) continue; \
-                double *xptr = xtok->values + i*xtok->nval1; \
-                double *yptr = ytok->values + i*ytok->nval1; \
-                for (j=0; j<xtok->nval1; j++) \
+                double *aptr = atok->values + i*atok->nval1; \
+                double *bptr = btok->values + i*btok->nval1; \
+                for (j=0; j<btok->nval1; j++) \
                  { \
-                    int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \
+                    int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
                      if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
-                    for (k=0; k<ytok->nvalues; k++) \
+                    for (k=0; k<atok->nvalues; k++) \
                      { \
-                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 1 : 0); \
+                        int nmiss = miss + (bcf_double_is_missing_or_vector_end(aptr[k]) ? 1 : 0); \
                          if ( nmiss ) \
                          { \
-                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                          } \
-                        else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \
+                        else if ( bptr[j] > 16777216 || aptr[k] > 16777216 ) /* Ugly, see #871 */ \
                          { \
-                            if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                            if ( aptr[k] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                          } \
-                        else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                        else if ( (float)aptr[k] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = btok->nval1; break; } \
                      } \
                  } \
              } \
@@ -2346,7 +2741,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          {
              int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0;
              is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0;
-            if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s);
+            if ( is_info && is_fmt )
+                error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s);
          }
          if ( is_fmt==-1 ) is_fmt = 0;
      }
@@ -2835,6 +3231,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      // Additionally, treat "." as missing value rather than a string in numeric equalities; that
      // @file is only used with ID; etc.
      // This code is fragile: improve me.
+    static int comma_separator_warned = 0;
      int i;
      for (i=0; i<nout; i++)
      {
@@ -2885,6 +3282,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              if ( regcomp(out[j].regex, out[j].key, cflags) )
                  error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
          }
+        if ( out[i].is_str && out[i].tok_type==TOK_VAL && out[i].key && strchr(out[i].key,',') )
+        {
+            int print_note = 0;
+            if ( out[i+1].tok_type==TOK_EQ || (out[i+1].is_str && out[i+2].tok_type==TOK_EQ) ) print_note = 1;
+            else if ( out[i+1].tok_type==TOK_NE || (out[i+1].is_str && out[i+2].tok_type==TOK_NE) ) print_note = 1;
+            if ( print_note && !comma_separator_warned )
+            {
+                comma_separator_warned = 1;
+                fprintf(bcftools_stderr,
+                    "Warning: comma is interpreted as a separator and OR logic is used in string comparisons.\n"
+                    "         (Search the manual for \"Comma in strings\" to learn more.)\n");
+            }
+        }
          if ( out[i].tok_type!=TOK_VAL ) continue;
          if ( !out[i].tag ) continue;
          if ( out[i].setter==filters_set_type )
@@ -2941,11 +3351,11 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
              int itok = i, ival;
              if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
-            else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
-            else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+            else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_IN, ival = i - 1;
+            else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NOT_IN, ival = i - 1;
              else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
-            else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
-            else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+            else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_IN, ival = ++i;
+            else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NOT_IN, ival = ++i;
              else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
              if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
                  error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
@@ -2978,6 +3388,12 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
          else if ( out[i].tok_type==TOK_PHRED ) { out[i].func = func_phred; out[i].tok_type = TOK_FUNC; }
          else if ( out[i].tok_type==TOK_BINOM ) { out[i].func = func_binom; out[i].tok_type = TOK_FUNC; }
          else if ( out[i].tok_type==TOK_PERLSUB ) { out[i].func = perl_exec; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMAX ) { out[i].func = func_smpl_max; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMIN ) { out[i].func = func_smpl_min; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sAVG ) { out[i].func = func_smpl_avg; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sMEDIAN ) { out[i].func = func_smpl_median; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sSTDEV ) { out[i].func = func_smpl_stddev; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_sSUM ) { out[i].func = func_smpl_sum; out[i].tok_type = TOK_FUNC; }
          hts_expand0(double,1,out[i].mvalues,out[i].values);
          if ( filter->nsamples )
          {
@@ -3153,3 +3569,32 @@ int filter_max_unpack(filter_t *flt)
  {
      return flt->max_unpack;
  }
+
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
+{
+    token_t *tok = filter->flt_stack[0];
+    if ( tok->nvalues )
+    {
+        *nval  = tok->nvalues;
+        *nval1 = tok->nval1;
+    }
+    else
+    {
+        if ( !tok->values ) error("fixme in filter_get_doubles(): %s\n", filter->str);
+        *nval  = 1;
+        *nval1 = 1;
+        tok->values[0] = filter->flt_stack[0]->pass_site;
+    }
+    return tok->values;
+}
+
+void filter_set_samples(filter_t *filter, const uint8_t *samples)
+{
+    int i,j;
+    for (i=0; i<filter->nfilters; i++)
+    {
+        if ( !filter->filters[i].nsamples ) continue;
+        for (j=0; j<filter->filters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j];
+    }
+}
+
diff --git a/bcftools/filter.h b/bcftools/filter.h

index ccd3fe30fbe19487e612a41b28c3dc739e8c04c1..243e3b69e6aa750ebc0266188d87487ae2fd3fcf 100644 (file)
--- a/bcftools/filter.h
+++ b/bcftools/filter.h
@@ -1,6 +1,6 @@
  /*  filter.h -- filter expressions.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -46,6 +46,18 @@ void filter_destroy(filter_t *filter);
    */
  int filter_test(filter_t *filter, bcf1_t *rec, const uint8_t **samples);
  
+/**
+  *  filter_set_samples() - restrict filtering expression to samples.
+  *             Call after filter_init().
+  *  @samples:  use samples set to 1, ignore samples set 0
+  */
+void filter_set_samples(filter_t *filter, const uint8_t *samples);
+
+/**
+  *  filter_get_doubles() - return a pointer to values from the last filter_test() evaluation
+  */
+const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1);
+
  void filter_expression_info(FILE *fp);
  int filter_max_unpack(filter_t *filter);
  
diff --git a/bcftools/hclust.c b/bcftools/hclust.c

index 692fa54a5aa7f8c7d6453cbf8ec05717bc744a20..945c70e0e01ae5608eb31fb1845e45178344f74e 100644 (file)
--- a/bcftools/hclust.c
+++ b/bcftools/hclust.c
@@ -27,6 +27,7 @@
  #include <htslib/hts.h>
  #include <htslib/kstring.h>
  #include <stdlib.h>
+#include <assert.h>
  #include "bcftools.h"
  #include "hclust.h"
  
diff --git a/bcftools/hclust.c.pysam.c b/bcftools/hclust.c.pysam.c

index 29da67cf3e3be70eb14e7a658427b98b08b22c27..0a90af86a4162f8b5badacdbf89a87b916321cae 100644 (file)
--- a/bcftools/hclust.c.pysam.c
+++ b/bcftools/hclust.c.pysam.c
@@ -29,6 +29,7 @@
  #include <htslib/hts.h>
  #include <htslib/kstring.h>
  #include <stdlib.h>
+#include <assert.h>
  #include "bcftools.h"
  #include "hclust.h"
  
diff --git a/bcftools/htslib-1.10.2/LICENSE b/bcftools/htslib-1.10.2/LICENSE

deleted file mode 100644 (file)

index f70e757..0000000
--- a/bcftools/htslib-1.10.2/LICENSE
+++ /dev/null
@@ -1,69 +0,0 @@
-[Files in this distribution outwith the cram/ subdirectory are distributed
-according to the terms of the following MIT/Expat license.]
-
-The MIT/Expat License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
-
-[Files within the cram/ subdirectory in this distribution are distributed
-according to the terms of the following Modified 3-Clause BSD license.]
-
-The Modified-BSD License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice,
-   this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
-   nor the names of its contributors may be used to endorse or promote products
-   derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-[The use of a range of years within a copyright notice in this distribution
-should be interpreted as being equivalent to a list of years including the
-first and last year specified and all consecutive years between them.
-
-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
-2011-2012" should be interpreted as being identical to a notice that reads
-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
-that reads "Copyright (C) 2005-2012" should be interpreted as being identical
-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
-2011, 2012".]
diff --git a/bcftools/htslib-1.10.2/README b/bcftools/htslib-1.10.2/README

deleted file mode 100644 (file)

index 4225bec..0000000
--- a/bcftools/htslib-1.10.2/README
+++ /dev/null
@@ -1,5 +0,0 @@
-HTSlib is an implementation of a unified C library for accessing common file
-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
-data.  It is the core library used by samtools and bcftools.
-
-See INSTALL for building and installation instructions.
diff --git a/bcftools/main.c b/bcftools/main.c

index 2e3e56d5ef83303422bc93dfd331fa94e2abf627..f89271108a8c322a95d3f28d842a2ae30556f8c1 100644 (file)
--- a/bcftools/main.c
+++ b/bcftools/main.c
@@ -1,6 +1,6 @@
  /*  main.c -- main bcftools command front-end.
  
-    Copyright (C) 2012-2018 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -58,7 +58,7 @@ int main_plugin(int argc, char *argv[]);
  #endif
  int main_consensus(int argc, char *argv[]);
  int main_csq(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
+int main_mpileup(int argc, char *argv[]);
  int main_sort(int argc, char *argv[]);
  
  typedef struct
@@ -164,7 +164,7 @@ static cmd_t cmds[] =
        .alias = "gtcheck",
        .help  = "check sample concordance, detect sample swaps and contamination"
      },
-    { .func  = bam_mpileup,
+    { .func  = main_mpileup,
          .alias = "mpileup",
          .help  = "multi-way pileup producing genotype likelihoods"
      },
@@ -251,7 +251,7 @@ int main(int argc, char *argv[])
      if (argc < 2) { usage(stderr); return 1; }
  
      if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
-        printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version());
+        printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version());
  #if USE_GPL
          printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
  #else
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c

index c7cd4b01fda173c1c3d2ce75ed83b3c81034c4c4..bfd0f04da6c878261bc91a9168f9615175f6f6b3 100644 (file)
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  main.c -- main bcftools command front-end.
  
-    Copyright (C) 2012-2018 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -60,7 +60,7 @@ int main_plugin(int argc, char *argv[]);
  #endif
  int main_consensus(int argc, char *argv[]);
  int main_csq(int argc, char *argv[]);
-int bam_mpileup(int argc, char *argv[]);
+int main_mpileup(int argc, char *argv[]);
  int main_sort(int argc, char *argv[]);
  
  typedef struct
@@ -166,7 +166,7 @@ static cmd_t cmds[] =
        .alias = "gtcheck",
        .help  = "check sample concordance, detect sample swaps and contamination"
      },
-    { .func  = bam_mpileup,
+    { .func  = main_mpileup,
          .alias = "mpileup",
          .help  = "multi-way pileup producing genotype likelihoods"
      },
@@ -253,7 +253,7 @@ int bcftools_main(int argc, char *argv[])
      if (argc < 2) { usage(bcftools_stderr); return 1; }
  
      if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
-        fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version());
+        fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version());
  #if USE_GPL
          fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
  #else
diff --git a/bcftools/mcall.c b/bcftools/mcall.c

index 325093d12296aa2981b8d0db2a86bde7b9d3f73a..e96d41dfe1e314c64c5be09614868ff45e76f54a 100644 (file)
--- a/bcftools/mcall.c
+++ b/bcftools/mcall.c
@@ -1,6 +1,6 @@
  /*  mcall.c -- multiallelic and rare variant calling.
  
-    Copyright (C) 2012-2016 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -22,11 +22,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.  */
  
+#include <assert.h>
  #include <math.h>
  #include <inttypes.h>
+#include <ctype.h>
  #include <htslib/kfunc.h>
  #include <htslib/khash_str2int.h>
  #include "call.h"
+#include "prob1.h"
  
  // Using priors for GTs does not seem to be mathematically justified. Although
  // it seems effective in removing false calls, it also flips a significant
@@ -38,6 +41,7 @@ THE SOFTWARE.  */
  // genotypes is reported instead.
  #define FLAT_PDG_FOR_MISSING 0
  
+int test16(float *anno16, anno16_t *a);
  
  void qcall_init(call_t *call) { return; }
  void qcall_destroy(call_t *call) { return; }
@@ -249,19 +253,46 @@ static void init_sample_groups(call_t *call)
      if ( !call->sample_groups )
      {
          // standard pooled calling, all samples in the same group
-        grp_t *grps = &call->smpl_grp;
-        grps->ngrp = 1;
-        grps->grp  = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int));
+        call->nsmpl_grp = 1;
+        call->smpl_grp  = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
+        call->smpl_grp[0].nsmpl = nsmpl;
+        call->smpl_grp[0].smpl  = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
+        for (i=0; i<nsmpl; i++)
+            call->smpl_grp[0].smpl[i] = i;
+        return;
+    }
+
+    if ( call->sample_groups_tag )
+    {
+        // Is the tag defined in the header?
+        int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
+        if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
+        if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) )  error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
+    }
+    else
+    {
+        int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
+        if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
+        else
+        {
+            tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
+            if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
+            else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
+        }
      }
-    else if ( !strcmp("-",call->sample_groups) )
+
+    // Read samples/groups
+    if ( !strcmp("-",call->sample_groups) )
      {
          // single-sample calling, each sample creates its own group
-        grp_t *grps = &call->smpl_grp;
-        grps->ngrp = nsmpl;
-        grps->grp  = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
-        for (i=0; i<nsmpl; i++) grps->smpl2grp[i] = i;
+        call->nsmpl_grp = nsmpl;
+        call->smpl_grp  = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
+        for (i=0; i<nsmpl; i++)
+        {
+            call->smpl_grp[i].nsmpl = 1;
+            call->smpl_grp[i].smpl  = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
+            call->smpl_grp[i].smpl[0] = i;
+        }
      }
      else
      {
@@ -269,40 +300,49 @@ static void init_sample_groups(call_t *call)
          char **lines = hts_readlist(call->sample_groups, 1, &nlines);
          if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
  
-        uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+        uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+        uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
          void *grp2idx = khash_str2int_init();
  
-        grp_t *grps = &call->smpl_grp;
+        call->nsmpl_grp = 0;
          for (i=0; i<nlines; i++)
          {
              char *ptr = lines[i];
-            while ( *ptr && *ptr!='\t' ) ptr++;
+            while ( *ptr && !isspace(*ptr) ) ptr++;
              if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
-            *ptr = 0;
+            char *tmp = ptr;
+            while ( *ptr && isspace(*ptr) ) ptr++;
+            if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
+            *tmp = 0;
              int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); 
              if ( ismpl<0 ) continue;
-            if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
+            if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
              if ( !khash_str2int_has_key(grp2idx,ptr+1) )
              {
-                khash_str2int_inc(grp2idx, ptr+1);
-                grps->ngrp++;
+                khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
+                call->nsmpl_grp++;
              }
-            int igrp;
-            if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 )
-                smpl2grp1[ismpl] = igrp+1;
-            else
+            int igrp = -1;
+            if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
                  error("This should not happen, fixme: %s\n",ptr+1);
+            grp2n[igrp]++;
+            smpl2grp[ismpl] = igrp+1;   // +1 to distinguish unlisted samples
          }
          khash_str2int_destroy(grp2idx);
+        if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
  
-        grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
+        call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
          for (i=0; i<nsmpl; i++)
          {
-            if ( !smpl2grp1[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
-            grps->smpl2grp[i] = smpl2grp1[i] - 1;
+            if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
+            int igrp = smpl2grp[i] - 1;
+            if ( !call->smpl_grp[igrp].nsmpl ) 
+                call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
+            call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
+            call->smpl_grp[igrp].nsmpl++;
          }
-        free(smpl2grp1);
+        free(smpl2grp);
+        free(grp2n);
          for (i=0; i<nlines; i++) free(lines[i]);
          free(lines);
      }
@@ -310,15 +350,17 @@ static void init_sample_groups(call_t *call)
  static void destroy_sample_groups(call_t *call)
  {
      int i;
-    grp_t *grps = &call->smpl_grp;
-    for (i=0; i<grps->ngrp; i++)
-        free(grps->grp[i].qsum);
-    free(grps->grp);
-    free(grps->smpl2grp);
+    for (i=0; i<call->nsmpl_grp; i++)
+    {
+        free(call->smpl_grp[i].qsum);
+        free(call->smpl_grp[i].smpl);
+    }
+    free(call->smpl_grp);
  }
  
  void mcall_init(call_t *call)
  {
+    init_sample_groups(call);
      call_init_pl2p(call);
  
      call->nals_map = 5;
@@ -341,15 +383,15 @@ void mcall_init(call_t *call)
      if ( call->output_tags & CALL_FMT_GQ )
          bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
      if ( call->output_tags & CALL_FMT_GP )
-        bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+        bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
      if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
          call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
-    bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
-    bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+    if ( call->output_tags & CALL_FMT_PV4 )
+        bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
  
      // init the prior
      if ( call->theta>0 )
@@ -372,8 +414,6 @@ void mcall_init(call_t *call)
          }
          call->theta = log(call->theta);
      }
-
-    init_sample_groups(call);
  }
  
  void mcall_destroy(call_t *call)
@@ -394,7 +434,6 @@ void mcall_destroy(call_t *call)
      free(call->pdg);
      free(call->als);
      free(call->ac);
-    free(call->qsum);
      return;
  }
  
@@ -505,14 +544,14 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
  }
  
  // Create mapping between old and new (trimmed) alleles
-void init_allele_trimming_maps(call_t *call, int als, int nals)
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
  {
-    int i, j;
+    int i, j, nout = 0;
  
      // als_map: old(i) -> new(j)
-    for (i=0, j=0; i<nals; i++)
+    for (i=0; i<nals_ori; i++)
      {
-        if ( als & 1<<i ) call->als_map[i] = j++;
+        if ( als_out & (1<<i) ) call->als_map[i] = nout++;
          else call->als_map[i] = -1;
      }
  
@@ -520,85 +559,16 @@ void init_allele_trimming_maps(call_t *call, int als, int nals)
  
      // pl_map: new(k) -> old(l)
      int k = 0, l = 0;
-    for (i=0; i<nals; i++)
+    for (i=0; i<nals_ori; i++)
      {
          for (j=0; j<=i; j++)
          {
-            if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+            if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
              l++;
          }
      }
  }
  
-double binom_dist(int N, double p, int k)
-{
-    int mean = (int) (N*p);
-    if ( mean==k ) return 1.0;
-
-    double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
-    if ( k > N - k ) k = N - k;
-    if ( mean > N - mean ) mean = N - mean;
-
-    if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
-    double diff = k - mean;
-
-    double val = 1.0;
-    int i;
-    for (i=0; i<diff; i++)
-        val = val * (N-mean-i) / (k-i);
-
-    return exp(log_p)/val;
-}
-
-
-// Inbreeding Coefficient, binomial test
-float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
-{
-    if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
-    double fref = (double)nref/(nref+nalt); // fraction of reference allelels
-    double falt = (double)nalt/(nref+nalt); // non-ref als
-    double q = 2*fref*falt;                 // probability of a het, assuming HWE
-    double mean = q*ndiploid;
-
-    //fprintf(stderr,"\np=%e N=%d k=%d  .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
-
-    // Can we use normal approximation? The second condition is for performance only
-    // and is not well justified.
-    if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
-    {
-        //fprintf(stderr,"out: mean=%e  p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
-        return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
-    }
-
-    return binom_dist(ndiploid, q, nhets);
-}
-
-float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
-{
-    if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
-    double fref = (double)nref/(nref+nalt); // fraction of reference allelels
-    double falt = (double)nalt/(nref+nalt); // non-ref als
-    return fabs((double)nhets/ndiploid - 2*fref*falt);
-}
-
-/**
-  *  log(sum_i exp(a_i))
-  */
-// static inline double logsumexp(double *vals, int nvals)
-// {
-//     int i;
-//     double max_exp = vals[0];
-//     for (i=1; i<nvals; i++)
-//         if ( max_exp < vals[i] ) max_exp = vals[i];
-
-//     double sum = 0;
-//     for (i=0; i<nvals; i++)
-//         sum += exp(vals[i] - max_exp);
-
-//     return log(sum) + max_exp;
-// }
  /** log(exp(a)+exp(b)) */
  static inline double logsumexp2(double a, double b)
  {
@@ -610,7 +580,7 @@ static inline double logsumexp2(double a, double b)
  
  // Macro to set the most likely alleles
  #define UPDATE_MAX_LKs(als,sum) { \
-     if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+     if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
       if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
  }
  
@@ -618,14 +588,13 @@ static inline double logsumexp2(double a, double b)
  
  // Determine the most likely combination of alleles. In this implementation,
  // at most tri-allelic sites are considered. Returns the number of alleles.
-static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
  {
-    int j;
      int ia,ib,ic;   // iterators over up to three alleles
      int max_als=0;  // most likely combination of alleles
-    double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+    double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
      double lk_sum = -HUGE_VAL;    // for normalizing the likelihoods
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
+    int nsmpl = grp->nsmpl;
      int ngts  = nals*(nals+1)/2;
  
      // Single allele
@@ -634,60 +603,45 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
          double lk_tot  = 0;
          int lk_tot_set = 0;
          int iaa = (ia+1)*(ia+2)/2-1;    // index in PL which corresponds to the homozygous "ia/ia" genotype
-        int isample;
-        double *pdg = call->pdg + iaa;
-        for (isample=0; isample<nsmpl; isample++)
+        int ismpl;
+        for (ismpl=0; ismpl<nsmpl; ismpl++)
          {
+            double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
              if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
-            pdg += ngts;
          }
          if ( ia==0 ) ref_lk = lk_tot;   // likelihood of 0/0 for all samples
          else lk_tot += call->theta; // the prior
          UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
      }
  
-    grp_t *grps = &call->smpl_grp;
-
      // Two alleles
      if ( nals>1 )
      {
          for (ia=0; ia<nals; ia++)
          {
-            if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+            if ( grp->qsum[ia]==0 ) continue;
              int iaa = (ia+1)*(ia+2)/2-1;
              for (ib=0; ib<ia; ib++)
              {
-                if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+                if ( grp->qsum[ib]==0 ) continue;
                  double lk_tot  = 0;
                  int lk_tot_set = 0;
-                int ia_cov = 0, ib_cov = 0;
-                for (j=0; j<grps->ngrp; j++)
+                double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
+                double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
+                double fa2 = fa*fa;
+                double fb2 = fb*fb;
+                double fab = 2*fa*fb;
+                int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+                for (is=0; is<nsmpl; is++)
                  {
-                    grp1_t *grp = &grps->grp[j];
-                    if ( grp->qsum[ia] ) ia_cov = 1;
-                    if ( grp->qsum[ib] ) ib_cov = 1;
-                    if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; }
-                    grp->dp  = 1;
-                    grp->fa  = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]);
-                    grp->fb  = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]);
-                    grp->fa2 = grp->fa*grp->fa;
-                    grp->fb2 = grp->fb*grp->fb;
-                    grp->fab = 2*grp->fa*grp->fb;
-                }
-                if ( !ia_cov || !ib_cov ) continue;
-                int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
-                double *pdg  = call->pdg;
-                for (isample=0; isample<nsmpl; isample++)
-                {
-                    grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
-                    if ( !grp->dp ) continue;
+                    int ismpl = grp->smpl[is];
+                    double *pdg = call->pdg + ismpl*ngts;
                      double val = 0;
-                    if ( !call->ploidy || call->ploidy[isample]==2 )
-                        val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab];
-                    else if ( call->ploidy && call->ploidy[isample]==1 )
-                        val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb];
+                    if ( !call->ploidy || call->ploidy[ismpl]==2 )
+                        val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
+                    else if ( call->ploidy && call->ploidy[ismpl]==1 )
+                        val = fa*pdg[iaa] + fb*pdg[ibb];
                      if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
-                    pdg += ngts;
                  }
                  if ( ia!=0 ) lk_tot += call->theta;    // the prior
                  if ( ib!=0 ) lk_tot += call->theta;
@@ -701,50 +655,38 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
      {
          for (ia=0; ia<nals; ia++)
          {
-            if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+            if ( grp->qsum[ia]==0 ) continue;
              int iaa = (ia+1)*(ia+2)/2-1;
              for (ib=0; ib<ia; ib++)
              {
-                if (  grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+                if ( grp->qsum[ib]==0 ) continue;
                  int ibb = (ib+1)*(ib+2)/2-1;
                  int iab = iaa - ia + ib;
                  for (ic=0; ic<ib; ic++)
                  {
-                    if (  grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue;
+                    if ( grp->qsum[ic]==0 ) continue;
                      double lk_tot  = 0;
-                    int lk_tot_set = 1;
-                    int ia_cov = 0, ib_cov = 0, ic_cov = 0;
-                    for (j=0; j<grps->ngrp; j++)
-                    {
-                        grp1_t *grp = &grps->grp[j];
-                        if ( grp->qsum[ia] ) ia_cov = 1;
-                        if ( grp->qsum[ib] ) ib_cov = 1;
-                        if ( grp->qsum[ic] ) ic_cov = 1;
-                        if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; }
-                        grp->dp  = 1;
-                        grp->fa  = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fb  = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fc  = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fa2 = grp->fa*grp->fa;
-                        grp->fb2 = grp->fb*grp->fb;
-                        grp->fc2 = grp->fc*grp->fc;
-                        grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc;
-                    }
-                    if ( !ia_cov || !ib_cov || !ic_cov ) continue;
-                    int isample, icc = (ic+1)*(ic+2)/2-1;
+                    int lk_tot_set = 0;
+
+                    double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fc  = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fa2 = fa*fa;
+                    double fb2 = fb*fb;
+                    double fc2 = fc*fc;
+                    double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
+                    int is, icc = (ic+1)*(ic+2)/2-1;
                      int iac = iaa - ia + ic, ibc = ibb - ib + ic;
-                    double *pdg = call->pdg;
-                    for (isample=0; isample<nsmpl; isample++)
+                    for (is=0; is<nsmpl; is++)
                      {
-                        grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
-                        if ( !grp->dp ) continue;
+                        int ismpl = grp->smpl[is];
+                        double *pdg = call->pdg + ismpl*ngts;
                          double val = 0;
-                        if ( !call->ploidy || call->ploidy[isample]==2 )
-                            val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc];
-                        else if ( call->ploidy && call->ploidy[isample]==1 )
-                            val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc];
+                        if ( !call->ploidy || call->ploidy[ismpl]==2 )
+                            val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+                        else if ( call->ploidy && call->ploidy[ismpl]==1 )
+                            val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
                          if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
-                        pdg += ngts;
                      }
                      if ( ia!=0 ) lk_tot += call->theta;    // the prior
                      if ( ib!=0 ) lk_tot += call->theta;    // the prior
@@ -755,25 +697,26 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
          }
      }
  
-    call->ref_lk = ref_lk;
-    call->lk_sum = lk_sum;
-    *out_als = max_als;
-
      int i, n = 0;
      for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
  
+    grp->max_lk = max_lk;
+    grp->ref_lk = ref_lk;
+    grp->lk_sum = lk_sum;
+    grp->als  = max_als;
+    grp->nals = n;
+
      return n;
  }
  
-static void mcall_set_ref_genotypes(call_t *call, int nals)
+// Sets GT=0/0 or GT=. if PL=0,0,0
+static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
  {
      int i;
-    int ngts  = nals*(nals+1)/2;
+    int ngts  = nals_ori*(nals_ori+1)/2;            // need this to distinguish between GT=0/0 vs GT=.
      int nsmpl = bcf_hdr_nsamples(call->hdr);
  
-    for (i=0; i<nals; i++) call->ac[i] = 0;
-    call->nhets = 0;
-    call->ndiploid = 0;
+    for (i=0; i<nals_ori; i++) call->ac[i] = 0;     // nals_new<=nals_ori, never mind setting extra 0's
  
      // Set all genotypes to 0/0 or 0
      int *gts    = call->gts;
@@ -799,34 +742,27 @@ static void mcall_set_ref_genotypes(call_t *call, int nals)
      }
  }
  
-static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
  {
      int ia, ib, i;
-    int ngts  = nals*(nals+1)/2;
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
-    int nout_gts = nout_als*(nout_als+1)/2;
-    hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
-
-    for (i=0; i<nout_als; i++) call->ac[i] = 0;
-    call->nhets = 0;
-    call->ndiploid = 0;
+    int ngts_ori = nals_ori*(nals_ori+1)/2; 
+    int ngts_new = call->nals_new*(call->nals_new+1)/2;
+    int nsmpl = grp->nsmpl;
  
      #if USE_PRIOR_FOR_GTS
          float prior = exp(call->theta);
      #endif
-    float *gps  = call->GPs - nout_gts;
-    double *pdg = call->pdg - ngts;
-    int *gts  = call->gts - 2;
  
-    int isample;
-    for (isample = 0; isample < nsmpl; isample++)
+    int is;
+    for (is = 0; is < nsmpl; is++)
      {
-        int ploidy = call->ploidy ? call->ploidy[isample] : 2;
-        assert( ploidy>=0 && ploidy<=2 );
+        int ismpl   = grp->smpl[is];
+        double *pdg = call->pdg + ismpl*ngts_ori;
+        float *gps  = call->GPs + ismpl*ngts_new;
+        int *gts    = call->gts + ismpl*2;
  
-        pdg += ngts;
-        gts += 2;
-        gps += nout_gts;
+        int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
+        assert( ploidy>=0 && ploidy<=2 );
  
          if ( !ploidy )
          {
@@ -838,8 +774,8 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
  
          #if !FLAT_PDG_FOR_MISSING
              // Skip samples with zero depth, they have all pdg's equal to 0
-            for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
-            if ( i==ngts )
+            for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
+            if ( i==ngts_ori )
              {
                  gts[0] = bcf_gt_missing;
                  gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
@@ -848,19 +784,16 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
              }
          #endif
  
-        if ( ploidy==2 ) call->ndiploid++;
-
          // Default fallback for the case all LKs are the same
          gts[0] = bcf_gt_unphased(0);
          gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
  
          // Non-zero depth, determine the most likely genotype
-        grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
          double best_lk = 0;
-        for (ia=0; ia<nals; ia++)
+        for (ia=0; ia<nals_ori; ia++)
          {
-            if ( !(out_als & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
-            int iaa = (ia+1)*(ia+2)/2-1;            // PL index of the ia/ia genotype
+            if ( !(grp->als & 1<<ia) ) continue;    // ia-th allele not in the final selection, skip
+            int iaa = (ia+1)*(ia+2)/2-1;                // PL index of the ia/ia genotype
              double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
              #if USE_PRIOR_FOR_GTS
                  if ( ia!=0 ) lk *= prior;
@@ -876,13 +809,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
          if ( ploidy==2 )
          {
              gts[1] = gts[0];
-            for (ia=0; ia<nals; ia++)
+            for (ia=0; ia<nals_ori; ia++)
              {
-                if ( !(out_als & 1<<ia) ) continue;
+                if ( !(grp->als & 1<<ia) ) continue;
                  int iaa = (ia+1)*(ia+2)/2-1;
                  for (ib=0; ib<ia; ib++)
                  {
-                    if ( !(out_als & 1<<ib) ) continue;
+                    if ( !(grp->als & 1<<ib) ) continue;
                      int iab = iaa - ia + ib;
                      double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
                      #if USE_PRIOR_FOR_GTS
@@ -899,7 +832,6 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
                      }
                  }
              }
-            if ( gts[0] != gts[1] ) call->nhets++;
          }
          else
              gts[1] = bcf_int32_vector_end;
@@ -907,55 +839,50 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
          call->ac[ bcf_gt_allele(gts[0]) ]++;
          if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
      }
-    if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+    if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
+    double max, sum;
+    for (is=0; is<nsmpl; is++)
      {
-        double max, sum;
-        for (isample=0; isample<nsmpl; isample++)
-        {
-            gps = call->GPs + isample*nout_gts;
+        int ismpl  = grp->smpl[is];
+        float *gps = call->GPs + ismpl*ngts_new;
  
-            int nmax;
-            if ( call->ploidy )
-            {
-                if ( call->ploidy[isample]==2 ) nmax = nout_gts;
-                else if ( call->ploidy[isample]==1 ) nmax = nout_als;
-                else nmax = 0;
-            }
-            else nmax = nout_gts;
+        int nmax;
+        if ( call->ploidy )
+        {
+            if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
+            else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
+            else nmax = 0;
+        }
+        else nmax = ngts_new;
  
-            max = gps[0];
-            if ( max<0 || nmax==0 )
-            {
-                // no call
-                if ( call->output_tags & CALL_FMT_GP )
-                {
-                    for (i=0; i<nmax; i++) gps[i] = 0;
-                    if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
-                    if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
-                }
-                call->GQs[isample] = 0;
-                continue;
-            }
-            sum = gps[0];
-            for (i=1; i<nmax; i++)
-            {
-                if ( max < gps[i] ) max = gps[i];
-                sum += gps[i];
-            }
-            max = -4.34294*log(1 - max/sum);
-            call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+        max = gps[0];
+        if ( max<0 || nmax==0 )
+        {
+            // no call
              if ( call->output_tags & CALL_FMT_GP )
              {
-                assert( max );
-                for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
-                if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+                for (i=0; i<nmax; i++) gps[i] = 0;
+                if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+                if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
              }
+            call->GQs[ismpl] = 0;
+            continue;
+        }
+        sum = gps[0];
+        for (i=1; i<nmax; i++)
+        {
+            if ( max < gps[i] ) max = gps[i];
+            sum += gps[i];
+        }
+        max = -4.34294*log(1 - max/sum);
+        call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
+        if ( call->output_tags & CALL_FMT_GP )
+        {
+            assert( max );
+            for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
+            for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
          }
      }
-    if ( call->output_tags & CALL_FMT_GP )
-        bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
-    if ( call->output_tags & CALL_FMT_GQ )
-        bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
  }
  
  
@@ -978,12 +905,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
      Individual qualities are calculated as
          GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
   */
-static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+#if 0
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
  {
      int ia, ib, i;
      int nsmpl    = bcf_hdr_nsamples(call->hdr);
      int ngts     = nals*(nals+1)/2;
-    int nout_gts = nout_als*(nout_als+1)/2;
+    int nout_gts = nals_new*(nals_new+1)/2;
      double *gls  = call->GLs - nout_gts;
      double *pdg  = call->pdg - ngts;
  
@@ -1013,7 +941,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          double best_lk = 0;
          for (ia=0; ia<nals; ia++)
          {
-            if ( !(out_als & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
+            if ( !(als_new & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
              int iaa   = bcf_alleles2gt(ia,ia);      // PL index of the ia/ia genotype
              int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
              double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
@@ -1029,10 +957,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          {
              for (ia=0; ia<nals; ia++)
              {
-                if ( !(out_als & 1<<ia) ) continue;
+                if ( !(als_new & 1<<ia) ) continue;
                  for (ib=0; ib<ia; ib++)
                  {
-                    if ( !(out_als & 1<<ib) ) continue;
+                    if ( !(als_new & 1<<ib) ) continue;
                      int iab   = bcf_alleles2gt(ia,ib);
                      int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
                      double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
@@ -1076,8 +1004,8 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
      for (ifm=0; ifm<call->nfams; ifm++)
      {
          family_t *fam = &call->fams[ifm];
-        int ntrio = call->ntrio[fam->type][nout_als];
-        uint16_t *trio = call->trio[fam->type][nout_als];
+        int ntrio = call->ntrio[fam->type][nals_new];
+        uint16_t *trio = call->trio[fam->type][nals_new];
  
          // Unconstrained likelihood
          int uc_itr = 0;
@@ -1225,11 +1153,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
      }
  }
+#endif
  
-static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
  {
-    int ngts  = nals*(nals+1)/2;
-    int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2;     // number of PL values in diploid samples, ori and new
+    int npls_src = nals_ori*(nals_ori+1)/2;
+    int npls_dst = nals_new*(nals_new+1)/2;     // number of PL values in diploid samples, ori and new
      if ( call->all_diploid && npls_src == npls_dst ) return;
  
      int *pls_src = call->PLs, *pls_dst = call->PLs;
@@ -1246,7 +1175,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
          }
          else if ( ploidy==1 )
          {
-            for (ia=0; ia<nout_als; ia++)
+            for (ia=0; ia<nals_new; ia++)
              {
                  int isrc = (ia+1)*(ia+2)/2-1;
                  pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
@@ -1256,7 +1185,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
          else
          {
              pls_dst[0] = bcf_int32_missing;
-            pls_dst[1] = bcf_int32_vector_end;  // relying on nout_als>1 in mcall()
+            pls_dst[1] = bcf_int32_vector_end;  // relying on nals_new>1 in mcall()
          }
          pls_src += npls_src;
          pls_dst += npls_dst;
@@ -1264,9 +1193,9 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
      bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
  }
  
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
  {
-    if ( nals==nout_als ) return;
+    if ( nals_ori==nals_new ) return;
  
      int i,j, nret, size = sizeof(float);
  
@@ -1285,17 +1214,17 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o
          nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
          if ( nret<=0 ) continue;
  
-        if ( nout_als==1 )
+        if ( nals_new==1 )
              bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1);     // has to be the REF, the order could not change
          else
          {
-            for (j=0; j<nals; j++)
+            for (j=0; j<nals_ori; j++)
              {
                  int k = call->als_map[j];
                  if ( k==-1 ) continue;   // to be dropped
                  memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
              }
-            bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
+            bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
          }
      }
  
@@ -1312,21 +1241,21 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o
          if (nret<=0) continue;
          int nsmpl = bcf_hdr_nsamples(call->hdr);
  
-        assert( nret==nals*nsmpl );
+        assert( nret==nals_ori*nsmpl );
  
          for (j=0; j<nsmpl; j++)
          {
-            char *ptr_src = (char *)tmp_ori + j*nals*size;
-            char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+            char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
+            char *ptr_dst = (char *)tmp_new + j*nals_new*size;
              int k;
-            for (k=0; k<nals; k++)
+            for (k=0; k<nals_ori; k++)
              {
                  int l = call->als_map[k];
                  if ( l==-1 ) continue;   // to be dropped
                  memcpy(ptr_dst+size*l, ptr_src+size*k, size);
              }
          }
-        bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
+        bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
      }
  
      call->PLs    = (int32_t*) tmp_new;
@@ -1441,12 +1370,12 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
      }
      bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
  
-    // update QS
-    int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
-    hts_expand(float,nals,call->nqsum,call->qsum);
+    // update QS, use temporarily call->GPs to store the values
+    int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
+    hts_expand(float,nals,call->nGPs,call->GPs);
      for (i=0; i<nals; i++)
-        call->qsum[i] = call->als_map[i]<nqs ? call->smpl_grp.grp[0].qsum[call->als_map[i]] : 0;
-    bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals);
+        call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
+    bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
  
      // update any Number=R tags
      void *tmp_ori = call->itmp, *tmp_new = call->PLs;  // reusing PLs storage which is not used at this point
@@ -1487,7 +1416,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
      call->itmp   = (int32_t*) tmp_ori;
      call->n_itmp = ntmp_ori;
  
-
      if ( *unseen ) *unseen = nals-1;
      return 0;
  }
@@ -1506,203 +1434,229 @@ int mcall(call_t *call, bcf1_t *rec)
      // Force alleles when calling genotypes given alleles was requested
      if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
  
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
-    int nals  = rec->n_allele;
-    hts_expand(int,nals,call->nac,call->ac);
-    hts_expand(int,nals,call->nals_map,call->als_map);
-    hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+    int nsmpl    = bcf_hdr_nsamples(call->hdr);
+    int nals_ori = rec->n_allele;
+    hts_expand(int,nals_ori,call->nac,call->ac);
+    hts_expand(int,nals_ori,call->nals_map,call->als_map);
+    hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
  
      // Get the genotype likelihoods
      call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
-    if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals )  // a mixture of diploid and haploid or haploid only
-        error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+    if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori )  // a mixture of diploid and haploid or haploid only
+        error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
  
      // Convert PLs to probabilities
-    int ngts = nals*(nals+1)/2;
+    int ngts_ori = nals_ori*(nals_ori+1)/2;
      hts_expand(double, call->nPLs, call->npdg, call->pdg);
-    set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+    set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
  
      // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
-    if ( call->smpl_grp.ngrp == 1  )
+    if ( call->nsmpl_grp == 1  )
      {
-        int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
+        int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
          if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
-        if ( nqs < nals )
+        if ( nqs < nals_ori )
          {
              // Some of the listed alleles do not have the corresponding QS field. This is
              // typically ref-only site with <*> in ALT.
-            hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum);
-            for (i=nqs; i<nals; i++) call->smpl_grp.grp[0].qsum[i] = 0;
+            hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
+            for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
          }
      }
      else
      {
-        for (j=0; j<call->smpl_grp.ngrp; j++)
+        for (j=0; j<call->nsmpl_grp; j++)
          {
-            hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum);
-            memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals);
+            hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
+            memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
          }
  
-        int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs);
-        if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n");
+        // Use FORMAT/AD or FORMAT/QS
+        int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
+        if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
          nad /= bcf_hdr_nsamples(call->hdr);
-        hts_expand(float,nals,call->nqsum,call->qsum);
-        float qsum = 0;
-        for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+        for (i=0; i<call->nsmpl_grp; i++)
          {
-            int32_t *ptr = call->ADs + i*nad;
-            for (j=0; j<nad; j++)
+            int is;
+            smpl_grp_t *grp = &call->smpl_grp[i];
+            hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
+            for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
+            for (is=0; is<grp->nsmpl; is++)
              {
-                if ( ptr[j]==bcf_int32_vector_end ) break;
-                if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0;
-                else { call->qsum[j] = ptr[j]; qsum += ptr[j]; }
+                int ismpl = grp->smpl[is];
+                int32_t *ptr = call->ADs + ismpl*nad;
+                float sum = 0;
+                for (j=0; j<nad; j++)
+                {
+                    if ( ptr[j]==bcf_int32_vector_end ) break;
+                    if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
+                }
+                if ( sum )
+                {
+                    for (j=0; j<nad; j++)
+                    {
+                        if ( ptr[j]==bcf_int32_vector_end ) break;
+                        if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
+                    }
+                }
              }
-            for (; j<nals; j++) call->qsum[j] = 0;
-            if ( qsum ) 
-                for (j=0; j<nals; j++) call->qsum[j] /= qsum;
-
-            grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]];
-            for (j=0; j<nals; j++)
-                grp->qsum[j] += call->qsum[j];
          }
      }
  
      // If available, take into account reference panel AFs
      if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
      {
-        int an = call->ac[0];
-        if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
+        int an = call->ac[0];   // number of alleles total, proceed only if not zero; reuse call->ac
+        if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 )    // number of ALT alleles
          {
-            int ac0 = an;   // number of alleles in the reference population
-            for (i=0; i<nals-1; i++)
+            int ac0 = an;       // this will become the number of REFs
+            for (i=0; i<nals_ori-1; i++)
              {
                  if ( call->ac[i]==bcf_int32_vector_end ) break;
                  if ( call->ac[i]==bcf_int32_missing ) continue;
                  ac0 -= call->ac[i];
-                for (j=0; j<call->smpl_grp.ngrp; j++)
-                    call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5;
+
+                // here an*0.5 is the number of samples in the population and ac*0.5 is the AF weighted by the number of samples
+                for (j=0; j<call->nsmpl_grp; j++)
+                    call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
              }
              if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
-            for (j=0; j<call->smpl_grp.ngrp; j++)
-                call->smpl_grp.grp[j].qsum[0] += ac0*0.5;
-            for (i=0; i<nals; i++)
-            {
-                for (j=0; j<call->smpl_grp.ngrp; j++)
-                    call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an;
-            }
+            for (j=0; j<call->nsmpl_grp; j++)
+                call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
          }
      }
  
-    for (j=0; j<call->smpl_grp.ngrp; j++)
+    // normalize so that QS sums to 1 for each group
+    for (j=0; j<call->nsmpl_grp; j++)
      {
-        float qsum_tot = 0;
-        for (i=0; i<nals; i++) qsum_tot += call->smpl_grp.grp[j].qsum[i];
-        if ( qsum_tot ) for (i=0; i<nals; i++) call->smpl_grp.grp[j].qsum[i] /= qsum_tot;
+        float sum = 0;
+        for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
+        if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
      }
  
      bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0);      // remove QS tag
  
-    // Find the best combination of alleles
-    int out_als, nout;
-    if ( nals > 8*sizeof(out_als) )
+    if ( nals_ori > 8*sizeof(call->als_new) )
      { 
          fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
          return 0; 
      }
-    nout = mcall_find_best_alleles(call, nals, &out_als);
  
-    // Make sure the REF allele is always present
-    if ( !(out_als&1) )
+    // For each group find the best combination of alleles
+    call->als_new = 0;
+    double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
+    for (j=0; j<call->nsmpl_grp; j++)
      {
-        out_als |= 1;
-        nout++;
+        smpl_grp_t *grp = &call->smpl_grp[j];
+        mcall_find_best_alleles(call, nals_ori, grp);
+        call->als_new |= grp->als;
+        if ( grp->max_lk==-HUGE_VAL ) continue;
+        double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
+        if ( max_qual < qual )
+        {
+            max_qual = qual;
+            lk_sum = grp->lk_sum;
+            ref_lk = grp->ref_lk;
+        }
      }
-    int is_variant = out_als==1 ? 0 : 1;
+
+    // Make sure the REF allele is always present
+    if ( !(call->als_new&1) ) call->als_new |= 1;
+
+    int is_variant = call->als_new==1 ? 0 : 1;
      if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
  
-    // With -A, keep all ALTs except X
-    if ( call->flag & CALL_KEEPALT )
+    call->nals_new = 0;
+    for (i=0; i<nals_ori; i++)
      {
-        nout = 0;
-        for (i=0; i<nals; i++)
-        {
-            if ( i>0 && i==unseen ) continue;
-            out_als |= 1<<i;
-            nout++;
-        }
+        if ( i>0 && i==unseen ) continue;
+        if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
+        if ( call->als_new & (1<<i) ) call->nals_new++;
      }
  
+    init_allele_trimming_maps(call,nals_ori,call->als_new);
+
      int nAC = 0;
-    if ( out_als==1 )   // only REF allele on output
+    if ( call->als_new==1 )   // only REF allele on output
      {
-        init_allele_trimming_maps(call, 1, nals);
-        mcall_set_ref_genotypes(call,nals);
+        mcall_set_ref_genotypes(call,nals_ori);
          bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0);    // remove PL, useless now
      }
+    else if ( !is_variant )
+    {
+        mcall_set_ref_genotypes(call,nals_ori);     // running with -A, prevent mcall_call_genotypes from putting some ALT back
+        mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
+    }
      else
      {
          // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
          // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
-        init_allele_trimming_maps(call, out_als, nals);
-        if ( !is_variant )
-            mcall_set_ref_genotypes(call,nals);     // running with -A, prevent mcall_call_genotypes from putting some ALT back
-        else if ( call->flag & CALL_CONSTR_TRIO )
+        int ngts_new = call->nals_new*(call->nals_new+1)/2;
+        hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
+        for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
+
+        if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
+        { 
+            fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
+            return 0; 
+        }
+        if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
          {
-            if ( nout>4 ) 
-            { 
-                fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
-                return 0; 
-            }
-            mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+            memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
+            memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
+        }
+        for (i=0; i<call->nsmpl_grp; i++)
+        {
+            if ( call->flag & CALL_CONSTR_TRIO )
+                error("todo: constrained trio calling temporarily disabled\n");   //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
+            else
+                mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
          }
-        else
-            mcall_call_genotypes(call,rec,nals,nout,out_als);
  
          // Skip the site if all samples are 0/0. This can happen occasionally.
-        nAC = 0;
-        for (i=1; i<nout; i++) nAC += call->ac[i];
+        for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
          if ( !nAC && call->flag & CALL_VARONLY ) return 0;
-        mcall_trim_PLs(call, rec, nals, nout, out_als);
+
+        if ( call->output_tags & CALL_FMT_GP )
+            bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
+        if ( call->output_tags & CALL_FMT_GQ )
+            bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+
+        mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
      }
-    if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+    if ( nals_ori!=call->nals_new )
+        mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
  
-    // Set QUAL and calculate HWE-related annotations
+    // Set QUAL
      if ( nAC )
      {
-        float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
-        if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
-
-        float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
-        if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
-
          // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
-        rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
+        rec->qual = max_qual;
      }
      else
      {
          // Set the quality of a REF site
-        if ( call->lk_sum==-HUGE_VAL )  // no support from (high quality) reads, so QUAL=1-prior
+        if ( lk_sum!=-HUGE_VAL )  // no support from (high quality) reads, so QUAL=1-prior
+            rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
+        else if ( call->ac[0] )
              rec->qual = call->theta ? -4.343*call->theta : 0;
          else
-            rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
+            bcf_float_set_missing(rec->qual);
      }
  
-    if ( rec->qual>999 ) rec->qual = 999;
-    if ( rec->qual>50 ) rec->qual = rint(rec->qual);
-
      // AC, AN
-    if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+    if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
      nAC += call->ac[0];
      bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
  
      // Remove unused alleles
-    hts_expand(char*,nout,call->nals,call->als);
-    for (i=0; i<nals; i++)
+    hts_expand(char*,call->nals_new,call->nals,call->als);
+    for (i=0; i<nals_ori; i++)
          if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
-    bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+    bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
      bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
  
-    // DP4 tag
+    // DP4 and PV4 tags
      if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
      {
          int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
@@ -1710,10 +1664,22 @@ int mcall(call_t *call, bcf1_t *rec)
  
          int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
          bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+
+        if ( call->output_tags & CALL_FMT_PV4 )
+        {
+            anno16_t a;
+            float tmpf[4];
+            int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
+            if ( is_tested ) 
+            {
+                for (i=0; i<4; i++) tmpf[i] = a.p[i];
+                bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+            }
+        }
      }
  
      bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
  
-    return nout;
+    return call->nals_new;
  }
  
diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c

index 2c2fb37d3fcf7fe1e0b4ef42000a1d41a540761f..c2d38a6c686162ce070d997890b4297017772e5d 100644 (file)
--- a/bcftools/mcall.c.pysam.c
+++ b/bcftools/mcall.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  mcall.c -- multiallelic and rare variant calling.
  
-    Copyright (C) 2012-2016 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -24,11 +24,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.  */
  
+#include <assert.h>
  #include <math.h>
  #include <inttypes.h>
+#include <ctype.h>
  #include <htslib/kfunc.h>
  #include <htslib/khash_str2int.h>
  #include "call.h"
+#include "prob1.h"
  
  // Using priors for GTs does not seem to be mathematically justified. Although
  // it seems effective in removing false calls, it also flips a significant
@@ -40,6 +43,7 @@ THE SOFTWARE.  */
  // genotypes is reported instead.
  #define FLAT_PDG_FOR_MISSING 0
  
+int test16(float *anno16, anno16_t *a);
  
  void qcall_init(call_t *call) { return; }
  void qcall_destroy(call_t *call) { return; }
@@ -251,19 +255,46 @@ static void init_sample_groups(call_t *call)
      if ( !call->sample_groups )
      {
          // standard pooled calling, all samples in the same group
-        grp_t *grps = &call->smpl_grp;
-        grps->ngrp = 1;
-        grps->grp  = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int));
+        call->nsmpl_grp = 1;
+        call->smpl_grp  = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp));
+        call->smpl_grp[0].nsmpl = nsmpl;
+        call->smpl_grp[0].smpl  = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t));
+        for (i=0; i<nsmpl; i++)
+            call->smpl_grp[0].smpl[i] = i;
+        return;
+    }
+
+    if ( call->sample_groups_tag )
+    {
+        // Is the tag defined in the header?
+        int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
+        if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
+        if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) )  error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag);
+    }
+    else
+    {
+        int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS");
+        if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS";
+        else
+        {
+            tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD");
+            if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
+            else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
+        }
      }
-    else if ( !strcmp("-",call->sample_groups) )
+
+    // Read samples/groups
+    if ( !strcmp("-",call->sample_groups) )
      {
          // single-sample calling, each sample creates its own group
-        grp_t *grps = &call->smpl_grp;
-        grps->ngrp = nsmpl;
-        grps->grp  = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
-        for (i=0; i<nsmpl; i++) grps->smpl2grp[i] = i;
+        call->nsmpl_grp = nsmpl;
+        call->smpl_grp  = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp));
+        for (i=0; i<nsmpl; i++)
+        {
+            call->smpl_grp[i].nsmpl = 1;
+            call->smpl_grp[i].smpl  = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t));
+            call->smpl_grp[i].smpl[0] = i;
+        }
      }
      else
      {
@@ -271,40 +302,49 @@ static void init_sample_groups(call_t *call)
          char **lines = hts_readlist(call->sample_groups, 1, &nlines);
          if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);
  
-        uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+        uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
+        uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
          void *grp2idx = khash_str2int_init();
  
-        grp_t *grps = &call->smpl_grp;
+        call->nsmpl_grp = 0;
          for (i=0; i<nlines; i++)
          {
              char *ptr = lines[i];
-            while ( *ptr && *ptr!='\t' ) ptr++;
+            while ( *ptr && !isspace(*ptr) ) ptr++;
              if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
-            *ptr = 0;
+            char *tmp = ptr;
+            while ( *ptr && isspace(*ptr) ) ptr++;
+            if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
+            *tmp = 0;
              int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); 
              if ( ismpl<0 ) continue;
-            if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
+            if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
              if ( !khash_str2int_has_key(grp2idx,ptr+1) )
              {
-                khash_str2int_inc(grp2idx, ptr+1);
-                grps->ngrp++;
+                khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
+                call->nsmpl_grp++;
              }
-            int igrp;
-            if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 )
-                smpl2grp1[ismpl] = igrp+1;
-            else
+            int igrp = -1;
+            if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 )
                  error("This should not happen, fixme: %s\n",ptr+1);
+            grp2n[igrp]++;
+            smpl2grp[ismpl] = igrp+1;   // +1 to distinguish unlisted samples
          }
          khash_str2int_destroy(grp2idx);
+        if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);
  
-        grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t));
-        grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int));
+        call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
          for (i=0; i<nsmpl; i++)
          {
-            if ( !smpl2grp1[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
-            grps->smpl2grp[i] = smpl2grp1[i] - 1;
+            if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
+            int igrp = smpl2grp[i] - 1;
+            if ( !call->smpl_grp[igrp].nsmpl ) 
+                call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
+            call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
+            call->smpl_grp[igrp].nsmpl++;
          }
-        free(smpl2grp1);
+        free(smpl2grp);
+        free(grp2n);
          for (i=0; i<nlines; i++) free(lines[i]);
          free(lines);
      }
@@ -312,15 +352,17 @@ static void init_sample_groups(call_t *call)
  static void destroy_sample_groups(call_t *call)
  {
      int i;
-    grp_t *grps = &call->smpl_grp;
-    for (i=0; i<grps->ngrp; i++)
-        free(grps->grp[i].qsum);
-    free(grps->grp);
-    free(grps->smpl2grp);
+    for (i=0; i<call->nsmpl_grp; i++)
+    {
+        free(call->smpl_grp[i].qsum);
+        free(call->smpl_grp[i].smpl);
+    }
+    free(call->smpl_grp);
  }
  
  void mcall_init(call_t *call)
  {
+    init_sample_groups(call);
      call_init_pl2p(call);
  
      call->nals_map = 5;
@@ -343,15 +385,15 @@ void mcall_init(call_t *call)
      if ( call->output_tags & CALL_FMT_GQ )
          bcf_hdr_append(call->hdr,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Phred-scaled Genotype Quality\">");
      if ( call->output_tags & CALL_FMT_GP )
-        bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled genotype posterior probabilities\">");
+        bcf_hdr_append(call->hdr,"##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Genotype posterior probabilities in the range 0 to 1\">");
      if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
          call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr));
-    bcf_hdr_append(call->hdr,"##INFO=<ID=ICB,Number=1,Type=Float,Description=\"Inbreeding Coefficient Binomial test (bigger is better)\">");
-    bcf_hdr_append(call->hdr,"##INFO=<ID=HOB,Number=1,Type=Float,Description=\"Bias in the number of HOMs number (smaller is better)\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">");
      bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Average mapping quality\">");
+    if ( call->output_tags & CALL_FMT_PV4 )
+        bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n");
  
      // init the prior
      if ( call->theta>0 )
@@ -374,8 +416,6 @@ void mcall_init(call_t *call)
          }
          call->theta = log(call->theta);
      }
-
-    init_sample_groups(call);
  }
  
  void mcall_destroy(call_t *call)
@@ -396,7 +436,6 @@ void mcall_destroy(call_t *call)
      free(call->pdg);
      free(call->als);
      free(call->ac);
-    free(call->qsum);
      return;
  }
  
@@ -507,14 +546,14 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse
  }
  
  // Create mapping between old and new (trimmed) alleles
-void init_allele_trimming_maps(call_t *call, int als, int nals)
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out)
  {
-    int i, j;
+    int i, j, nout = 0;
  
      // als_map: old(i) -> new(j)
-    for (i=0, j=0; i<nals; i++)
+    for (i=0; i<nals_ori; i++)
      {
-        if ( als & 1<<i ) call->als_map[i] = j++;
+        if ( als_out & (1<<i) ) call->als_map[i] = nout++;
          else call->als_map[i] = -1;
      }
  
@@ -522,85 +561,16 @@ void init_allele_trimming_maps(call_t *call, int als, int nals)
  
      // pl_map: new(k) -> old(l)
      int k = 0, l = 0;
-    for (i=0; i<nals; i++)
+    for (i=0; i<nals_ori; i++)
      {
          for (j=0; j<=i; j++)
          {
-            if ( (als & 1<<i) && (als & 1<<j) ) call->pl_map[k++] = l;
+            if ( (als_out & (1<<i)) && (als_out & (1<<j)) ) call->pl_map[k++] = l;
              l++;
          }
      }
  }
  
-double binom_dist(int N, double p, int k)
-{
-    int mean = (int) (N*p);
-    if ( mean==k ) return 1.0;
-
-    double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p);
-    if ( k > N - k ) k = N - k;
-    if ( mean > N - mean ) mean = N - mean;
-
-    if ( k < mean ) { int tmp = k; k = mean; mean = tmp; }
-    double diff = k - mean;
-
-    double val = 1.0;
-    int i;
-    for (i=0; i<diff; i++)
-        val = val * (N-mean-i) / (k-i);
-
-    return exp(log_p)/val;
-}
-
-
-// Inbreeding Coefficient, binomial test
-float calc_ICB(int nref, int nalt, int nhets, int ndiploid)
-{
-    if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
-    double fref = (double)nref/(nref+nalt); // fraction of reference allelels
-    double falt = (double)nalt/(nref+nalt); // non-ref als
-    double q = 2*fref*falt;                 // probability of a het, assuming HWE
-    double mean = q*ndiploid;
-
-    //fprintf(bcftools_stderr,"\np=%e N=%d k=%d  .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid);
-
-    // Can we use normal approximation? The second condition is for performance only
-    // and is not well justified.
-    if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 )
-    {
-        //fprintf(bcftools_stderr,"out: mean=%e  p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))));
-        return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)));
-    }
-
-    return binom_dist(ndiploid, q, nhets);
-}
-
-float calc_HOB(int nref, int nalt, int nhets, int ndiploid)
-{
-    if ( !nref || !nalt || !ndiploid ) return HUGE_VAL;
-
-    double fref = (double)nref/(nref+nalt); // fraction of reference allelels
-    double falt = (double)nalt/(nref+nalt); // non-ref als
-    return fabs((double)nhets/ndiploid - 2*fref*falt);
-}
-
-/**
-  *  log(sum_i exp(a_i))
-  */
-// static inline double logsumexp(double *vals, int nvals)
-// {
-//     int i;
-//     double max_exp = vals[0];
-//     for (i=1; i<nvals; i++)
-//         if ( max_exp < vals[i] ) max_exp = vals[i];
-
-//     double sum = 0;
-//     for (i=0; i<nvals; i++)
-//         sum += exp(vals[i] - max_exp);
-
-//     return log(sum) + max_exp;
-// }
  /** log(exp(a)+exp(b)) */
  static inline double logsumexp2(double a, double b)
  {
@@ -612,7 +582,7 @@ static inline double logsumexp2(double a, double b)
  
  // Macro to set the most likely alleles
  #define UPDATE_MAX_LKs(als,sum) { \
-     if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
+     if ( max_lk<lk_tot && lk_tot_set ) { max_lk = lk_tot; max_als = (als); } \
       if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
  }
  
@@ -620,14 +590,13 @@ static inline double logsumexp2(double a, double b)
  
  // Determine the most likely combination of alleles. In this implementation,
  // at most tri-allelic sites are considered. Returns the number of alleles.
-static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
+static int mcall_find_best_alleles(call_t *call, int nals, smpl_grp_t *grp)
  {
-    int j;
      int ia,ib,ic;   // iterators over up to three alleles
      int max_als=0;  // most likely combination of alleles
-    double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
+    double ref_lk = -HUGE_VAL, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles
      double lk_sum = -HUGE_VAL;    // for normalizing the likelihoods
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
+    int nsmpl = grp->nsmpl;
      int ngts  = nals*(nals+1)/2;
  
      // Single allele
@@ -636,60 +605,45 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
          double lk_tot  = 0;
          int lk_tot_set = 0;
          int iaa = (ia+1)*(ia+2)/2-1;    // index in PL which corresponds to the homozygous "ia/ia" genotype
-        int isample;
-        double *pdg = call->pdg + iaa;
-        for (isample=0; isample<nsmpl; isample++)
+        int ismpl;
+        for (ismpl=0; ismpl<nsmpl; ismpl++)
          {
+            double *pdg = call->pdg + grp->smpl[ismpl]*ngts + iaa;
              if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; }
-            pdg += ngts;
          }
          if ( ia==0 ) ref_lk = lk_tot;   // likelihood of 0/0 for all samples
          else lk_tot += call->theta; // the prior
          UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
      }
  
-    grp_t *grps = &call->smpl_grp;
-
      // Two alleles
      if ( nals>1 )
      {
          for (ia=0; ia<nals; ia++)
          {
-            if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+            if ( grp->qsum[ia]==0 ) continue;
              int iaa = (ia+1)*(ia+2)/2-1;
              for (ib=0; ib<ia; ib++)
              {
-                if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+                if ( grp->qsum[ib]==0 ) continue;
                  double lk_tot  = 0;
                  int lk_tot_set = 0;
-                int ia_cov = 0, ib_cov = 0;
-                for (j=0; j<grps->ngrp; j++)
+                double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]);
+                double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]);
+                double fa2 = fa*fa;
+                double fb2 = fb*fb;
+                double fab = 2*fa*fb;
+                int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+                for (is=0; is<nsmpl; is++)
                  {
-                    grp1_t *grp = &grps->grp[j];
-                    if ( grp->qsum[ia] ) ia_cov = 1;
-                    if ( grp->qsum[ib] ) ib_cov = 1;
-                    if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; }
-                    grp->dp  = 1;
-                    grp->fa  = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]);
-                    grp->fb  = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]);
-                    grp->fa2 = grp->fa*grp->fa;
-                    grp->fb2 = grp->fb*grp->fb;
-                    grp->fab = 2*grp->fa*grp->fb;
-                }
-                if ( !ia_cov || !ib_cov ) continue;
-                int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
-                double *pdg  = call->pdg;
-                for (isample=0; isample<nsmpl; isample++)
-                {
-                    grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
-                    if ( !grp->dp ) continue;
+                    int ismpl = grp->smpl[is];
+                    double *pdg = call->pdg + ismpl*ngts;
                      double val = 0;
-                    if ( !call->ploidy || call->ploidy[isample]==2 )
-                        val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab];
-                    else if ( call->ploidy && call->ploidy[isample]==1 )
-                        val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb];
+                    if ( !call->ploidy || call->ploidy[ismpl]==2 )
+                        val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
+                    else if ( call->ploidy && call->ploidy[ismpl]==1 )
+                        val = fa*pdg[iaa] + fb*pdg[ibb];
                      if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
-                    pdg += ngts;
                  }
                  if ( ia!=0 ) lk_tot += call->theta;    // the prior
                  if ( ib!=0 ) lk_tot += call->theta;
@@ -703,50 +657,38 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
      {
          for (ia=0; ia<nals; ia++)
          {
-            if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue;
+            if ( grp->qsum[ia]==0 ) continue;
              int iaa = (ia+1)*(ia+2)/2-1;
              for (ib=0; ib<ia; ib++)
              {
-                if (  grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue;
+                if ( grp->qsum[ib]==0 ) continue;
                  int ibb = (ib+1)*(ib+2)/2-1;
                  int iab = iaa - ia + ib;
                  for (ic=0; ic<ib; ic++)
                  {
-                    if (  grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue;
+                    if ( grp->qsum[ic]==0 ) continue;
                      double lk_tot  = 0;
-                    int lk_tot_set = 1;
-                    int ia_cov = 0, ib_cov = 0, ic_cov = 0;
-                    for (j=0; j<grps->ngrp; j++)
-                    {
-                        grp1_t *grp = &grps->grp[j];
-                        if ( grp->qsum[ia] ) ia_cov = 1;
-                        if ( grp->qsum[ib] ) ib_cov = 1;
-                        if ( grp->qsum[ic] ) ic_cov = 1;
-                        if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; }
-                        grp->dp  = 1;
-                        grp->fa  = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fb  = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fc  = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]);
-                        grp->fa2 = grp->fa*grp->fa;
-                        grp->fb2 = grp->fb*grp->fb;
-                        grp->fc2 = grp->fc*grp->fc;
-                        grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc;
-                    }
-                    if ( !ia_cov || !ib_cov || !ic_cov ) continue;
-                    int isample, icc = (ic+1)*(ic+2)/2-1;
+                    int lk_tot_set = 0;
+
+                    double fa  = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fb  = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fc  = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]);
+                    double fa2 = fa*fa;
+                    double fb2 = fb*fb;
+                    double fc2 = fc*fc;
+                    double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
+                    int is, icc = (ic+1)*(ic+2)/2-1;
                      int iac = iaa - ia + ic, ibc = ibb - ib + ic;
-                    double *pdg = call->pdg;
-                    for (isample=0; isample<nsmpl; isample++)
+                    for (is=0; is<nsmpl; is++)
                      {
-                        grp1_t *grp = &grps->grp[grps->smpl2grp[isample]];
-                        if ( !grp->dp ) continue;
+                        int ismpl = grp->smpl[is];
+                        double *pdg = call->pdg + ismpl*ngts;
                          double val = 0;
-                        if ( !call->ploidy || call->ploidy[isample]==2 )
-                            val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc];
-                        else if ( call->ploidy && call->ploidy[isample]==1 )
-                            val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc];
+                        if ( !call->ploidy || call->ploidy[ismpl]==2 )
+                            val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+                        else if ( call->ploidy && call->ploidy[ismpl]==1 )
+                            val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
                          if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
-                        pdg += ngts;
                      }
                      if ( ia!=0 ) lk_tot += call->theta;    // the prior
                      if ( ib!=0 ) lk_tot += call->theta;    // the prior
@@ -757,25 +699,26 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als)
          }
      }
  
-    call->ref_lk = ref_lk;
-    call->lk_sum = lk_sum;
-    *out_als = max_als;
-
      int i, n = 0;
      for (i=0; i<nals; i++) if ( max_als & 1<<i) n++;
  
+    grp->max_lk = max_lk;
+    grp->ref_lk = ref_lk;
+    grp->lk_sum = lk_sum;
+    grp->als  = max_als;
+    grp->nals = n;
+
      return n;
  }
  
-static void mcall_set_ref_genotypes(call_t *call, int nals)
+// Sets GT=0/0 or GT=. if PL=0,0,0
+static void mcall_set_ref_genotypes(call_t *call, int nals_ori)
  {
      int i;
-    int ngts  = nals*(nals+1)/2;
+    int ngts  = nals_ori*(nals_ori+1)/2;            // need this to distinguish between GT=0/0 vs GT=.
      int nsmpl = bcf_hdr_nsamples(call->hdr);
  
-    for (i=0; i<nals; i++) call->ac[i] = 0;
-    call->nhets = 0;
-    call->ndiploid = 0;
+    for (i=0; i<nals_ori; i++) call->ac[i] = 0;     // nals_new<=nals_ori, never mind setting extra 0's
  
      // Set all genotypes to 0/0 or 0
      int *gts    = call->gts;
@@ -801,34 +744,27 @@ static void mcall_set_ref_genotypes(call_t *call, int nals)
      }
  }
  
-static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
  {
      int ia, ib, i;
-    int ngts  = nals*(nals+1)/2;
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
-    int nout_gts = nout_als*(nout_als+1)/2;
-    hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs);
-
-    for (i=0; i<nout_als; i++) call->ac[i] = 0;
-    call->nhets = 0;
-    call->ndiploid = 0;
+    int ngts_ori = nals_ori*(nals_ori+1)/2; 
+    int ngts_new = call->nals_new*(call->nals_new+1)/2;
+    int nsmpl = grp->nsmpl;
  
      #if USE_PRIOR_FOR_GTS
          float prior = exp(call->theta);
      #endif
-    float *gps  = call->GPs - nout_gts;
-    double *pdg = call->pdg - ngts;
-    int *gts  = call->gts - 2;
  
-    int isample;
-    for (isample = 0; isample < nsmpl; isample++)
+    int is;
+    for (is = 0; is < nsmpl; is++)
      {
-        int ploidy = call->ploidy ? call->ploidy[isample] : 2;
-        assert( ploidy>=0 && ploidy<=2 );
+        int ismpl   = grp->smpl[is];
+        double *pdg = call->pdg + ismpl*ngts_ori;
+        float *gps  = call->GPs + ismpl*ngts_new;
+        int *gts    = call->gts + ismpl*2;
  
-        pdg += ngts;
-        gts += 2;
-        gps += nout_gts;
+        int ploidy = call->ploidy ? call->ploidy[ismpl] : 2;
+        assert( ploidy>=0 && ploidy<=2 );
  
          if ( !ploidy )
          {
@@ -840,8 +776,8 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
  
          #if !FLAT_PDG_FOR_MISSING
              // Skip samples with zero depth, they have all pdg's equal to 0
-            for (i=0; i<ngts; i++) if ( pdg[i]!=0.0 ) break;
-            if ( i==ngts )
+            for (i=0; i<ngts_ori; i++) if ( pdg[i]!=0.0 ) break;
+            if ( i==ngts_ori )
              {
                  gts[0] = bcf_gt_missing;
                  gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end;
@@ -850,19 +786,16 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
              }
          #endif
  
-        if ( ploidy==2 ) call->ndiploid++;
-
          // Default fallback for the case all LKs are the same
          gts[0] = bcf_gt_unphased(0);
          gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end;
  
          // Non-zero depth, determine the most likely genotype
-        grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]];
          double best_lk = 0;
-        for (ia=0; ia<nals; ia++)
+        for (ia=0; ia<nals_ori; ia++)
          {
-            if ( !(out_als & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
-            int iaa = (ia+1)*(ia+2)/2-1;            // PL index of the ia/ia genotype
+            if ( !(grp->als & 1<<ia) ) continue;    // ia-th allele not in the final selection, skip
+            int iaa = (ia+1)*(ia+2)/2-1;                // PL index of the ia/ia genotype
              double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
              #if USE_PRIOR_FOR_GTS
                  if ( ia!=0 ) lk *= prior;
@@ -878,13 +811,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
          if ( ploidy==2 )
          {
              gts[1] = gts[0];
-            for (ia=0; ia<nals; ia++)
+            for (ia=0; ia<nals_ori; ia++)
              {
-                if ( !(out_als & 1<<ia) ) continue;
+                if ( !(grp->als & 1<<ia) ) continue;
                  int iaa = (ia+1)*(ia+2)/2-1;
                  for (ib=0; ib<ia; ib++)
                  {
-                    if ( !(out_als & 1<<ib) ) continue;
+                    if ( !(grp->als & 1<<ib) ) continue;
                      int iab = iaa - ia + ib;
                      double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
                      #if USE_PRIOR_FOR_GTS
@@ -901,7 +834,6 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
                      }
                  }
              }
-            if ( gts[0] != gts[1] ) call->nhets++;
          }
          else
              gts[1] = bcf_int32_vector_end;
@@ -909,55 +841,50 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
          call->ac[ bcf_gt_allele(gts[0]) ]++;
          if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++;
      }
-    if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
+    if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return;
+    double max, sum;
+    for (is=0; is<nsmpl; is++)
      {
-        double max, sum;
-        for (isample=0; isample<nsmpl; isample++)
-        {
-            gps = call->GPs + isample*nout_gts;
+        int ismpl  = grp->smpl[is];
+        float *gps = call->GPs + ismpl*ngts_new;
  
-            int nmax;
-            if ( call->ploidy )
-            {
-                if ( call->ploidy[isample]==2 ) nmax = nout_gts;
-                else if ( call->ploidy[isample]==1 ) nmax = nout_als;
-                else nmax = 0;
-            }
-            else nmax = nout_gts;
+        int nmax;
+        if ( call->ploidy )
+        {
+            if ( call->ploidy[ismpl]==2 ) nmax = ngts_new;
+            else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals;
+            else nmax = 0;
+        }
+        else nmax = ngts_new;
  
-            max = gps[0];
-            if ( max<0 || nmax==0 )
-            {
-                // no call
-                if ( call->output_tags & CALL_FMT_GP )
-                {
-                    for (i=0; i<nmax; i++) gps[i] = 0;
-                    if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
-                    if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
-                }
-                call->GQs[isample] = 0;
-                continue;
-            }
-            sum = gps[0];
-            for (i=1; i<nmax; i++)
-            {
-                if ( max < gps[i] ) max = gps[i];
-                sum += gps[i];
-            }
-            max = -4.34294*log(1 - max/sum);
-            call->GQs[isample] = max<=INT8_MAX ? max : INT8_MAX;
+        max = gps[0];
+        if ( max<0 || nmax==0 )
+        {
+            // no call
              if ( call->output_tags & CALL_FMT_GP )
              {
-                assert( max );
-                for (i=0; i<nmax; i++) gps[i] = (int)(-4.34294*log(gps[i]/sum));
-                if ( nmax < nout_gts ) bcf_float_set_vector_end(gps[nmax]);
+                for (i=0; i<nmax; i++) gps[i] = 0;
+                if ( nmax==0 ) { bcf_float_set_missing(gps[i]); nmax++; }
+                if ( nmax < ngts_new ) bcf_float_set_vector_end(gps[nmax]);
              }
+            call->GQs[ismpl] = 0;
+            continue;
+        }
+        sum = gps[0];
+        for (i=1; i<nmax; i++)
+        {
+            if ( max < gps[i] ) max = gps[i];
+            sum += gps[i];
+        }
+        max = -4.34294*log(1 - max/sum);
+        call->GQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX;
+        if ( call->output_tags & CALL_FMT_GP )
+        {
+            assert( max );
+            for (i=0; i<nmax; i++) gps[i] = gps[i]/sum;
+            for (; i<ngts_new; i++) bcf_float_set_vector_end(gps[i]);
          }
      }
-    if ( call->output_tags & CALL_FMT_GP )
-        bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts);
-    if ( call->output_tags & CALL_FMT_GQ )
-        bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
  }
  
  
@@ -980,12 +907,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a
      Individual qualities are calculated as
          GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y)
   */
-static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+#if 0
+static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new)
  {
      int ia, ib, i;
      int nsmpl    = bcf_hdr_nsamples(call->hdr);
      int ngts     = nals*(nals+1)/2;
-    int nout_gts = nout_als*(nout_als+1)/2;
+    int nout_gts = nals_new*(nals_new+1)/2;
      double *gls  = call->GLs - nout_gts;
      double *pdg  = call->pdg - ngts;
  
@@ -1015,7 +943,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          double best_lk = 0;
          for (ia=0; ia<nals; ia++)
          {
-            if ( !(out_als & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
+            if ( !(als_new & 1<<ia) ) continue;     // ia-th allele not in the final selection, skip
              int iaa   = bcf_alleles2gt(ia,ia);      // PL index of the ia/ia genotype
              int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
              double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia];
@@ -1031,10 +959,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          {
              for (ia=0; ia<nals; ia++)
              {
-                if ( !(out_als & 1<<ia) ) continue;
+                if ( !(als_new & 1<<ia) ) continue;
                  for (ib=0; ib<ia; ib++)
                  {
-                    if ( !(out_als & 1<<ib) ) continue;
+                    if ( !(als_new & 1<<ib) ) continue;
                      int iab   = bcf_alleles2gt(ia,ib);
                      int idx   = bcf_alleles2gt(call->als_map[ia],call->als_map[ib]);
                      double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib];
@@ -1078,8 +1006,8 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
      for (ifm=0; ifm<call->nfams; ifm++)
      {
          family_t *fam = &call->fams[ifm];
-        int ntrio = call->ntrio[fam->type][nout_als];
-        uint16_t *trio = call->trio[fam->type][nout_als];
+        int ntrio = call->ntrio[fam->type][nals_new];
+        uint16_t *trio = call->trio[fam->type][nals_new];
  
          // Unconstrained likelihood
          int uc_itr = 0;
@@ -1227,11 +1155,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n
          bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl);
      }
  }
+#endif
  
-static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
  {
-    int ngts  = nals*(nals+1)/2;
-    int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2;     // number of PL values in diploid samples, ori and new
+    int npls_src = nals_ori*(nals_ori+1)/2;
+    int npls_dst = nals_new*(nals_new+1)/2;     // number of PL values in diploid samples, ori and new
      if ( call->all_diploid && npls_src == npls_dst ) return;
  
      int *pls_src = call->PLs, *pls_dst = call->PLs;
@@ -1248,7 +1177,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
          }
          else if ( ploidy==1 )
          {
-            for (ia=0; ia<nout_als; ia++)
+            for (ia=0; ia<nals_new; ia++)
              {
                  int isrc = (ia+1)*(ia+2)/2-1;
                  pls_dst[ia] = pls_src[ call->pl_map[isrc] ];
@@ -1258,7 +1187,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
          else
          {
              pls_dst[0] = bcf_int32_missing;
-            pls_dst[1] = bcf_int32_vector_end;  // relying on nout_als>1 in mcall()
+            pls_dst[1] = bcf_int32_vector_end;  // relying on nals_new>1 in mcall()
          }
          pls_src += npls_src;
          pls_dst += npls_dst;
@@ -1266,9 +1195,9 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in
      bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl);
  }
  
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new)
  {
-    if ( nals==nout_als ) return;
+    if ( nals_ori==nals_new ) return;
  
      int i,j, nret, size = sizeof(float);
  
@@ -1287,17 +1216,17 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o
          nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
          if ( nret<=0 ) continue;
  
-        if ( nout_als==1 )
+        if ( nals_new==1 )
              bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1);     // has to be the REF, the order could not change
          else
          {
-            for (j=0; j<nals; j++)
+            for (j=0; j<nals_ori; j++)
              {
                  int k = call->als_map[j];
                  if ( k==-1 ) continue;   // to be dropped
                  memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
              }
-            bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
+            bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new);
          }
      }
  
@@ -1314,21 +1243,21 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o
          if (nret<=0) continue;
          int nsmpl = bcf_hdr_nsamples(call->hdr);
  
-        assert( nret==nals*nsmpl );
+        assert( nret==nals_ori*nsmpl );
  
          for (j=0; j<nsmpl; j++)
          {
-            char *ptr_src = (char *)tmp_ori + j*nals*size;
-            char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+            char *ptr_src = (char *)tmp_ori + j*nals_ori*size;
+            char *ptr_dst = (char *)tmp_new + j*nals_new*size;
              int k;
-            for (k=0; k<nals; k++)
+            for (k=0; k<nals_ori; k++)
              {
                  int l = call->als_map[k];
                  if ( l==-1 ) continue;   // to be dropped
                  memcpy(ptr_dst+size*l, ptr_src+size*k, size);
              }
          }
-        bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
+        bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl);
      }
  
      call->PLs    = (int32_t*) tmp_new;
@@ -1443,12 +1372,12 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
      }
      bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl);
  
-    // update QS
-    int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
-    hts_expand(float,nals,call->nqsum,call->qsum);
+    // update QS, use temporarily call->GPs to store the values
+    int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
+    hts_expand(float,nals,call->nGPs,call->GPs);
      for (i=0; i<nals; i++)
-        call->qsum[i] = call->als_map[i]<nqs ? call->smpl_grp.grp[0].qsum[call->als_map[i]] : 0;
-    bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals);
+        call->GPs[i] = call->als_map[i]<nqs ? call->smpl_grp[0].qsum[call->als_map[i]] : 0;
+    bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals);
  
      // update any Number=R tags
      void *tmp_ori = call->itmp, *tmp_new = call->PLs;  // reusing PLs storage which is not used at this point
@@ -1489,7 +1418,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
      call->itmp   = (int32_t*) tmp_ori;
      call->n_itmp = ntmp_ori;
  
-
      if ( *unseen ) *unseen = nals-1;
      return 0;
  }
@@ -1508,203 +1436,229 @@ int mcall(call_t *call, bcf1_t *rec)
      // Force alleles when calling genotypes given alleles was requested
      if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
  
-    int nsmpl = bcf_hdr_nsamples(call->hdr);
-    int nals  = rec->n_allele;
-    hts_expand(int,nals,call->nac,call->ac);
-    hts_expand(int,nals,call->nals_map,call->als_map);
-    hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map);
+    int nsmpl    = bcf_hdr_nsamples(call->hdr);
+    int nals_ori = rec->n_allele;
+    hts_expand(int,nals_ori,call->nac,call->ac);
+    hts_expand(int,nals_ori,call->nals_map,call->als_map);
+    hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map);
  
      // Get the genotype likelihoods
      call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs);
-    if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals )  // a mixture of diploid and haploid or haploid only
-        error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs);
+    if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori )  // a mixture of diploid and haploid or haploid only
+        error("Wrong number of PL fields? nals=%d npl=%d\n", nals_ori,call->nPLs);
  
      // Convert PLs to probabilities
-    int ngts = nals*(nals+1)/2;
+    int ngts_ori = nals_ori*(nals_ori+1)/2;
      hts_expand(double, call->nPLs, call->npdg, call->pdg);
-    set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen);
+    set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen);
  
      // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
-    if ( call->smpl_grp.ngrp == 1  )
+    if ( call->nsmpl_grp == 1  )
      {
-        int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum);
+        int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum);
          if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
-        if ( nqs < nals )
+        if ( nqs < nals_ori )
          {
              // Some of the listed alleles do not have the corresponding QS field. This is
              // typically ref-only site with <*> in ALT.
-            hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum);
-            for (i=nqs; i<nals; i++) call->smpl_grp.grp[0].qsum[i] = 0;
+            hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum);
+            for (i=nqs; i<nals_ori; i++) call->smpl_grp[0].qsum[i] = 0;
          }
      }
      else
      {
-        for (j=0; j<call->smpl_grp.ngrp; j++)
+        for (j=0; j<call->nsmpl_grp; j++)
          {
-            hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum);
-            memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals);
+            hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum);
+            memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori);
          }
  
-        int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs);
-        if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n");
+        // Use FORMAT/AD or FORMAT/QS
+        int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs);
+        if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with \"-a AD\" or \"-a QS\"\n",call->sample_groups_tag);
          nad /= bcf_hdr_nsamples(call->hdr);
-        hts_expand(float,nals,call->nqsum,call->qsum);
-        float qsum = 0;
-        for (i=0; i<bcf_hdr_nsamples(call->hdr); i++)
+        for (i=0; i<call->nsmpl_grp; i++)
          {
-            int32_t *ptr = call->ADs + i*nad;
-            for (j=0; j<nad; j++)
+            int is;
+            smpl_grp_t *grp = &call->smpl_grp[i];
+            hts_expand(float,nals_ori,grp->nqsum,grp->qsum);
+            for (j=0; j<nals_ori; j++) grp->qsum[j] = 0;
+            for (is=0; is<grp->nsmpl; is++)
              {
-                if ( ptr[j]==bcf_int32_vector_end ) break;
-                if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0;
-                else { call->qsum[j] = ptr[j]; qsum += ptr[j]; }
+                int ismpl = grp->smpl[is];
+                int32_t *ptr = call->ADs + ismpl*nad;
+                float sum = 0;
+                for (j=0; j<nad; j++)
+                {
+                    if ( ptr[j]==bcf_int32_vector_end ) break;
+                    if ( ptr[j]!=bcf_int32_missing ) sum += ptr[j];
+                }
+                if ( sum )
+                {
+                    for (j=0; j<nad; j++)
+                    {
+                        if ( ptr[j]==bcf_int32_vector_end ) break;
+                        if ( ptr[j]!=bcf_int32_missing ) grp->qsum[j] += ptr[j]/sum;
+                    }
+                }
              }
-            for (; j<nals; j++) call->qsum[j] = 0;
-            if ( qsum ) 
-                for (j=0; j<nals; j++) call->qsum[j] /= qsum;
-
-            grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]];
-            for (j=0; j<nals; j++)
-                grp->qsum[j] += call->qsum[j];
          }
      }
  
      // If available, take into account reference panel AFs
      if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
      {
-        int an = call->ac[0];
-        if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
+        int an = call->ac[0];   // number of alleles total, proceed only if not zero; reuse call->ac
+        if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 )    // number of ALT alleles
          {
-            int ac0 = an;   // number of alleles in the reference population
-            for (i=0; i<nals-1; i++)
+            int ac0 = an;       // this will become the number of REFs
+            for (i=0; i<nals_ori-1; i++)
              {
                  if ( call->ac[i]==bcf_int32_vector_end ) break;
                  if ( call->ac[i]==bcf_int32_missing ) continue;
                  ac0 -= call->ac[i];
-                for (j=0; j<call->smpl_grp.ngrp; j++)
-                    call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5;
+
+                // here an*0.5 is the number of samples in the population and ac*0.5 is the AF weighted by the number of samples
+                for (j=0; j<call->nsmpl_grp; j++)
+                    call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an);
              }
              if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
-            for (j=0; j<call->smpl_grp.ngrp; j++)
-                call->smpl_grp.grp[j].qsum[0] += ac0*0.5;
-            for (i=0; i<nals; i++)
-            {
-                for (j=0; j<call->smpl_grp.ngrp; j++)
-                    call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an;
-            }
+            for (j=0; j<call->nsmpl_grp; j++)
+                call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an);
          }
      }
  
-    for (j=0; j<call->smpl_grp.ngrp; j++)
+    // normalize so that QS sums to 1 for each group
+    for (j=0; j<call->nsmpl_grp; j++)
      {
-        float qsum_tot = 0;
-        for (i=0; i<nals; i++) qsum_tot += call->smpl_grp.grp[j].qsum[i];
-        if ( qsum_tot ) for (i=0; i<nals; i++) call->smpl_grp.grp[j].qsum[i] /= qsum_tot;
+        float sum = 0;
+        for (i=0; i<nals_ori; i++) sum += call->smpl_grp[j].qsum[i];
+        if ( sum ) for (i=0; i<nals_ori; i++) call->smpl_grp[j].qsum[i] /= sum;
      }
  
      bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0);      // remove QS tag
  
-    // Find the best combination of alleles
-    int out_als, nout;
-    if ( nals > 8*sizeof(out_als) )
+    if ( nals_ori > 8*sizeof(call->als_new) )
      { 
          fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
          return 0; 
      }
-    nout = mcall_find_best_alleles(call, nals, &out_als);
  
-    // Make sure the REF allele is always present
-    if ( !(out_als&1) )
+    // For each group find the best combination of alleles
+    call->als_new = 0;
+    double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL;
+    for (j=0; j<call->nsmpl_grp; j++)
      {
-        out_als |= 1;
-        nout++;
+        smpl_grp_t *grp = &call->smpl_grp[j];
+        mcall_find_best_alleles(call, nals_ori, grp);
+        call->als_new |= grp->als;
+        if ( grp->max_lk==-HUGE_VAL ) continue;
+        double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk));
+        if ( max_qual < qual )
+        {
+            max_qual = qual;
+            lk_sum = grp->lk_sum;
+            ref_lk = grp->ref_lk;
+        }
      }
-    int is_variant = out_als==1 ? 0 : 1;
+
+    // Make sure the REF allele is always present
+    if ( !(call->als_new&1) ) call->als_new |= 1;
+
+    int is_variant = call->als_new==1 ? 0 : 1;
      if ( call->flag & CALL_VARONLY && !is_variant ) return 0;
  
-    // With -A, keep all ALTs except X
-    if ( call->flag & CALL_KEEPALT )
+    call->nals_new = 0;
+    for (i=0; i<nals_ori; i++)
      {
-        nout = 0;
-        for (i=0; i<nals; i++)
-        {
-            if ( i>0 && i==unseen ) continue;
-            out_als |= 1<<i;
-            nout++;
-        }
+        if ( i>0 && i==unseen ) continue;
+        if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
+        if ( call->als_new & (1<<i) ) call->nals_new++;
      }
  
+    init_allele_trimming_maps(call,nals_ori,call->als_new);
+
      int nAC = 0;
-    if ( out_als==1 )   // only REF allele on output
+    if ( call->als_new==1 )   // only REF allele on output
      {
-        init_allele_trimming_maps(call, 1, nals);
-        mcall_set_ref_genotypes(call,nals);
+        mcall_set_ref_genotypes(call,nals_ori);
          bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0);    // remove PL, useless now
      }
+    else if ( !is_variant )
+    {
+        mcall_set_ref_genotypes(call,nals_ori);     // running with -A, prevent mcall_call_genotypes from putting some ALT back
+        mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new);
+    }
      else
      {
          // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes.
          // Note that it is a valid outcome if the called genotypes exclude some of the ALTs.
-        init_allele_trimming_maps(call, out_als, nals);
-        if ( !is_variant )
-            mcall_set_ref_genotypes(call,nals);     // running with -A, prevent mcall_call_genotypes from putting some ALT back
-        else if ( call->flag & CALL_CONSTR_TRIO )
+        int ngts_new = call->nals_new*(call->nals_new+1)/2;
+        hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs);
+        for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
+
+        if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
+        { 
+            fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
+            return 0; 
+        }
+        if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
          {
-            if ( nout>4 ) 
-            { 
-                fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
-                return 0; 
-            }
-            mcall_call_trio_genotypes(call, rec, nals,nout,out_als);
+            memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs));
+            memset(call->GQs,0,nsmpl*sizeof(*call->GQs));
+        }
+        for (i=0; i<call->nsmpl_grp; i++)
+        {
+            if ( call->flag & CALL_CONSTR_TRIO )
+                error("todo: constrained trio calling temporarily disabled\n");   //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]);
+            else
+                mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]);
          }
-        else
-            mcall_call_genotypes(call,rec,nals,nout,out_als);
  
          // Skip the site if all samples are 0/0. This can happen occasionally.
-        nAC = 0;
-        for (i=1; i<nout; i++) nAC += call->ac[i];
+        for (i=1; i<call->nals_new; i++) nAC += call->ac[i];
          if ( !nAC && call->flag & CALL_VARONLY ) return 0;
-        mcall_trim_PLs(call, rec, nals, nout, out_als);
+
+        if ( call->output_tags & CALL_FMT_GP )
+            bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new);
+        if ( call->output_tags & CALL_FMT_GQ )
+            bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl);
+
+        mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new);
      }
-    if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als);
+    if ( nals_ori!=call->nals_new )
+        mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new);
  
-    // Set QUAL and calculate HWE-related annotations
+    // Set QUAL
      if ( nAC )
      {
-        float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid);
-        if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1);
-
-        float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid);
-        if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
-
          // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
-        rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
+        rec->qual = max_qual;
      }
      else
      {
          // Set the quality of a REF site
-        if ( call->lk_sum==-HUGE_VAL )  // no support from (high quality) reads, so QUAL=1-prior
+        if ( lk_sum!=-HUGE_VAL )  // a group supplied lk_sum, derive QUAL from lk_sum and ref_lk
+            rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk));
+        else if ( call->ac[0] )   // no support from (high quality) reads, so QUAL=1-prior
              rec->qual = call->theta ? -4.343*call->theta : 0;
          else
-            rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
+            bcf_float_set_missing(rec->qual);
      }
  
-    if ( rec->qual>999 ) rec->qual = 999;
-    if ( rec->qual>50 ) rec->qual = rint(rec->qual);
-
      // AC, AN
-    if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1);
+    if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1);
      nAC += call->ac[0];
      bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1);
  
      // Remove unused alleles
-    hts_expand(char*,nout,call->nals,call->als);
-    for (i=0; i<nals; i++)
+    hts_expand(char*,call->nals_new,call->nals,call->als);
+    for (i=0; i<nals_ori; i++)
          if ( call->als_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i];
-    bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout);
+    bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new);
      bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2);
  
-    // DP4 tag
+    // DP4 and PV4 tags
      if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 )
      {
          int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3];
@@ -1712,10 +1666,22 @@ int mcall(call_t *call, bcf1_t *rec)
  
          int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]);
          bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1);
+
+        if ( call->output_tags & CALL_FMT_PV4 )
+        {
+            anno16_t a;
+            float tmpf[4];
+            int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
+            if ( is_tested ) 
+            {
+                for (i=0; i<4; i++) tmpf[i] = a.p[i];
+                bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
+            }
+        }
      }
  
      bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0);     // remove I16 tag
  
-    return nout;
+    return call->nals_new;
  }
  
diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c

index c621b4cc4b0648f7efd84feefb7c06fb0498774e..1f40eff8889405dc1c9a10d27e3b4ba1a4a362d0 100644 (file)
--- a/bcftools/mpileup.c
+++ b/bcftools/mpileup.c
@@ -1,6 +1,6 @@
  /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
  
-    Copyright (C) 2008-2018 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <htslib/faidx.h>
  #include <htslib/kstring.h>
  #include <htslib/khash_str2int.h>
+#include <htslib/hts_os.h>
  #include <assert.h>
  #include "regidx.h"
  #include "bcftools.h"
@@ -59,16 +60,19 @@ DEALINGS IN THE SOFTWARE.  */
  #define MPLP_PRINT_MAPQ (1<<10)
  #define MPLP_PER_SAMPLE (1<<11)
  #define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_REALN_PARTIAL  (1<<13)
  
  typedef struct _mplp_aux_t mplp_aux_t;
  typedef struct _mplp_pileup_t mplp_pileup_t;
  
  // Data shared by all bam files
  typedef struct {
-    int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+    int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
+        max_indel_depth, max_read_len, fmt_flag, ambig_reads;
      int rflag_require, rflag_filter, output_type;
      int openQ, extQ, tandemQ, min_support; // for indels
      double min_frac; // for indels
+    double indel_bias;
      char *reg_fname, *pl_list, *fai_fname, *output_fname;
      int reg_is_file, record_cmd_line, n_threads;
      faidx_t *fai;
@@ -231,7 +235,46 @@ static int mplp_func(void *data, bam1_t *b)
              has_ref = 0;
          }
  
-        if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+        // Allow sufficient room for bam_aux_append of ZQ tag without
+        // a realloc and consequent breakage of pileup's cached pointers.
+        if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
+            // Doing sam_prob_realn later is problematic as it adds to
+            // the tag list (ZQ or BQ), which causes a realloc of b->data.
+            // This happens after pileup has built a hash table on the
+            // read name.  It's a deficiency in pileup IMO.
+
+            // We could implement a new sam_prob_realn that returns ZQ
+            // somewhere else and cache it ourselves (pileup clientdata),
+            // but for now we simply use a workaround.
+            //
+            // We create a fake tag of the correct length, which we remove
+            // just prior to calling sam_prob_realn so we can guarantee there is
+            // room. (We can't just make room now as bam_copy1 removes it
+            // again).
+            if (b->core.l_qseq > 500) {
+                uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
+                memset(ZQ, '@', b->core.l_qseq);
+                ZQ[b->core.l_qseq] = 0;
+                bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+                free(ZQ);
+            } else {
+                static uint8_t ZQ[501] =
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
+                ZQ[b->core.l_qseq] = 0;
+                bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+                ZQ[b->core.l_qseq] = '@';
+            }
+        }
+
          if (has_ref && ma->conf->capQ_thres > 10) {
              int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
              if (q < 0) continue;    // skip
@@ -257,18 +300,46 @@ static int mplp_func(void *data, bam1_t *b)
  static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
  {
      mplp_aux_t *ma = (mplp_aux_t *)data;
-    cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1;
-    if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) )
-    {
-        int i;
-        for (i=0; i<b->core.n_cigar; i++)
-        {
-            int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
-            if ( cig!=BAM_CSOFT_CLIP ) continue;
-            cd->i |= 1;
+    int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+    cd->i = 0;
+    PLP_SET_SAMPLE_ID(cd->i, n);
+    // Whether read has a soft-clip is used in mplp_realn's heuristics.
+    // TODO: consider whether clip length is beneficial to use?
+    int i;
+    for (i=0; i<b->core.n_cigar; i++) {
+        int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CSOFT_CLIP) {
+            PLP_SET_SOFT_CLIP(cd->i);
              break;
          }
      }
+
+    if (ma->conf->flag & MPLP_REALN) {
+        int i, tot_ins = 0;
+        uint32_t *cigar = bam_get_cigar(b);
+        int p = 0;
+        for (i=0; i<b->core.n_cigar; i++) {
+            int cig = cigar[i] & BAM_CIGAR_MASK;
+            if (bam_cigar_type(cig) & 2)
+                p += cigar[i] >> BAM_CIGAR_SHIFT;
+            if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
+                tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+                // Possible further optimisation, check tot_ins==1 later
+                // (and remove break) so we can detect single bp indels.
+                // We may want to focus BAQ on more complex regions only.
+                PLP_SET_INDEL(cd->i);
+                break;
+            }
+
+            // TODO: proper p->cd struct and have cd->i as a size rather
+            // than a flag.
+
+            // Then aggregate together the sizes and if just 1 size for all
+            // reads or 2 sizes for approx 50/50 split in all reads, then
+            // treat this as a well-aligned variant and don't run BAQ.
+        }
+    }
+
      return 0;
  }
  
@@ -282,7 +353,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c
          {
              const bam_pileup1_t *p = plp[i] + j;
              int id = PLP_SAMPLE_ID(p->cd.i);
-            if (m->n_plp[id] == m->m_plp[id]) 
+            if (m->n_plp[id] == m->m_plp[id])
              {
                  m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
                  m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
@@ -317,6 +388,150 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc
      if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
  }
  
+/*
+ * Looks for an indel at this position.
+ *
+ * Only reads that overlap an indel locus get realigned.  This considerably
+ * reduces the cost of running BAQ while keeping the main benefits.
+ *
+ * TODO: also consider only realigning reads that don't span the indel
+ * by more than a certain amount either-side.  Ie focus BAQ only on reads
+ * ending adjacent to the indel, where the alignment is most likely to
+ * be wrong.  (2nd TODO: do this based on sequence context; STRs bad, unique
+ * data good.)
+ *
+ * NB: this may sadly realign after we've already used the data.  Hmm...
+ */
+static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
+                       int flag, int max_read_len,
+                       char *ref, int ref_len, int pos) {
+    int i, j, has_indel = 0, has_clip = 0, nt = 0;
+    int min_indel = INT_MAX, max_indel = INT_MIN;
+
+    // Is an indel present.
+    // NB: don't bother even checking if very long as almost guaranteed
+    // to have indel (and likely soft-clips too).
+    for (i = 0; i < n; i++) { // iterate over bams
+        nt += n_plp[i];
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
+            has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+            // Has_clip is almost always true for very long reads
+            // (eg PacBio CCS), but these rarely matter as the clip
+            // is likely a long way from this indel.
+            has_clip  += (PLP_HAS_SOFT_CLIP(p->cd.i))         ? 1 : 0;
+            if (max_indel < p->indel)
+                max_indel = p->indel;
+            if (min_indel > p->indel)
+                min_indel = p->indel;
+        }
+    }
+
+    if (flag & MPLP_REALN_PARTIAL) {
+        if (has_indel == 0 ||
+            (has_clip < 0.2*nt && max_indel == min_indel &&
+             (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
+            return;
+    }
+
+    // Realign
+    for (i = 0; i < n; i++) { // iterate over bams
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            const bam_pileup1_t *p = plp[i] + j;
+            bam1_t *b = p->b;
+
+            // Avoid doing multiple times.
+            //
+            // Note we cannot modify p->cd.i here with a PLP_SET macro
+            // because the cd item is held by mpileup in an lbnode_t
+            // struct and copied over to the pileup struct for each
+            // iteration, essentially making p->cd.i read only.
+            //
+            // We could use our own structure (p->cd.p), allocated during
+            // the constructor, but for simplicity we play dirty and
+            // abuse an unused flag bit instead.
+            if (b->core.flag & 32768)
+                continue;
+            b->core.flag |= 32768;
+
+            if (b->core.l_qseq > max_read_len)
+                continue;
+
+            // Check p->cigar_ind and see what cigar elements are before
+            // and after.  How close is this location to the end of the
+            // read?  Only realign if we don't span by more than X bases.
+            //
+            // Again, best only done on deeper data as BAQ helps
+            // disproportionately more on shallow data sets.
+            //
+            // This rescues some of the false negatives that are caused by
+            // systematic reduction in quality due to sample vs ref alignment.
+
+// At deep coverage we skip realigning more reads as we have sufficient depth.
+// This rescues some false negatives.  At shallow depth we pay for this with
+// more FP so are more stringent on spanning size.
+#define REALN_DIST (40+10*(nt<40)+10*(nt<20))
+            uint32_t *cig = bam_get_cigar(b);
+            int ncig = b->core.n_cigar;
+
+            // Don't realign reads where indel is in middle?
+            // On long read data we don't care about soft-clips at the ends.
+            // For short read data, we always calc BAQ on these as they're
+            // a common source of false positives.
+            if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
+                // Left & right cigar op match.
+                int lr = b->core.l_qseq > 500;
+                int lm = 0, rm = 0, k;
+                for (k = 0; k < ncig; k++) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        lm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                for (k = ncig-1; k >= 0; k--) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        rm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
+                    continue;
+
+                if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                    has_clip < (0.15+0.05*(nt>20))*nt)
+                    continue;
+            }
+
+            if (b->core.l_qseq > 500) {
+                // don't do BAQ on long-read data if it's going to
+                // cause us to have a large band width and be costly in CPU
+                int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+                if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000)
+                    continue;
+            }
+
+            // Fudge: make room for ZQ tag.
+            uint8_t *_Q = bam_aux_get(b, "_Q");
+            if (_Q) bam_aux_del(b, _Q);
+            sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
+        }
+    }
+
+    return;
+}
+
  static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
  {
      bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
@@ -324,7 +539,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
      int ret, i, tid, pos, ref_len;
      char *ref;
  
-    while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) 
+    while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
      {
          if ( pos<beg || pos>end ) continue;
          if ( conf->bed && tid >= 0 )
@@ -333,7 +548,10 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
              if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
              if ( !overlap ) continue;
          }
-        mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+        int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+        if (has_ref && (conf->flag & MPLP_REALN))
+            mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
+                       conf->max_read_len, ref, ref_len, pos);
  
          int total_depth, _ref0, ref16;
          for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
@@ -346,18 +564,19 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
          conf->bc.tid = tid; conf->bc.pos = pos;
          bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
          bcf_clear1(conf->bcf_rec);
-        bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+        bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
+                     conf->bca, 0);
          flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
  
          // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
          // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
-        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth 
-            && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+            && (bcf_callaux_clean(conf->bca, &conf->bc),
+                bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
          {
-            bcf_callaux_clean(conf->bca, &conf->bc);
              for (i = 0; i < conf->gplp->n; ++i)
                  bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
-            if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) 
+            if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
              {
                  bcf_clear1(conf->bcf_rec);
                  bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
@@ -461,7 +680,7 @@ static int mpileup(mplp_conf_t *conf)
              conf->buf.l = 0;
              ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
              conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
-            if ( !conf->mplp_data[i]->iter ) 
+            if ( !conf->mplp_data[i]->iter )
              {
                  conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                  if ( conf->mplp_data[i]->iter ) {
@@ -487,15 +706,19 @@ static int mpileup(mplp_conf_t *conf)
              conf->mplp_data[i]->h = hdr;
          }
      }
+    if ( !hdr ) {
+        fprintf(stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
+        exit(EXIT_FAILURE);
+    }
      // allocate data storage proportionate to number of samples being studied sm->n
      bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
      conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
      conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
-    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));  
+    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
  
      fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
      // write the VCF header
-    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
      if (conf->bcf_fp == NULL) {
          fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
          exit(EXIT_FAILURE);
@@ -542,11 +765,24 @@ static int mpileup(mplp_conf_t *conf)
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
      if ( conf->fmt_flag&B2B_INFO_VDB )
          bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
-    if ( conf->fmt_flag&B2B_INFO_RPB )
-        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+
+    if (conf->fmt_flag & B2B_INFO_ZSCORE) {
+        if ( conf->fmt_flag&B2B_INFO_RPB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+        if ( conf->fmt_flag&B2B_INFO_SCB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+    } else {
+        if ( conf->fmt_flag&B2B_INFO_RPB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+    }
+
+    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
  #if CDF_MWU_TESTS
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
@@ -576,6 +812,8 @@ static int mpileup(mplp_conf_t *conf)
          bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
      if ( conf->fmt_flag&B2B_FMT_ADR )
          bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
+    if ( conf->fmt_flag&B2B_FMT_QS )
+        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
      if ( conf->fmt_flag&B2B_INFO_AD )
          bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
      if ( conf->fmt_flag&B2B_INFO_ADF )
@@ -595,17 +833,23 @@ static int mpileup(mplp_conf_t *conf)
          bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
      if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
  
-    conf->bca = bcf_call_init(-1., conf->min_baseQ);
+    conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
+                              conf->delta_baseQ);
      conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
      conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+    conf->bca->indel_bias = conf->indel_bias;
      conf->bca->min_frac = conf->min_frac;
      conf->bca->min_support = conf->min_support;
      conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
      conf->bca->fmt_flag = conf->fmt_flag;
+    conf->bca->ambig_reads = conf->ambig_reads;
  
      conf->bc.bcf_hdr = conf->bcf_hdr;
      conf->bc.n  = nsmpl;
      conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+    conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
+    for (i=0; i<nsmpl; i++)
+        conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
      if (conf->fmt_flag)
      {
          assert( sizeof(float)==sizeof(int32_t) );
@@ -643,7 +887,7 @@ static int mpileup(mplp_conf_t *conf)
      if ( nregs )
      {
          int ireg = 0;
-        do 
+        do
          {
              // first region is already positioned
              if ( ireg++ > 0 )
@@ -651,11 +895,11 @@ static int mpileup(mplp_conf_t *conf)
                  conf->buf.l = 0;
                  ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
  
-                for (i=0; i<conf->nfiles; i++) 
+                for (i=0; i<conf->nfiles; i++)
                  {
                      hts_itr_destroy(conf->mplp_data[i]->iter);
                      conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
-                    if ( !conf->mplp_data[i]->iter ) 
+                    if ( !conf->mplp_data[i]->iter )
                      {
                          conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                          if ( conf->mplp_data[i]->iter ) {
@@ -690,6 +934,7 @@ static int mpileup(mplp_conf_t *conf)
          free(conf->bc.ADR);
          free(conf->bc.ADF);
          free(conf->bc.SCR);
+        free(conf->bc.QS);
          free(conf->bc.fmt_arr);
          free(conf->bcr);
      }
@@ -793,10 +1038,12 @@ int parse_format_flag(const char *str)
          else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
          else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
          else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
+        else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
          else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
          else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
          else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
          else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+        else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
          else
          {
              fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
@@ -821,6 +1068,7 @@ static void list_annotations(FILE *fp)
  "  FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
  "  FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
  "  FORMAT/DP  .. Number of high-quality bases (Number=1,Type=Integer)\n"
+"  FORMAT/QS  .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
  "  FORMAT/SP  .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
  "  FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
  "\n"
@@ -843,78 +1091,98 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
      // source code in 80 columns, to the extent that's possible.)
  
      fprintf(fp,
-"\n"
-"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
-"\n"
-"Input options:\n"
-"  -6, --illumina1.3+      quality is in the Illumina-1.3+ encoding\n"
-"  -A, --count-orphans     do not discard anomalous read pairs\n"
-"  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
-"  -B, --no-BAQ            disable BAQ (per-Base Alignment Quality)\n"
-"  -C, --adjust-MQ INT     adjust mapping quality; recommended:50, disable:0 [0]\n"
-"  -d, --max-depth INT     max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+        "\n"
+        "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+        "\n"
+        "Input options:\n"
+        "  -6, --illumina1.3+      quality is in the Illumina-1.3+ encoding\n"
+        "  -A, --count-orphans     do not discard anomalous read pairs\n"
+        "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
+        "  -B, --no-BAQ            disable BAQ (per-Base Alignment Quality)\n"
+        "  -C, --adjust-MQ INT     adjust mapping quality [0]\n"
+        "  -D, --full-BAQ          Apply BAQ everywhere, not just in problematic regions\n"
+        "  -d, --max-depth INT     max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+            fprintf(fp,
+        "  -E, --redo-BAQ          recalculate BAQ on the fly, ignore existing BQs\n"
+        "  -f, --fasta-ref FILE    faidx indexed reference sequence file\n"
+        "      --no-reference      do not require fasta reference file\n"
+        "  -G, --read-groups FILE  select or exclude read groups listed in the file\n"
+        "  -q, --min-MQ INT        skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
      fprintf(fp,
-"  -E, --redo-BAQ          recalculate BAQ on the fly, ignore existing BQs\n"
-"  -f, --fasta-ref FILE    faidx indexed reference sequence file\n"
-"      --no-reference      do not require fasta reference file\n"
-"  -G, --read-groups FILE  select or exclude read groups listed in the file\n"
-"  -q, --min-MQ INT        skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+        "  -Q, --min-BQ INT        skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
      fprintf(fp,
-"  -Q, --min-BQ INT        skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+        "      --max-BQ INT        limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
      fprintf(fp,
-"  -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
-"  -R, --regions-file FILE restrict to regions listed in a file\n"
-"      --ignore-RG         ignore RG tags (one BAM = one sample)\n"
-"  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+        "      --delta-BQ INT      Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
      fprintf(fp,
-"  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
-"                                            [%s]\n", tmp_filter);
+        "  -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+        "  -R, --regions-file FILE restrict to regions listed in a file\n"
+        "      --ignore-RG         ignore RG tags (one BAM = one sample)\n"
+        "  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
      fprintf(fp,
-"  -s, --samples LIST      comma separated list of samples to include\n"
-"  -S, --samples-file FILE file of samples to include\n"
-"  -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
-"  -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
-"  -x, --ignore-overlaps   disable read-pair overlap detection\n"
-"\n"
-"Output options:\n"
-"  -a, --annotate LIST     optional tags to output; '?' to list []\n"
-"  -g, --gvcf INT[,...]    group non-variant sites into gVCF blocks according\n"
-"                          to minimum per-sample DP\n"
-"      --no-version        do not append version and command line to the header\n"
-"  -o, --output FILE       write output to FILE [standard output]\n"
-"  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
-"                          'z' compressed VCF; 'v' uncompressed VCF [v]\n"
-"      --threads INT       use multithreading with INT worker threads [0]\n"
-"\n"
-"SNP/INDEL genotype likelihoods options:\n"
-"  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+        "  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
+        "                                            [%s]\n", tmp_filter);
      fprintf(fp,
-"  -F, --gap-frac FLOAT    minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+        "  -s, --samples LIST      comma separated list of samples to include\n"
+        "  -S, --samples-file FILE file of samples to include\n"
+        "  -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+        "  -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+        "  -x, --ignore-overlaps   disable read-pair overlap detection\n"
+        "      --seed INT          random number seed used for sampling deep regions [0]\n"
+        "\n"
+        "Output options:\n"
+        "  -a, --annotate LIST     optional tags to output; '?' to list available tags []\n"
+        "  -g, --gvcf INT[,...]    group non-variant sites into gVCF blocks according\n"
+        "                          to minimum per-sample DP\n"
+        "      --no-version        do not append version and command line to the header\n"
+        "  -o, --output FILE       write output to FILE [standard output]\n"
+        "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
+        "                          'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+        "  -U, --mwu-u             use older probability scale for Mann-Whitney U test\n"
+        "      --threads INT       use multithreading with INT worker threads [0]\n"
+        "\n"
+        "SNP/INDEL genotype likelihoods options:\n"
+        "  -X, --config STR        Specify platform specific profiles (see below)\n"
+        "  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
      fprintf(fp,
-"  -h, --tandem-qual INT   coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+        "  -F, --gap-frac FLOAT    minimum fraction of gapped reads [%g]\n", mplp->min_frac);
      fprintf(fp,
-"  -I, --skip-indels       do not perform indel calling\n"
-"  -L, --max-idepth INT    maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+        "  -h, --tandem-qual INT   coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
      fprintf(fp,
-"  -m, --min-ireads INT    minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+        "  -I, --skip-indels       do not perform indel calling\n"
+        "  -L, --max-idepth INT    maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
      fprintf(fp,
-"  -o, --open-prob INT     Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+        "  -m, --min-ireads INT    minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
      fprintf(fp,
-"  -p, --per-sample-mF     apply -m and -F per-sample for increased sensitivity\n"
-"  -P, --platforms STR     comma separated list of platforms for indels [all]\n"
-"\n"
-"Notes: Assuming diploid individuals.\n"
-"\n"
-"Example:\n"
-"   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-"   bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-"\n");
+        "  -M, --max-read-len INT  maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+    fprintf(fp,
+        "  -o, --open-prob INT     Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+    fprintf(fp,
+        "  -p, --per-sample-mF     apply -m and -F per-sample for increased sensitivity\n"
+        "  -P, --platforms STR     comma separated list of platforms for indels [all]\n"
+        "  --ar, --ambig-reads STR   What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
+    fprintf(fp,
+        "      --indel-bias FLOAT  Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+    fprintf(fp,"\n");
+    fprintf(fp,
+        "Configuration profiles activated with -X, --config:\n"
+        "    1.12:        -Q13 -h100 -m1 -F0.002\n"
+        "    illumina:    [ default values ]\n"
+        "    ont:         -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
+        "    pacbio-ccs:  -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
+        "\n"
+        "Notes: Assuming diploid individuals.\n"
+        "\n"
+        "Example:\n"
+        "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+        "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+        "\n");
  
      free(tmp_require);
      free(tmp_filter);
  }
  
-int bam_mpileup(int argc, char *argv[])
+int main_mpileup(int argc, char *argv[])
  {
      int c;
      const char *file_list = NULL;
@@ -922,12 +1190,15 @@ int bam_mpileup(int argc, char *argv[])
      int nfiles = 0, use_orphan = 0, noref = 0;
      mplp_conf_t mplp;
      memset(&mplp, 0, sizeof(mplp_conf_t));
-    mplp.min_baseQ = 13;
+    mplp.min_baseQ = 1;
+    mplp.max_baseQ = 60;
+    mplp.delta_baseQ = 30;
      mplp.capQ_thres = 0;
      mplp.max_depth = 250; mplp.max_indel_depth = 250;
-    mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
-    mplp.min_frac = 0.002; mplp.min_support = 1;
-    mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+    mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
+    mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+    mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
+              | MPLP_SMART_OVERLAPS;
      mplp.argc = argc; mplp.argv = argv;
      mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
      mplp.output_fname = NULL;
@@ -935,7 +1206,11 @@ int bam_mpileup(int argc, char *argv[])
      mplp.record_cmd_line = 1;
      mplp.n_threads = 0;
      mplp.bsmpl = bam_smpl_init();
-    mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB;    // the default to be changed in future, see also parse_format_flag()
+    // the default to be changed in future, see also parse_format_flag()
+    mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+    mplp.max_read_len = 500;
+    mplp.ambig_reads = B2B_DROP;
+    hts_srand48(0);
  
      static const struct option lopts[] =
      {
@@ -956,6 +1231,8 @@ int bam_mpileup(int argc, char *argv[])
          {"bam-list", required_argument, NULL, 'b'},
          {"no-BAQ", no_argument, NULL, 'B'},
          {"no-baq", no_argument, NULL, 'B'},
+        {"full-BAQ", no_argument, NULL, 'D'},
+        {"full-baq", no_argument, NULL, 'D'},
          {"adjust-MQ", required_argument, NULL, 'C'},
          {"adjust-mq", required_argument, NULL, 'C'},
          {"max-depth", required_argument, NULL, 'd'},
@@ -972,6 +1249,9 @@ int bam_mpileup(int argc, char *argv[])
          {"min-mq", required_argument, NULL, 'q'},
          {"min-BQ", required_argument, NULL, 'Q'},
          {"min-bq", required_argument, NULL, 'Q'},
+        {"max-bq", required_argument, NULL, 11},
+        {"max-BQ", required_argument, NULL, 11},
+        {"delta-BQ", required_argument, NULL, 12},
          {"ignore-overlaps", no_argument, NULL, 'x'},
          {"output-type", required_argument, NULL, 'O'},
          {"samples", required_argument, NULL, 's'},
@@ -979,16 +1259,23 @@ int bam_mpileup(int argc, char *argv[])
          {"annotate", required_argument, NULL, 'a'},
          {"ext-prob", required_argument, NULL, 'e'},
          {"gap-frac", required_argument, NULL, 'F'},
+        {"indel-bias", required_argument, NULL, 10},
          {"tandem-qual", required_argument, NULL, 'h'},
          {"skip-indels", no_argument, NULL, 'I'},
          {"max-idepth", required_argument, NULL, 'L'},
-        {"min-ireads ", required_argument, NULL, 'm'},
+        {"min-ireads", required_argument, NULL, 'm'},
          {"per-sample-mF", no_argument, NULL, 'p'},
          {"per-sample-mf", no_argument, NULL, 'p'},
          {"platforms", required_argument, NULL, 'P'},
+        {"max-read-len", required_argument, NULL, 'M'},
+        {"config", required_argument, NULL, 'X'},
+        {"mwu-u", no_argument, NULL, 'U'},
+        {"seed", required_argument, NULL, 13},
+        {"ambig-reads", required_argument, NULL, 14},
+        {"ar", required_argument, NULL, 14},
          {NULL, 0, NULL, 0}
      };
-    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
          switch (c) {
          case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
          case  1 :
@@ -1040,23 +1327,26 @@ int bam_mpileup(int argc, char *argv[])
          case 'P': mplp.pl_list = strdup(optarg); break;
          case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
          case 'B': mplp.flag &= ~MPLP_REALN; break;
+        case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
          case 'I': mplp.flag |= MPLP_NO_INDEL; break;
          case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
          case '6': mplp.flag |= MPLP_ILLUMINA13; break;
          case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
          case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
-        case 'O': 
+        case 'O':
              switch (optarg[0]) {
                  case 'b': mplp.output_type = FT_BCF_GZ; break;
                  case 'u': mplp.output_type = FT_BCF; break;
                  case 'z': mplp.output_type = FT_VCF_GZ; break;
                  case 'v': mplp.output_type = FT_VCF; break;
-                default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); 
+                default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
              }
              break;
          case 'C': mplp.capQ_thres = atoi(optarg); break;
          case 'q': mplp.min_mq = atoi(optarg); break;
          case 'Q': mplp.min_baseQ = atoi(optarg); break;
+        case  11: mplp.max_baseQ = atoi(optarg); break;
+        case  12: mplp.delta_baseQ = atoi(optarg); break;
          case 'b': file_list = optarg; break;
          case 'o': {
                  char *end;
@@ -1068,6 +1358,12 @@ int bam_mpileup(int argc, char *argv[])
              break;
          case 'e': mplp.extQ = atoi(optarg); break;
          case 'h': mplp.tandemQ = atoi(optarg); break;
+        case 10: // --indel-bias (inverted so higher => more indels called)
+            if (atof(optarg) < 1e-2)
+                mplp.indel_bias = 1/1e2;
+            else
+                mplp.indel_bias = 1/atof(optarg);
+            break;
          case 'A': use_orphan = 1; break;
          case 'F': mplp.min_frac = atof(optarg); break;
          case 'm': mplp.min_support = atoi(optarg); break;
@@ -1080,6 +1376,49 @@ int bam_mpileup(int argc, char *argv[])
              }
              mplp.fmt_flag |= parse_format_flag(optarg);
          break;
+        case 'M': mplp.max_read_len = atoi(optarg); break;
+        case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
+        case 'X':
+            if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 50;
+                mplp.delta_baseQ = 10;
+                mplp.openQ = 25;
+                mplp.extQ = 1;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.max_read_len = 99999;
+            } else if (strcasecmp(optarg, "ont") == 0) {
+                fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
+                        "a higher -P, eg -P0.01 or -P 0.1\n");
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 30;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.flag |= MPLP_NO_INDEL;
+            } else if (strcasecmp(optarg, "1.12") == 0) {
+                // 1.12 and earlier
+                mplp.min_frac = 0.002;
+                mplp.min_support = 1;
+                mplp.min_baseQ = 13;
+                mplp.tandemQ = 100;
+                mplp.flag &= ~MPLP_REALN_PARTIAL;
+                mplp.flag |= MPLP_REALN;
+            } else if (strcasecmp(optarg, "illumina") == 0) {
+                mplp.flag |= MPLP_REALN_PARTIAL;
+            } else {
+                fprintf(stderr, "Unknown configuration name '%s'\n"
+                        "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+                        optarg);
+                return 1;
+            }
+            break;
+        case 13: hts_srand48(atoi(optarg)); break;
+        case 14:
+            if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
+            else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
+            else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
+            else error("The option to --ambig-reads not recognised: %s\n",optarg);
+            break;
          default:
              fprintf(stderr,"Invalid option: '%c'\n", c);
              return 1;
@@ -1120,7 +1459,7 @@ int bam_mpileup(int argc, char *argv[])
          return 1;
      }
      int ret,i;
-    if (file_list) 
+    if (file_list)
      {
          if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
          mplp.files  = fn;
@@ -1142,5 +1481,6 @@ int bam_mpileup(int argc, char *argv[])
      if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
      if (mplp.reg) regidx_destroy(mplp.reg);
      bam_smpl_destroy(mplp.bsmpl);
+
      return ret;
  }
diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c

index 51fcf8b78ca12213294a16384cffa305107b1baa..c66c75247cd1f934efcb32461a187b9c99015a2e 100644 (file)
--- a/bcftools/mpileup.c.pysam.c
+++ b/bcftools/mpileup.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
  
-    Copyright (C) 2008-2018 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -41,6 +41,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <htslib/faidx.h>
  #include <htslib/kstring.h>
  #include <htslib/khash_str2int.h>
+#include <htslib/hts_os.h>
  #include <assert.h>
  #include "regidx.h"
  #include "bcftools.h"
@@ -61,16 +62,19 @@ DEALINGS IN THE SOFTWARE.  */
  #define MPLP_PRINT_MAPQ (1<<10)
  #define MPLP_PER_SAMPLE (1<<11)
  #define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_REALN_PARTIAL  (1<<13)
  
  typedef struct _mplp_aux_t mplp_aux_t;
  typedef struct _mplp_pileup_t mplp_pileup_t;
  
  // Data shared by all bam files
  typedef struct {
-    int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+    int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
+        max_indel_depth, max_read_len, fmt_flag, ambig_reads;
      int rflag_require, rflag_filter, output_type;
      int openQ, extQ, tandemQ, min_support; // for indels
      double min_frac; // for indels
+    double indel_bias;
      char *reg_fname, *pl_list, *fai_fname, *output_fname;
      int reg_is_file, record_cmd_line, n_threads;
      faidx_t *fai;
@@ -233,7 +237,46 @@ static int mplp_func(void *data, bam1_t *b)
              has_ref = 0;
          }
  
-        if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+        // Allow sufficient room for bam_aux_append of ZQ tag without
+        // a realloc and consequent breakage of pileup's cached pointers.
+        if (has_ref && (ma->conf->flag &MPLP_REALN) && !bam_aux_get(b, "ZQ")) {
+            // Doing sam_prob_realn later is problematic as it adds to
+            // the tag list (ZQ or BQ), which causes a realloc of b->data.
+            // This happens after pileup has built a hash table on the
+            // read name.  It's a deficiency in pileup IMO.
+
+            // We could implement a new sam_prob_realn that returns ZQ
+            // somewhere else and cache it ourselves (pileup clientdata),
+            // but for now we simply use a workaround.
+            //
+            // We create a fake tag of the correct length, which we remove
+            // just prior calling sam_prob_realn so we can guarantee there is
+            // room. (We can't just make room now as bam_copy1 removes it
+            // again).
+            if (b->core.l_qseq > 500) {
+                uint8_t *ZQ = malloc((uint32_t)b->core.l_qseq+1);
+                memset(ZQ, '@', b->core.l_qseq);
+                ZQ[b->core.l_qseq] = 0;
+                bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+                free(ZQ);
+            } else {
+                static uint8_t ZQ[501] =
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
+                ZQ[b->core.l_qseq] = 0;
+                bam_aux_append(b, "_Q", 'Z', b->core.l_qseq+1, ZQ);
+                ZQ[b->core.l_qseq] = '@';
+            }
+        }
+
          if (has_ref && ma->conf->capQ_thres > 10) {
              int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
              if (q < 0) continue;    // skip
@@ -259,18 +302,46 @@ static int mplp_func(void *data, bam1_t *b)
  static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
  {
      mplp_aux_t *ma = (mplp_aux_t *)data;
-    cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1;
-    if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) )
-    {
-        int i;
-        for (i=0; i<b->core.n_cigar; i++)
-        {
-            int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
-            if ( cig!=BAM_CSOFT_CLIP ) continue;
-            cd->i |= 1;
+    int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+    cd->i = 0;
+    PLP_SET_SAMPLE_ID(cd->i, n);
+    // Whether read has a soft-clip is used in mplp_realn's heuristics.
+    // TODO: consider whether clip length is beneficial to use?
+    int i;
+    for (i=0; i<b->core.n_cigar; i++) {
+        int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
+        if (cig == BAM_CSOFT_CLIP) {
+            PLP_SET_SOFT_CLIP(cd->i);
              break;
          }
      }
+
+    if (ma->conf->flag & MPLP_REALN) {
+        int i, tot_ins = 0;
+        uint32_t *cigar = bam_get_cigar(b);
+        int p = 0;
+        for (i=0; i<b->core.n_cigar; i++) {
+            int cig = cigar[i] & BAM_CIGAR_MASK;
+            if (bam_cigar_type(cig) & 2)
+                p += cigar[i] >> BAM_CIGAR_SHIFT;
+            if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
+                tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+                // Possible further optimisation, check tot_ins==1 later
+                // (and remove break) so we can detect single bp indels.
+                // We may want to focus BAQ on more complex regions only.
+                PLP_SET_INDEL(cd->i);
+                break;
+            }
+
+            // TODO: proper p->cd struct and have cd->i as a size rather
+            // than a flag.
+
+            // Then aggregate together the sizes and if just 1 size for all
+            // reads or 2 sizes for approx 50/50 split in all reads, then
+            // treat this as a well-aligned variant and don't run BAQ.
+        }
+    }
+
      return 0;
  }
  
@@ -284,7 +355,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c
          {
              const bam_pileup1_t *p = plp[i] + j;
              int id = PLP_SAMPLE_ID(p->cd.i);
-            if (m->n_plp[id] == m->m_plp[id]) 
+            if (m->n_plp[id] == m->m_plp[id])
              {
                  m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
                  m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
@@ -319,6 +390,150 @@ static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bc
      if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output");
  }
  
+/*
+ * Looks for an indel at this position.
+ *
+ * Only reads that overlap an indel locus get realigned.  This considerably
+ * reduces the cost of running BAQ while keeping the main benefits.
+ *
+ * TODO: also consider only realigning reads that don't span the indel
+ * by more than a certain amount either-side.  Ie focus BAQ only on reads
+ * ending adjacent to the indel, where the alignment is most likely to
+ * be wrong.  (2nd TODO: do this based on sequence context; STRs bad, unique
+ * data good.)
+ *
+ * NB: this may sadly realign after we've already used the data.  Hmm...
+ */
+static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp,
+                       int flag, int max_read_len,
+                       char *ref, int ref_len, int pos) {
+    int i, j, has_indel = 0, has_clip = 0, nt = 0;
+    int min_indel = INT_MAX, max_indel = INT_MIN;
+
+    // Is an indel present.
+    // NB: don't bother even checking if very long as almost guaranteed
+    // to have indel (and likely soft-clips too).
+    for (i = 0; i < n; i++) { // iterate over bams
+        nt += n_plp[i];
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
+            has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+            // Has_clip is almost always true for very long reads
+            // (eg PacBio CCS), but these rarely matter as the clip
+            // is likely a long way from this indel.
+            has_clip  += (PLP_HAS_SOFT_CLIP(p->cd.i))         ? 1 : 0;
+            if (max_indel < p->indel)
+                max_indel = p->indel;
+            if (min_indel > p->indel)
+                min_indel = p->indel;
+        }
+    }
+
+    if (flag & MPLP_REALN_PARTIAL) {
+        if (has_indel == 0 ||
+            (has_clip < 0.2*nt && max_indel == min_indel &&
+             (has_indel < 0.1*nt /*|| has_indel > 0.9*nt*/ || has_indel == 1)))
+            return;
+    }
+
+    // Realign
+    for (i = 0; i < n; i++) { // iterate over bams
+        for (j = 0; j < n_plp[i]; j++) { // iterate over reads
+            const bam_pileup1_t *p = plp[i] + j;
+            bam1_t *b = p->b;
+
+            // Avoid doing multiple times.
+            //
+            // Note we cannot modify p->cd.i here with a PLP_SET macro
+            // because the cd item is held by mpileup in an lbnode_t
+            // struct and copied over to the pileup struct for each
+            // iteration, essentially making p->cd.i read only.
+            //
+            // We could use our own structure (p->cd.p), allocated during
+            // the constructor, but for simplicity we play dirty and
+            // abuse an unused flag bit instead.
+            if (b->core.flag & 32768)
+                continue;
+            b->core.flag |= 32768;
+
+            if (b->core.l_qseq > max_read_len)
+                continue;
+
+            // Check p->cigar_ind and see what cigar elements are before
+            // and after.  How close is this location to the end of the
+            // read?  Only realign if we don't span by more than X bases.
+            //
+            // Again, best only done on deeper data as BAQ helps
+            // disproportionately more on shallow data sets.
+            //
+            // This rescues some of the false negatives that are caused by
+            // systematic reduction in quality due to sample vs ref alignment.
+
+// At deep coverage we skip realigning more reads as we have sufficient depth.
+// This rescues for false negatives.  At shallow depth we pay for this with
+// more FP so are more stringent on spanning size.
+#define REALN_DIST (40+10*(nt<40)+10*(nt<20))
+            uint32_t *cig = bam_get_cigar(b);
+            int ncig = b->core.n_cigar;
+
+            // Don't realign reads where indel is in middle?
+            // On long read data we don't care about soft-clips at the ends.
+            // For short read data, we always calc BAQ on these as they're
+            // a common source of false positives.
+            if ((flag & MPLP_REALN_PARTIAL) && nt > 15 && ncig > 1) {
+                // Left & right cigar op match.
+                int lr = b->core.l_qseq > 500;
+                int lm = 0, rm = 0, k;
+                for (k = 0; k < ncig; k++) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        lm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                for (k = ncig-1; k >= 0; k--) {
+                    int cop = bam_cigar_op(cig[k]);
+                    if (lr && (cop == BAM_CHARD_CLIP || cop == BAM_CSOFT_CLIP))
+                        continue;
+
+                    if (cop == BAM_CMATCH || cop == BAM_CDIFF ||
+                        cop == BAM_CEQUAL)
+                        rm += bam_cigar_oplen(cig[k]);
+                    else
+                        break;
+                }
+
+                if (lm >= REALN_DIST*4 && rm >= REALN_DIST*4)
+                    continue;
+
+                if (lm >= REALN_DIST && rm >= REALN_DIST &&
+                    has_clip < (0.15+0.05*(nt>20))*nt)
+                    continue;
+            }
+
+            if (b->core.l_qseq > 500) {
+                // don't do BAQ on long-read data if it's going to
+                // cause us to have a large band-width and be costly in CPU
+                int rl = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
+                if (abs(rl - b->core.l_qseq) * b->core.l_qseq >= 500000)
+                    continue;
+            }
+
+            // Fudge: make room for ZQ tag.
+            uint8_t *_Q = bam_aux_get(b, "_Q");
+            if (_Q) bam_aux_del(b, _Q);
+            sam_prob_realn(b, ref, ref_len, (flag & MPLP_REDO_BAQ) ? 7 : 3);
+        }
+    }
+
+    return;
+}
+
  static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
  {
      bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
@@ -326,7 +541,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
      int ret, i, tid, pos, ref_len;
      char *ref;
  
-    while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) 
+    while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
      {
          if ( pos<beg || pos>end ) continue;
          if ( conf->bed && tid >= 0 )
@@ -335,7 +550,10 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
              if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
              if ( !overlap ) continue;
          }
-        mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+        int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+        if (has_ref && (conf->flag & MPLP_REALN))
+            mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
+                       conf->max_read_len, ref, ref_len, pos);
  
          int total_depth, _ref0, ref16;
          for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
@@ -348,18 +566,19 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
          conf->bc.tid = tid; conf->bc.pos = pos;
          bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
          bcf_clear1(conf->bcf_rec);
-        bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+        bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
+                     conf->bca, 0);
          flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
  
          // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
          // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
-        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth 
-            && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+        if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+            && (bcf_callaux_clean(conf->bca, &conf->bc),
+                bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
          {
-            bcf_callaux_clean(conf->bca, &conf->bc);
              for (i = 0; i < conf->gplp->n; ++i)
                  bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
-            if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) 
+            if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
              {
                  bcf_clear1(conf->bcf_rec);
                  bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
@@ -374,7 +593,7 @@ static int mpileup(mplp_conf_t *conf)
  {
      if (conf->nfiles == 0) {
          fprintf(bcftools_stderr,"[%s] no input file/data given\n", __func__);
-        exit(EXIT_FAILURE);
+        bcftools_exit(EXIT_FAILURE);
      }
  
      mplp_ref_t mp_ref = MPLP_REF_INIT;
@@ -395,7 +614,7 @@ static int mpileup(mplp_conf_t *conf)
              conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
              if ( !conf->reg ) {
                  fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
-                exit(EXIT_FAILURE);
+                bcftools_exit(EXIT_FAILURE);
              }
          }
          else
@@ -403,7 +622,7 @@ static int mpileup(mplp_conf_t *conf)
              conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
              if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
                  fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
-                exit(EXIT_FAILURE);
+                bcftools_exit(EXIT_FAILURE);
              }
          }
          nregs = regidx_nregs(conf->reg);
@@ -422,23 +641,23 @@ static int mpileup(mplp_conf_t *conf)
          if ( !conf->mplp_data[i]->fp )
          {
              fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
-            exit(EXIT_FAILURE);
+            bcftools_exit(EXIT_FAILURE);
          }
          if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
              fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
-            exit(EXIT_FAILURE);
+            bcftools_exit(EXIT_FAILURE);
          }
          if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
              fprintf(bcftools_stderr, "[%s] failed to process %s: %s\n",
                      __func__, conf->fai_fname, strerror(errno));
-            exit(EXIT_FAILURE);
+            bcftools_exit(EXIT_FAILURE);
          }
          conf->mplp_data[i]->conf = conf;
          conf->mplp_data[i]->ref = &mp_ref;
          h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
          if ( !h_tmp ) {
              fprintf(bcftools_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
-            exit(EXIT_FAILURE);
+            bcftools_exit(EXIT_FAILURE);
          }
          conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
          conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
@@ -458,20 +677,20 @@ static int mpileup(mplp_conf_t *conf)
              hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
              if (idx == NULL) {
                  fprintf(bcftools_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
-                exit(EXIT_FAILURE);
+                bcftools_exit(EXIT_FAILURE);
              }
              conf->buf.l = 0;
              ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
              conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
-            if ( !conf->mplp_data[i]->iter ) 
+            if ( !conf->mplp_data[i]->iter )
              {
                  conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                  if ( conf->mplp_data[i]->iter ) {
                      fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
-                    exit(EXIT_FAILURE);
+                    bcftools_exit(EXIT_FAILURE);
                  }
                  fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
-                exit(EXIT_FAILURE);
+                bcftools_exit(EXIT_FAILURE);
              }
              if ( nregs==1 ) // no need to keep the index in memory
                 hts_idx_destroy(idx);
@@ -489,18 +708,22 @@ static int mpileup(mplp_conf_t *conf)
              conf->mplp_data[i]->h = hdr;
          }
      }
+    if ( !hdr ) {
+        fprintf(bcftools_stderr, "[%s] failed to find a file header with usable read groups\n", __func__);
+        bcftools_exit(EXIT_FAILURE);
+    }
      // allocate data storage proportionate to number of samples being studied sm->n
      bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
      conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
      conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
-    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));  
+    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
  
      fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
      // write the VCF header
-    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
      if (conf->bcf_fp == NULL) {
          fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
-        exit(EXIT_FAILURE);
+        bcftools_exit(EXIT_FAILURE);
      }
      if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
  
@@ -544,11 +767,24 @@ static int mpileup(mplp_conf_t *conf)
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
      if ( conf->fmt_flag&B2B_INFO_VDB )
          bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
-    if ( conf->fmt_flag&B2B_INFO_RPB )
-        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
-    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+
+    if (conf->fmt_flag & B2B_INFO_ZSCORE) {
+        if ( conf->fmt_flag&B2B_INFO_RPB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+        if ( conf->fmt_flag&B2B_INFO_SCB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+    } else {
+        if ( conf->fmt_flag&B2B_INFO_RPB )
+            bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+    }
+
+    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
  #if CDF_MWU_TESTS
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
      bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
@@ -578,6 +814,8 @@ static int mpileup(mplp_conf_t *conf)
          bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand (high-quality bases)\">");
      if ( conf->fmt_flag&B2B_FMT_ADR )
          bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand (high-quality bases)\">");
+    if ( conf->fmt_flag&B2B_FMT_QS )
+        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=QS,Number=R,Type=Integer,Description=\"Phred-score allele quality sum used by `call -mG` and `+trio-dnm`\">");
      if ( conf->fmt_flag&B2B_INFO_AD )
          bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths (high-quality bases)\">");
      if ( conf->fmt_flag&B2B_INFO_ADF )
@@ -597,17 +835,23 @@ static int mpileup(mplp_conf_t *conf)
          bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
      if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output");
  
-    conf->bca = bcf_call_init(-1., conf->min_baseQ);
+    conf->bca = bcf_call_init(-1., conf->min_baseQ, conf->max_baseQ,
+                              conf->delta_baseQ);
      conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
      conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+    conf->bca->indel_bias = conf->indel_bias;
      conf->bca->min_frac = conf->min_frac;
      conf->bca->min_support = conf->min_support;
      conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
      conf->bca->fmt_flag = conf->fmt_flag;
+    conf->bca->ambig_reads = conf->ambig_reads;
  
      conf->bc.bcf_hdr = conf->bcf_hdr;
      conf->bc.n  = nsmpl;
      conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+    conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES);
+    for (i=0; i<nsmpl; i++)
+        conf->bcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES;
      if (conf->fmt_flag)
      {
          assert( sizeof(float)==sizeof(int32_t) );
@@ -645,7 +889,7 @@ static int mpileup(mplp_conf_t *conf)
      if ( nregs )
      {
          int ireg = 0;
-        do 
+        do
          {
              // first region is already positioned
              if ( ireg++ > 0 )
@@ -653,19 +897,19 @@ static int mpileup(mplp_conf_t *conf)
                  conf->buf.l = 0;
                  ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
  
-                for (i=0; i<conf->nfiles; i++) 
+                for (i=0; i<conf->nfiles; i++)
                  {
                      hts_itr_destroy(conf->mplp_data[i]->iter);
                      conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
-                    if ( !conf->mplp_data[i]->iter ) 
+                    if ( !conf->mplp_data[i]->iter )
                      {
                          conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                          if ( conf->mplp_data[i]->iter ) {
                              fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
-                            exit(EXIT_FAILURE);
+                            bcftools_exit(EXIT_FAILURE);
                          }
                          fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
-                        exit(EXIT_FAILURE);
+                        bcftools_exit(EXIT_FAILURE);
                      }
                      bam_mplp_reset(conf->iter);
                  }
@@ -692,6 +936,7 @@ static int mpileup(mplp_conf_t *conf)
          free(conf->bc.ADR);
          free(conf->bc.ADF);
          free(conf->bc.SCR);
+        free(conf->bc.QS);
          free(conf->bc.fmt_arr);
          free(conf->bcr);
      }
@@ -795,14 +1040,16 @@ int parse_format_flag(const char *str)
          else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
          else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
          else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
+        else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
          else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
          else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
          else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
          else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+        else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
          else
          {
              fprintf(bcftools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
-            exit(EXIT_FAILURE);
+            bcftools_exit(EXIT_FAILURE);
          }
          free(tags[i]);
      }
@@ -823,6 +1070,7 @@ static void list_annotations(FILE *fp)
  "  FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
  "  FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
  "  FORMAT/DP  .. Number of high-quality bases (Number=1,Type=Integer)\n"
+"  FORMAT/QS  .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
  "  FORMAT/SP  .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
  "  FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
  "\n"
@@ -845,78 +1093,98 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
      // source code in 80 columns, to the extent that's possible.)
  
      fprintf(fp,
-"\n"
-"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
-"\n"
-"Input options:\n"
-"  -6, --illumina1.3+      quality is in the Illumina-1.3+ encoding\n"
-"  -A, --count-orphans     do not discard anomalous read pairs\n"
-"  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
-"  -B, --no-BAQ            disable BAQ (per-Base Alignment Quality)\n"
-"  -C, --adjust-MQ INT     adjust mapping quality; recommended:50, disable:0 [0]\n"
-"  -d, --max-depth INT     max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+        "\n"
+        "Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+        "\n"
+        "Input options:\n"
+        "  -6, --illumina1.3+      quality is in the Illumina-1.3+ encoding\n"
+        "  -A, --count-orphans     do not discard anomalous read pairs\n"
+        "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
+        "  -B, --no-BAQ            disable BAQ (per-Base Alignment Quality)\n"
+        "  -C, --adjust-MQ INT     adjust mapping quality [0]\n"
+        "  -D, --full-BAQ          Apply BAQ everywhere, not just in problematic regions\n"
+        "  -d, --max-depth INT     max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+            fprintf(fp,
+        "  -E, --redo-BAQ          recalculate BAQ on the fly, ignore existing BQs\n"
+        "  -f, --fasta-ref FILE    faidx indexed reference sequence file\n"
+        "      --no-reference      do not require fasta reference file\n"
+        "  -G, --read-groups FILE  select or exclude read groups listed in the file\n"
+        "  -q, --min-MQ INT        skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
      fprintf(fp,
-"  -E, --redo-BAQ          recalculate BAQ on the fly, ignore existing BQs\n"
-"  -f, --fasta-ref FILE    faidx indexed reference sequence file\n"
-"      --no-reference      do not require fasta reference file\n"
-"  -G, --read-groups FILE  select or exclude read groups listed in the file\n"
-"  -q, --min-MQ INT        skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+        "  -Q, --min-BQ INT        skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
      fprintf(fp,
-"  -Q, --min-BQ INT        skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+        "      --max-BQ INT        limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
      fprintf(fp,
-"  -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
-"  -R, --regions-file FILE restrict to regions listed in a file\n"
-"      --ignore-RG         ignore RG tags (one BAM = one sample)\n"
-"  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+        "      --delta-BQ INT      Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
      fprintf(fp,
-"  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
-"                                            [%s]\n", tmp_filter);
+        "  -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+        "  -R, --regions-file FILE restrict to regions listed in a file\n"
+        "      --ignore-RG         ignore RG tags (one BAM = one sample)\n"
+        "  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
      fprintf(fp,
-"  -s, --samples LIST      comma separated list of samples to include\n"
-"  -S, --samples-file FILE file of samples to include\n"
-"  -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
-"  -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
-"  -x, --ignore-overlaps   disable read-pair overlap detection\n"
-"\n"
-"Output options:\n"
-"  -a, --annotate LIST     optional tags to output; '?' to list []\n"
-"  -g, --gvcf INT[,...]    group non-variant sites into gVCF blocks according\n"
-"                          to minimum per-sample DP\n"
-"      --no-version        do not append version and command line to the header\n"
-"  -o, --output FILE       write output to FILE [standard output]\n"
-"  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
-"                          'z' compressed VCF; 'v' uncompressed VCF [v]\n"
-"      --threads INT       use multithreading with INT worker threads [0]\n"
-"\n"
-"SNP/INDEL genotype likelihoods options:\n"
-"  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+        "  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
+        "                                            [%s]\n", tmp_filter);
      fprintf(fp,
-"  -F, --gap-frac FLOAT    minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+        "  -s, --samples LIST      comma separated list of samples to include\n"
+        "  -S, --samples-file FILE file of samples to include\n"
+        "  -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+        "  -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+        "  -x, --ignore-overlaps   disable read-pair overlap detection\n"
+        "      --seed INT          random number seed used for sampling deep regions [0]\n"
+        "\n"
+        "Output options:\n"
+        "  -a, --annotate LIST     optional tags to output; '?' to list available tags []\n"
+        "  -g, --gvcf INT[,...]    group non-variant sites into gVCF blocks according\n"
+        "                          to minimum per-sample DP\n"
+        "      --no-version        do not append version and command line to the header\n"
+        "  -o, --output FILE       write output to FILE [standard output]\n"
+        "  -O, --output-type TYPE  'b' compressed BCF; 'u' uncompressed BCF;\n"
+        "                          'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+        "  -U, --mwu-u             use older probability scale for Mann-Whitney U test\n"
+        "      --threads INT       use multithreading with INT worker threads [0]\n"
+        "\n"
+        "SNP/INDEL genotype likelihoods options:\n"
+        "  -X, --config STR        Specify platform specific profiles (see below)\n"
+        "  -e, --ext-prob INT      Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
      fprintf(fp,
-"  -h, --tandem-qual INT   coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+        "  -F, --gap-frac FLOAT    minimum fraction of gapped reads [%g]\n", mplp->min_frac);
      fprintf(fp,
-"  -I, --skip-indels       do not perform indel calling\n"
-"  -L, --max-idepth INT    maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+        "  -h, --tandem-qual INT   coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
      fprintf(fp,
-"  -m, --min-ireads INT    minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+        "  -I, --skip-indels       do not perform indel calling\n"
+        "  -L, --max-idepth INT    maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
      fprintf(fp,
-"  -o, --open-prob INT     Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+        "  -m, --min-ireads INT    minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
      fprintf(fp,
-"  -p, --per-sample-mF     apply -m and -F per-sample for increased sensitivity\n"
-"  -P, --platforms STR     comma separated list of platforms for indels [all]\n"
-"\n"
-"Notes: Assuming diploid individuals.\n"
-"\n"
-"Example:\n"
-"   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
-"   bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
-"\n");
+        "  -M, --max-read-len INT  maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+    fprintf(fp,
+        "  -o, --open-prob INT     Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+    fprintf(fp,
+        "  -p, --per-sample-mF     apply -m and -F per-sample for increased sensitivity\n"
+        "  -P, --platforms STR     comma separated list of platforms for indels [all]\n"
+        "  --ar, --ambig-reads STR   What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
+    fprintf(fp,
+        "      --indel-bias FLOAT  Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+    fprintf(fp,"\n");
+    fprintf(fp,
+        "Configuration profiles activated with -X, --config:\n"
+        "    1.12:        -Q13 -h100 -m1 -F0.002\n"
+        "    illumina:    [ default values ]\n"
+        "    ont:         -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
+        "    pacbio-ccs:  -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
+        "\n"
+        "Notes: Assuming diploid individuals.\n"
+        "\n"
+        "Example:\n"
+        "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"
+        "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"
+        "\n");
  
      free(tmp_require);
      free(tmp_filter);
  }
  
-int bam_mpileup(int argc, char *argv[])
+int main_mpileup(int argc, char *argv[])
  {
      int c;
      const char *file_list = NULL;
@@ -924,12 +1192,15 @@ int bam_mpileup(int argc, char *argv[])
      int nfiles = 0, use_orphan = 0, noref = 0;
      mplp_conf_t mplp;
      memset(&mplp, 0, sizeof(mplp_conf_t));
-    mplp.min_baseQ = 13;
+    mplp.min_baseQ = 1;
+    mplp.max_baseQ = 60;
+    mplp.delta_baseQ = 30;
      mplp.capQ_thres = 0;
      mplp.max_depth = 250; mplp.max_indel_depth = 250;
-    mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
-    mplp.min_frac = 0.002; mplp.min_support = 1;
-    mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+    mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500;
+    mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2;
+    mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL
+              | MPLP_SMART_OVERLAPS;
      mplp.argc = argc; mplp.argv = argv;
      mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
      mplp.output_fname = NULL;
@@ -937,7 +1208,11 @@ int bam_mpileup(int argc, char *argv[])
      mplp.record_cmd_line = 1;
      mplp.n_threads = 0;
      mplp.bsmpl = bam_smpl_init();
-    mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB;    // the default to be changed in future, see also parse_format_flag()
+    // the default to be changed in future, see also parse_format_flag()
+    mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+    mplp.max_read_len = 500;
+    mplp.ambig_reads = B2B_DROP;
+    hts_srand48(0);
  
      static const struct option lopts[] =
      {
@@ -958,6 +1233,8 @@ int bam_mpileup(int argc, char *argv[])
          {"bam-list", required_argument, NULL, 'b'},
          {"no-BAQ", no_argument, NULL, 'B'},
          {"no-baq", no_argument, NULL, 'B'},
+        {"full-BAQ", no_argument, NULL, 'D'},
+        {"full-baq", no_argument, NULL, 'D'},
          {"adjust-MQ", required_argument, NULL, 'C'},
          {"adjust-mq", required_argument, NULL, 'C'},
          {"max-depth", required_argument, NULL, 'd'},
@@ -974,6 +1251,9 @@ int bam_mpileup(int argc, char *argv[])
          {"min-mq", required_argument, NULL, 'q'},
          {"min-BQ", required_argument, NULL, 'Q'},
          {"min-bq", required_argument, NULL, 'Q'},
+        {"max-bq", required_argument, NULL, 11},
+        {"max-BQ", required_argument, NULL, 11},
+        {"delta-BQ", required_argument, NULL, 12},
          {"ignore-overlaps", no_argument, NULL, 'x'},
          {"output-type", required_argument, NULL, 'O'},
          {"samples", required_argument, NULL, 's'},
@@ -981,16 +1261,23 @@ int bam_mpileup(int argc, char *argv[])
          {"annotate", required_argument, NULL, 'a'},
          {"ext-prob", required_argument, NULL, 'e'},
          {"gap-frac", required_argument, NULL, 'F'},
+        {"indel-bias", required_argument, NULL, 10},
          {"tandem-qual", required_argument, NULL, 'h'},
          {"skip-indels", no_argument, NULL, 'I'},
          {"max-idepth", required_argument, NULL, 'L'},
-        {"min-ireads ", required_argument, NULL, 'm'},
+        {"min-ireads", required_argument, NULL, 'm'},
          {"per-sample-mF", no_argument, NULL, 'p'},
          {"per-sample-mf", no_argument, NULL, 'p'},
          {"platforms", required_argument, NULL, 'P'},
+        {"max-read-len", required_argument, NULL, 'M'},
+        {"config", required_argument, NULL, 'X'},
+        {"mwu-u", no_argument, NULL, 'U'},
+        {"seed", required_argument, NULL, 13},
+        {"ambig-reads", required_argument, NULL, 14},
+        {"ar", required_argument, NULL, 14},
          {NULL, 0, NULL, 0}
      };
-    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
          switch (c) {
          case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
          case  1 :
@@ -1030,7 +1317,7 @@ int bam_mpileup(int argc, char *argv[])
                    if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
                    {
                        fprintf(bcftools_stderr,"Could not parse the targets: %s\n", optarg);
-                      exit(EXIT_FAILURE);
+                      bcftools_exit(EXIT_FAILURE);
                    }
                    break;
          case 'T':
@@ -1042,23 +1329,26 @@ int bam_mpileup(int argc, char *argv[])
          case 'P': mplp.pl_list = strdup(optarg); break;
          case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
          case 'B': mplp.flag &= ~MPLP_REALN; break;
+        case 'D': mplp.flag &= ~MPLP_REALN_PARTIAL; break;
          case 'I': mplp.flag |= MPLP_NO_INDEL; break;
          case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
          case '6': mplp.flag |= MPLP_ILLUMINA13; break;
          case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
          case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
-        case 'O': 
+        case 'O':
              switch (optarg[0]) {
                  case 'b': mplp.output_type = FT_BCF_GZ; break;
                  case 'u': mplp.output_type = FT_BCF; break;
                  case 'z': mplp.output_type = FT_VCF_GZ; break;
                  case 'v': mplp.output_type = FT_VCF; break;
-                default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n"); 
+                default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
              }
              break;
          case 'C': mplp.capQ_thres = atoi(optarg); break;
          case 'q': mplp.min_mq = atoi(optarg); break;
          case 'Q': mplp.min_baseQ = atoi(optarg); break;
+        case  11: mplp.max_baseQ = atoi(optarg); break;
+        case  12: mplp.delta_baseQ = atoi(optarg); break;
          case 'b': file_list = optarg; break;
          case 'o': {
                  char *end;
@@ -1070,6 +1360,12 @@ int bam_mpileup(int argc, char *argv[])
              break;
          case 'e': mplp.extQ = atoi(optarg); break;
          case 'h': mplp.tandemQ = atoi(optarg); break;
+        case 10: // --indel-bias (inverted so higher => more indels called)
+            if (atof(optarg) < 1e-2)
+                mplp.indel_bias = 1/1e2;
+            else
+                mplp.indel_bias = 1/atof(optarg);
+            break;
          case 'A': use_orphan = 1; break;
          case 'F': mplp.min_frac = atof(optarg); break;
          case 'm': mplp.min_support = atoi(optarg); break;
@@ -1082,6 +1378,49 @@ int bam_mpileup(int argc, char *argv[])
              }
              mplp.fmt_flag |= parse_format_flag(optarg);
          break;
+        case 'M': mplp.max_read_len = atoi(optarg); break;
+        case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
+        case 'X':
+            if (strcasecmp(optarg, "pacbio-ccs") == 0) {
+                mplp.min_frac = 0.1;
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 50;
+                mplp.delta_baseQ = 10;
+                mplp.openQ = 25;
+                mplp.extQ = 1;
+                mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.max_read_len = 99999;
+            } else if (strcasecmp(optarg, "ont") == 0) {
+                fprintf(bcftools_stderr, "For ONT it may be beneficial to also run bcftools call with "
+                        "a higher -P, eg -P0.01 or -P 0.1\n");
+                mplp.min_baseQ = 5;
+                mplp.max_baseQ = 30;
+                mplp.flag &= ~MPLP_REALN;
+                mplp.flag |= MPLP_NO_INDEL;
+            } else if (strcasecmp(optarg, "1.12") == 0) {
+                // 1.12 and earlier
+                mplp.min_frac = 0.002;
+                mplp.min_support = 1;
+                mplp.min_baseQ = 13;
+                mplp.tandemQ = 100;
+                mplp.flag &= ~MPLP_REALN_PARTIAL;
+                mplp.flag |= MPLP_REALN;
+            } else if (strcasecmp(optarg, "illumina") == 0) {
+                mplp.flag |= MPLP_REALN_PARTIAL;
+            } else {
+                fprintf(bcftools_stderr, "Unknown configuration name '%s'\n"
+                        "Please choose from 1.12, illumina, pacbio-ccs or ont\n",
+                        optarg);
+                return 1;
+            }
+            break;
+        case 13: hts_srand48(atoi(optarg)); break;
+        case 14:
+            if ( !strcasecmp(optarg,"drop") ) mplp.ambig_reads = B2B_DROP;
+            else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD;
+            else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0;
+            else error("The option to --ambig-reads not recognised: %s\n",optarg);
+            break;
          default:
              fprintf(bcftools_stderr,"Invalid option: '%c'\n", c);
              return 1;
@@ -1122,7 +1461,7 @@ int bam_mpileup(int argc, char *argv[])
          return 1;
      }
      int ret,i;
-    if (file_list) 
+    if (file_list)
      {
          if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
          mplp.files  = fn;
@@ -1144,5 +1483,6 @@ int bam_mpileup(int argc, char *argv[])
      if (mplp.bed_itr) regitr_destroy(mplp.bed_itr);
      if (mplp.reg) regidx_destroy(mplp.reg);
      bam_smpl_destroy(mplp.bsmpl);
+
      return ret;
  }
diff --git a/bcftools/ploidy.h b/bcftools/ploidy.h

index 1e7d2f78f6467e5cfd13d8fb14cee7cfa92fd029..7697c65f997626c7359e5737845f658bacf70fee 100644 (file)
--- a/bcftools/ploidy.h
+++ b/bcftools/ploidy.h
@@ -1,5 +1,5 @@
  /* 
-    Copyright (C) 2014 Genome Research Ltd.
+    Copyright (C) 2014-2015 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/prob1.c b/bcftools/prob1.c

index 954d43cf7834688822c484e41b90d9a02feb3a41..3ab7bcb37a10ef61d1fd389278fea09057e7aeda 100644 (file)
--- a/bcftools/prob1.c
+++ b/bcftools/prob1.c
@@ -1,7 +1,7 @@
  /*  prob1.c -- mathematical utility functions.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Copyright (C) 2012, 2013 Genome Research Ltd.
+    Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c

index bd73e1d5b67141dce3750c45dafa72bdbd987ac3..6d2bbd17ab57a772d25ee20437fea04105eeab3d 100644 (file)
--- a/bcftools/prob1.c.pysam.c
+++ b/bcftools/prob1.c.pysam.c
@@ -3,7 +3,7 @@
  /*  prob1.c -- mathematical utility functions.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Copyright (C) 2012, 2013 Genome Research Ltd.
+    Copyright (C) 2012, 2013-2014, 2017 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
diff --git a/bcftools/prob1.h b/bcftools/prob1.h

index a3d4b0d7b482ccc3728cf23b43297237dd316b58..a5622656ce285fe6f71a60001383a20189eaa4aa 100644 (file)
--- a/bcftools/prob1.h
+++ b/bcftools/prob1.h
@@ -1,7 +1,7 @@
  /*  prob1.h -- mathematical utility functions.
  
      Copyright (C) 2010, 2011 Broad Institute.
-    Copyright (C) 2012, 2013 Genome Research Ltd.
+    Copyright (C) 2012, 2013-2014 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
diff --git a/bcftools/rbuf.h b/bcftools/rbuf.h

index 2c0e5b1f73262bf751d050c97183ac0dc30eecef..ef2e206322dbcafdc51d0118d118847b0423da6d 100644 (file)
--- a/bcftools/rbuf.h
+++ b/bcftools/rbuf.h
@@ -1,6 +1,6 @@
  /*  rbuf.h -- round buffers.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/regidx.c b/bcftools/regidx.c

index 5c6c8ce5941dd8113a1d8293a7f1b2cacd14dcfb..cdaf7eaf439d3f8a3a29bce1a9e7f28fdbad1dd1 100644 (file)
--- a/bcftools/regidx.c
+++ b/bcftools/regidx.c
@@ -1,5 +1,5 @@
  /* 
-    Copyright (C) 2014-2017 Genome Research Ltd.
+    Copyright (C) 2014-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c

index 684993cd8d9d4b8202a43706e9c3e74555288039..4eb96e87442824421db867408f534549a41ae9a7 100644 (file)
--- a/bcftools/regidx.c.pysam.c
+++ b/bcftools/regidx.c.pysam.c
@@ -1,7 +1,7 @@
  #include "bcftools.pysam.h"
  
  /* 
-    Copyright (C) 2014-2017 Genome Research Ltd.
+    Copyright (C) 2014-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/regidx.h b/bcftools/regidx.h

index a654dbdd8bf5b03f5e88a8ba7e7f7ca214c25078..f13b52a934a802491a8ad411ec31b60aaec37eb2 100644 (file)
--- a/bcftools/regidx.h
+++ b/bcftools/regidx.h
@@ -1,5 +1,5 @@
  /* 
-    Copyright (C) 2014-2016 Genome Research Ltd.
+    Copyright (C) 2014-2016, 2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/reheader.c b/bcftools/reheader.c

index 60a60e10ec1b3f89815d99f76e45f4583cfd91be..ae7c6226ecb67526c2085670f20a51ac7260c771 100644 (file)
--- a/bcftools/reheader.c
+++ b/bcftools/reheader.c
@@ -1,6 +1,6 @@
  /*  reheader.c -- reheader subcommand.
  
-    Copyright (C) 2014-2018 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -49,7 +49,7 @@ THE SOFTWARE.  */
  typedef struct _args_t
  {
      char **argv, *fname, *samples_fname, *header_fname, *output_fname;
-    char *fai_fname, *rm_tmpfile;
+    char *fai_fname, *rm_tmpfile, *tmp_prefix;
      htsFile *fp;
      htsFormat type;
      htsThreadPool *threads;
@@ -140,6 +140,33 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
      free(key.s); free(val.s); free(tmp.s);
      return q;
  }
+char *init_tmp_prefix(const char *tmp_prefix) // Build a heap-allocated template string ending in "XXXXXX"; update_from_fai() passes it to mkstemp()
+{
+    char *prefix = NULL;
+    if ( tmp_prefix ) // caller supplied a prefix: use it verbatim and append the XXXXXX placeholder
+    {
+        int len = strlen(tmp_prefix); // NOTE(review): size_t narrowed to int
+        prefix = (char*) calloc(len+7,1); // len + 6 placeholder chars + NUL; calloc zero-fills, so the result is terminated. NOTE(review): NULL return is not checked
+        memcpy(prefix,tmp_prefix,len);
+        memcpy(prefix+len,"XXXXXX",6);
+    }
+    else // no prefix given: fall back to the platform default location
+    {
+        #ifdef _WIN32
+            char tmp_path[MAX_PATH];
+            int ret = GetTempPath(MAX_PATH, tmp_path);
+            if (!ret || ret > MAX_PATH)
+                error("Could not get the path to the temporary folder\n");
+            if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) // make sure the strcat below fits in tmp_path
+                error("Full path to the temporary folder is too long\n");
+            strcat(tmp_path, "/bcftools.XXXXXX");
+            prefix = strdup(tmp_path);
+        #else
+            prefix = strdup("/tmp/bcftools.XXXXXX");
+        #endif
+    }
+    return prefix; // allocated by calloc/strdup above
+}
  static void update_from_fai(args_t *args)
  {
      if ( !strcmp("-",args->fname) )
@@ -147,18 +174,7 @@ static void update_from_fai(args_t *args)
  
      faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
      if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-#ifdef _WIN32
-    char tmp_path[MAX_PATH];
-    int ret = GetTempPath(MAX_PATH, tmp_path);
-    if (!ret || ret > MAX_PATH)
-        error("Could not get the path to the temporary folder\n");
-    if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH)
-        error("Full path to the temporary folder is too long\n");
-    strcat(tmp_path, "/bcftools-fai-header-XXXXXX");
-    args->rm_tmpfile = strdup(tmp_path);
-#else
-    args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX");
-#endif
+    args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
      int fd = mkstemp(args->rm_tmpfile);
      if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
  
@@ -273,8 +289,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
      hdr->s[hdr->l] = 0;
  
      kstring_t tmp = {0,0,0};
-    i = j = n = 0;
-    while ( hdr->s[idx+i] && hdr->s[idx+i])
+    i = j = n = 0;  // i:traverse the #CHROM line 1 by 1; j:points to the last column
+    while ( hdr->s[idx+i] )
      {
          if ( hdr->s[idx+i]=='\t' )
          {
@@ -282,8 +298,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
  
              if ( ++n>9 )
              {
-                char *ori = khash_str2str_get(hash,hdr->s+idx+j);
-                kputs(ori ? ori : hdr->s+idx+j, &tmp);
+                char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+                kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
              }
              else
                  kputs(hdr->s+idx+j, &tmp);
@@ -295,8 +311,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
          }
          i++;
      }
-    char *ori = khash_str2str_get(hash,hdr->s+idx+j);
-    kputs(ori ? ori : hdr->s+idx+j, &tmp);
+    char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+    kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
  
      khash_str2str_destroy_free_all(hash);
  
@@ -317,7 +333,13 @@ static void set_samples(char **samples, int nsamples, kstring_t *hdr)
          if ( hdr->s[i]=='\t' ) ncols++;
          i--;
      }
-    if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s);
+    if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) )
+    {
+        if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) )
+            error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
+
+        error("Could not parse the header: %s\n", hdr->s);
+    }
  
      // Are the samples "old-sample new-sample" pairs?
      if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return;
@@ -388,7 +410,10 @@ static void reheader_vcf_gz(args_t *args)
      int nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
@@ -444,7 +469,10 @@ static void reheader_vcf(args_t *args)
      int nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
@@ -548,7 +576,10 @@ static void reheader_bcf(args_t *args, int is_compressed)
      int i, nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
@@ -639,11 +670,16 @@ static void usage(args_t *args)
      fprintf(stderr, "Usage:   bcftools reheader [OPTIONS] <in.vcf.gz>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -f, --fai <file>        update sequences and their lengths from the .fai file\n");
-    fprintf(stderr, "    -h, --header <file>     new header\n");
-    fprintf(stderr, "    -o, --output <file>     write output to a file [standard output]\n");
-    fprintf(stderr, "    -s, --samples <file>    new sample names\n");
-    fprintf(stderr, "        --threads <int>     use multithreading with <int> worker threads (BCF only) [0]\n");
+    fprintf(stderr, "    -f, --fai FILE             update sequences and their lengths from the .fai file\n");
+    fprintf(stderr, "    -h, --header FILE          new header\n");
+    fprintf(stderr, "    -o, --output FILE          write output to a file [standard output]\n");
+    fprintf(stderr, "    -s, --samples FILE         new sample names\n");
+#ifdef _WIN32
+    fprintf(stderr, "    -T, --temp-prefix PATH     template for temporary file name [/bcftools.XXXXXX]\n");
+#else
+    fprintf(stderr, "    -T, --temp-prefix PATH     template for temporary file name [/tmp/bcftools.XXXXXX]\n");
+#endif
+    fprintf(stderr, "        --threads INT          use multithreading with <int> worker threads (BCF only) [0]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Example:\n");
      fprintf(stderr, "   # Write out the header to be modified\n");
@@ -666,6 +702,7 @@ int main_reheader(int argc, char *argv[])
      
      static struct option loptions[] =
      {
+        {"temp-prefix",1,0,'T'},
          {"fai",1,0,'f'},
          {"output",1,0,'o'},
          {"header",1,0,'h'},
@@ -673,11 +710,12 @@ int main_reheader(int argc, char *argv[])
          {"threads",1,NULL,1},
          {0,0,0,0}
      };
-    while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
      {
          switch (c)
          {
              case  1 : args->n_threads = strtol(optarg, 0, 0); break;
+            case 'T': args->tmp_prefix = optarg; break;
              case 'f': args->fai_fname = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 's': args->samples_fname = optarg; break;
@@ -704,10 +742,14 @@ int main_reheader(int argc, char *argv[])
  
      if ( args->type.format==vcf )
      {
-        if ( args->type.compression==bgzf || args->type.compression==gzip )
+        if ( args->type.compression==bgzf )
              reheader_vcf_gz(args);
-        else
+        else if ( args->type.compression==no_compression )
              reheader_vcf(args);
+        else if ( args->type.compression==gzip )
+            error("Error: cannot reheader gzip-compressed files, first convert with `bcftools view --output-type` to a supported format\n");
+        else
+            error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname);
      }
      else
          reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c

index 9f84e4cc68edc294aacb3db7f9023130501524ec..380843b970795db0e6a5e94b57621df1be569229 100644 (file)
--- a/bcftools/reheader.c.pysam.c
+++ b/bcftools/reheader.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  reheader.c -- reheader subcommand.
  
-    Copyright (C) 2014-2018 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -51,7 +51,7 @@ THE SOFTWARE.  */
  typedef struct _args_t
  {
      char **argv, *fname, *samples_fname, *header_fname, *output_fname;
-    char *fai_fname, *rm_tmpfile;
+    char *fai_fname, *rm_tmpfile, *tmp_prefix;
      htsFile *fp;
      htsFormat type;
      htsThreadPool *threads;
@@ -142,6 +142,33 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see
      free(key.s); free(val.s); free(tmp.s);
      return q;
  }
+char *init_tmp_prefix(const char *tmp_prefix) // Build a heap-allocated template string ending in "XXXXXX"; update_from_fai() passes it to mkstemp()
+{
+    char *prefix = NULL;
+    if ( tmp_prefix ) // caller supplied a prefix: use it verbatim and append the XXXXXX placeholder
+    {
+        int len = strlen(tmp_prefix); // NOTE(review): size_t narrowed to int
+        prefix = (char*) calloc(len+7,1); // len + 6 placeholder chars + NUL; calloc zero-fills, so the result is terminated. NOTE(review): NULL return is not checked
+        memcpy(prefix,tmp_prefix,len);
+        memcpy(prefix+len,"XXXXXX",6);
+    }
+    else // no prefix given: fall back to the platform default location
+    {
+        #ifdef _WIN32
+            char tmp_path[MAX_PATH];
+            int ret = GetTempPath(MAX_PATH, tmp_path);
+            if (!ret || ret > MAX_PATH)
+                error("Could not get the path to the temporary folder\n");
+            if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) // make sure the strcat below fits in tmp_path
+                error("Full path to the temporary folder is too long\n");
+            strcat(tmp_path, "/bcftools.XXXXXX");
+            prefix = strdup(tmp_path);
+        #else
+            prefix = strdup("/tmp/bcftools.XXXXXX");
+        #endif
+    }
+    return prefix; // allocated by calloc/strdup above
+}
  static void update_from_fai(args_t *args)
  {
      if ( !strcmp("-",args->fname) )
@@ -149,18 +176,7 @@ static void update_from_fai(args_t *args)
  
      faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA);
      if ( !fai ) error("Could not parse %s\n", args->fai_fname);
-#ifdef _WIN32
-    char tmp_path[MAX_PATH];
-    int ret = GetTempPath(MAX_PATH, tmp_path);
-    if (!ret || ret > MAX_PATH)
-        error("Could not get the path to the temporary folder\n");
-    if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH)
-        error("Full path to the temporary folder is too long\n");
-    strcat(tmp_path, "/bcftools-fai-header-XXXXXX");
-    args->rm_tmpfile = strdup(tmp_path);
-#else
-    args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX");
-#endif
+    args->rm_tmpfile = init_tmp_prefix(args->tmp_prefix);
      int fd = mkstemp(args->rm_tmpfile);
      if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile);
  
@@ -275,8 +291,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
      hdr->s[hdr->l] = 0;
  
      kstring_t tmp = {0,0,0};
-    i = j = n = 0;
-    while ( hdr->s[idx+i] && hdr->s[idx+i])
+    i = j = n = 0;  // i:traverse the #CHROM line 1 by 1; j:points to the last column
+    while ( hdr->s[idx+i] )
      {
          if ( hdr->s[idx+i]=='\t' )
          {
@@ -284,8 +300,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
  
              if ( ++n>9 )
              {
-                char *ori = khash_str2str_get(hash,hdr->s+idx+j);
-                kputs(ori ? ori : hdr->s+idx+j, &tmp);
+                char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+                kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
              }
              else
                  kputs(hdr->s+idx+j, &tmp);
@@ -297,8 +313,8 @@ static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int id
          }
          i++;
      }
-    char *ori = khash_str2str_get(hash,hdr->s+idx+j);
-    kputs(ori ? ori : hdr->s+idx+j, &tmp);
+    char *new_name = khash_str2str_get(hash,hdr->s+idx+j);
+    kputs(new_name ? new_name : hdr->s+idx+j, &tmp);
  
      khash_str2str_destroy_free_all(hash);
  
@@ -319,7 +335,13 @@ static void set_samples(char **samples, int nsamples, kstring_t *hdr)
          if ( hdr->s[i]=='\t' ) ncols++;
          i--;
      }
-    if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s);
+    if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) )
+    {
+        if ( i>0 && !strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",38) )
+            error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
+
+        error("Could not parse the header: %s\n", hdr->s);
+    }
  
      // Are the samples "old-sample new-sample" pairs?
      if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return;
@@ -390,7 +412,10 @@ static void reheader_vcf_gz(args_t *args)
      int nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
@@ -446,7 +471,10 @@ static void reheader_vcf(args_t *args)
      int nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
@@ -550,7 +578,10 @@ static void reheader_bcf(args_t *args, int is_compressed)
      int i, nsamples = 0;
      char **samples = NULL;
      if ( args->samples_fname )
+    {
          samples = hts_readlines(args->samples_fname, &nsamples);
+        if ( !samples || !nsamples ) error("Error reading the --samples file \"%s\"\n", args->samples_fname);
+    }
      if ( args->header_fname )
      {
          free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
@@ -641,11 +672,16 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "Usage:   bcftools reheader [OPTIONS] <in.vcf.gz>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -f, --fai <file>        update sequences and their lengths from the .fai file\n");
-    fprintf(bcftools_stderr, "    -h, --header <file>     new header\n");
-    fprintf(bcftools_stderr, "    -o, --output <file>     write output to a file [standard output]\n");
-    fprintf(bcftools_stderr, "    -s, --samples <file>    new sample names\n");
-    fprintf(bcftools_stderr, "        --threads <int>     use multithreading with <int> worker threads (BCF only) [0]\n");
+    fprintf(bcftools_stderr, "    -f, --fai FILE             update sequences and their lengths from the .fai file\n");
+    fprintf(bcftools_stderr, "    -h, --header FILE          new header\n");
+    fprintf(bcftools_stderr, "    -o, --output FILE          write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "    -s, --samples FILE         new sample names\n");
+#ifdef _WIN32
+    fprintf(bcftools_stderr, "    -T, --temp-prefix PATH     template for temporary file name [/bcftools.XXXXXX]\n");
+#else
+    fprintf(bcftools_stderr, "    -T, --temp-prefix PATH     template for temporary file name [/tmp/bcftools.XXXXXX]\n");
+#endif
+    fprintf(bcftools_stderr, "        --threads INT          use multithreading with <int> worker threads (BCF only) [0]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Example:\n");
      fprintf(bcftools_stderr, "   # Write out the header to be modified\n");
@@ -657,7 +693,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   # Reheader the file\n");
      fprintf(bcftools_stderr, "   bcftools reheader -h header.txt -o new.bcf old.bcf\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_reheader(int argc, char *argv[])
@@ -668,6 +704,7 @@ int main_reheader(int argc, char *argv[])
      
      static struct option loptions[] =
      {
+        {"temp-prefix",1,0,'T'},
          {"fai",1,0,'f'},
          {"output",1,0,'o'},
          {"header",1,0,'h'},
@@ -675,11 +712,12 @@ int main_reheader(int argc, char *argv[])
          {"threads",1,NULL,1},
          {0,0,0,0}
      };
-    while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "s:h:o:f:T:",loptions,NULL)) >= 0)
      {
          switch (c)
          {
              case  1 : args->n_threads = strtol(optarg, 0, 0); break;
+            case 'T': args->tmp_prefix = optarg; break;
              case 'f': args->fai_fname = optarg; break;
              case 'o': args->output_fname = optarg; break;
              case 's': args->samples_fname = optarg; break;
@@ -706,10 +744,14 @@ int main_reheader(int argc, char *argv[])
  
      if ( args->type.format==vcf )
      {
-        if ( args->type.compression==bgzf || args->type.compression==gzip )
+        if ( args->type.compression==bgzf )
              reheader_vcf_gz(args);
-        else
+        else if ( args->type.compression==no_compression )
              reheader_vcf(args);
+        else if ( args->type.compression==gzip )
+            error("Error: cannot reheader gzip-compressed files, first convert with `bcftools view --output-type` to a supported format\n");
+        else
+            error("Error: the compression type of \"%s\" is not recognised/supported\n", args->fname);
      }
      else
          reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip);
diff --git a/bcftools/smpl_ilist.c b/bcftools/smpl_ilist.c

index 9a77e6252bb1a0d9f82a4358141e0fc2d0dd2aff..d170db5ac235e60d42b211218b7535e002f4b0fb 100644 (file)
--- a/bcftools/smpl_ilist.c
+++ b/bcftools/smpl_ilist.c
@@ -1,5 +1,5 @@
  /* 
-    Copyright (C) 2016 Genome Research Ltd.
+    Copyright (C) 2016, 2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c

index 45fe5af3e4cf78ca2bf458c32bac4eca0006427a..85b5e2f55eabaf8dbe9fb2e0701cbcb7ee90fc8b 100644 (file)
--- a/bcftools/smpl_ilist.c.pysam.c
+++ b/bcftools/smpl_ilist.c.pysam.c
@@ -1,7 +1,7 @@
  #include "bcftools.pysam.h"
  
  /* 
-    Copyright (C) 2016 Genome Research Ltd.
+    Copyright (C) 2016, 2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/str_finder.c b/bcftools/str_finder.c

new file mode 100644 (file)

index 0000000..800cbfe
--- /dev/null
+++ b/bcftools/str_finder.c
@@ -0,0 +1,270 @@
+/*  str_finder.c -- Short Tandem Repeat finder.
+    Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include "str_finder.h"
+#include "utlist.h"
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+typedef unsigned char uc;
+
+static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
+                   int lower_only, unsigned int w) { // Append to *list the repeat of unit length rlen that find_STR() detected at cons[pos]
+    rep_ele *el, *tmp, *prev;
+    char *cp1, *cp2, *cp_end;
+    int i;
+
+    // Already handled this in previous overlap? (the tail element already spans pos-2*rlen+1 .. pos)
+    if (*list) {
+       tmp = DL_TAIL(*list);
+       if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
+           return;
+    }
+
+    // Find current and last occurrence of repeated word.
+
+    cp2 = &cons[pos+1]; // first character after the detected position
+    // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
+
+    for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads: step back until rlen-1 non-'*' characters are counted
+       if (*cp1 == '*')
+           continue;
+       else
+           i++;
+    while (*cp1 == '*') // skip any remaining pads so cp1 rests on a non-pad character
+       cp1--;
+
+
+    // Scan ahead to see how much further it goes. cp1 trails cp2, so the text is compared against itself.
+    cp_end = &cons[clen];
+    while (cp2 < cp_end) {
+       if (*cp1 != *cp2)
+           break;
+
+       w<<=2; // w is passed by value, so these updates stay local to this call
+       w|=*cp2;
+       cp1++;
+       cp2++;
+    }
+
+    if (!(el = malloc(sizeof(*el)))) // NOTE(review): on allocation failure the repeat is silently not recorded
+       return;
+
+    el->end   = pos + cp2-&cons[pos+1]; // pos plus the number of characters matched by the scan above
+    el->rep_len = rlen;
+    pos++;
+    while (rlen--) { // move pos back by two non-'*' characters per unit character, i.e. over two copies of the unit; NOTE(review): no lower bound on pos
+       while (cons[--pos] == '*');
+       while (cons[--pos] == '*');
+    }
+    //pos++;
+    while (pos > 1 && cons[pos-1] == '*') pos--; // also absorb pads directly before the start
+    el->start = pos;
+
+    // Check it meets the lower-case only criteria (at least one lower-case character in start..end)
+    if (lower_only) {
+       int lc = 0;
+       for (i = el->start; i <= el->end; i++) {
+           if (islower(cons[i])) {
+               lc = 1;
+               break;
+           }
+       }
+
+       if (!lc) {
+           free(el);
+           return;
+       }
+    }
+
+    // Remove any older items on the list that are entirely contained within el
+    if (*list) {
+       tmp = DL_TAIL(*list); // walk backwards from the most recently appended element
+       do {
+           prev = tmp->prev; // saved before tmp may be deleted below
+           if (tmp->end < el->start) // this element ends before el starts: stop scanning
+               break;
+
+           if (tmp->start >= el->start) { // only the start coordinate is compared here
+               DL_DELETE(*list, tmp);
+               free(tmp);
+           }
+
+           if (tmp == DL_HEAD(*list)) // NOTE(review): tmp may have been freed just above; only its address is compared
+               break;
+           tmp = prev;
+       } while (*list);
+    }
+
+    DL_APPEND(*list, el);
+
+    return;
+}
+
+/*
+ * Finds tandem repeats whose repeat unit is 1 to 8 characters long.
+ * Note this assumes cons is 0-3, so N of 4 may rarely give false hits.
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ *         NULL if nothing was recorded (add_rep also records nothing when malloc fails).
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only) {
+    int i, j;
+    uint32_t w = 0; // rolling history: shifted left by 2 bits for every non-pad character
+    rep_ele *reps = NULL;
+
+    for (i = j = 0; i < len && j < 15; i++) { // warm-up phase: j counts the non-pad characters seen before the current one
+       if (cons[i] == '*') continue; // pads are skipped
+
+       w <<= 2;
+       w |= cons[i]; // the character value is OR-ed in unmasked
+       //printf("%3d %c w=%08x\n", i, cons[i], w);
+       if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003)) // low 2*k bits equal the next 2*k bits for k=1; j>=2*k-1 means 2*k characters were shifted in
+           add_rep(&reps, cons, len, i, 1, lower_only, w);
+       if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f)) // independent ifs: several unit lengths may be reported at one position
+           add_rep(&reps, cons, len, i, 2, lower_only, w);
+       if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
+           add_rep(&reps, cons, len, i, 3, lower_only, w);
+       if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
+           add_rep(&reps, cons, len, i, 4, lower_only, w);
+       if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
+           add_rep(&reps, cons, len, i, 5, lower_only, w);
+       if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
+           add_rep(&reps, cons, len, i, 6, lower_only, w);
+       if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
+           add_rep(&reps, cons, len, i, 7, lower_only, w);
+
+       j++;
+    }
+
+    for (; i < len; i++) {     // remainder of the input: unit lengths are tested from 8 down to 1 and only the first match is reported
+       if (cons[i] == '*') continue;
+
+       w <<= 2;
+       w |= cons[i];
+       //printf("%3d %c w=%08x\n", i, cons[i], w);
+       if ((w&0xffff) == ((w>>16)&0xffff)) 
+           add_rep(&reps, cons, len, i, 8, lower_only, w);
+       else if ((w&0x3fff) == ((w>>14)&0x3fff)) 
+           add_rep(&reps, cons, len, i, 7, lower_only, w);
+       else if ((w&0x0fff) == ((w>>12)&0x0fff)) 
+           add_rep(&reps, cons, len, i, 6, lower_only, w);
+       else if ((w&0x03ff) == ((w>>10)&0x03ff)) 
+           add_rep(&reps, cons, len, i, 5, lower_only, w);
+       else if ((w&0x00ff) == ((w>> 8)&0x00ff)) 
+           add_rep(&reps, cons, len, i, 4, lower_only, w);
+       else if ((w&0x003f) == ((w>> 6)&0x003f)) 
+           add_rep(&reps, cons, len, i, 3, lower_only, w);
+       else if ((w&0x000f) == ((w>> 4)&0x000f)) 
+           add_rep(&reps, cons, len, i, 2, lower_only, w);
+       else if ((w&0x0003) == ((w>> 2)&0x0003)) 
+           add_rep(&reps, cons, len, i, 1, lower_only, w);
+    }
+
+    return reps;
+}
+
+/* -----------------------------------------------------------------------------
+ * Computes repeat regions in the consensus and then provides a bit mask
+ * indicating the extent of the STRs.
+ *
+ * The purpose of this is to identify where a read needs to span the entire
+ * region in order to validate how many copies of a repeat word are present.
+ * This only really has a major impact when indels are involved.
+ *
+ * For example, given this multiple alignment:
+ *
+ * S1 GATCGGACGAGAG
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              AGAGAGAGTCGGAC
+ *
+ * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
+ * (S1 and S6) sequences do not span and so we do not know which allele they
+ * match. Specifically as the pad is at the right hand end, the alignment of
+ * S6 gives incorrect weight to the consensus as it is stating AG when it
+ * may actually be ** at that point.
+ *
+ * By identifying the repeats we can soft clip as follows:
+ *
+ * S1 GATCGGACgagag
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              agagagagTCGGAC
+ *
+ * Returns an array of STR vs no-STR values.
+ *         0  => non repetitive.
+ *         1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg:  AGGGGAGGAGAAGAC
+ *       1111  1111
+ *         2222222
+ *              444444
+ * =>   011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only) { // Returns a calloc'ed array of len bytes; byte i holds the repeat bits assigned to cons[i]
+    rep_ele *reps, *elt, *tmp;
+    char *str;
+
+    str = calloc(1, len); // zero-filled, so 0 means "no repeat"; NOTE(review): NULL return is not checked
+    reps = find_STR(cons, len, lower_only);
+
+    DL_FOREACH_SAFE(reps, elt, tmp) { // each element is deleted and freed at the end of its iteration
+       int i, v = 0;
+       
+       //printf("%2d .. %2d %.*s\n", elt->start, elt->end,
+       //       elt->end - elt->start+1, &cons[elt->start]);
+
+       // What is there? Collect the bits already set across the repeat plus one position on either side.
+       for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
+           v |= str[i];
+
+       for (i = 0; i < 8; i++) { // find the lowest bit not yet used in that window
+           if (!(v&(1<<i)))
+               break;
+       }
+       v = (i == 8) ? 1 : (1<<i); // all eight bits taken: fall back to bit 0
+
+       // Add new if available, or just overload 1 if not
+       for (i = elt->start; i <= elt->end; i++)
+           str[i] |= v;
+
+       DL_DELETE(reps, elt);
+       free(elt);
+    }
+
+    return str;
+}
diff --git a/bcftools/str_finder.c.pysam.c b/bcftools/str_finder.c.pysam.c

new file mode 100644 (file)

index 0000000..296c867
--- /dev/null
+++ b/bcftools/str_finder.c.pysam.c
@@ -0,0 +1,272 @@
+#include "bcftools.pysam.h"
+
+/*  str_finder.c -- Short Tandem Repeat finder.
+    Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <ctype.h>
+
+#include "str_finder.h"
+#include "utlist.h"
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+typedef unsigned char uc;
+
+/*
+ * Records one tandem repeat, detected by find_STR at consensus index pos,
+ * as a new rep_ele appended to *list.
+ *
+ * list        head pointer of the doubly-linked list of repeats found so far
+ * cons        consensus sequence; '*' characters are treated as pads
+ * clen        number of characters in cons
+ * pos         index in cons at which the repeat was detected
+ * rlen        length of the repeat unit
+ * lower_only  if non-zero, discard repeats containing no lower-case character
+ * w           caller's rolling word; only the local copy is updated here and
+ *             it is not read again within this function
+ *
+ * Returns without adding anything if the tail of the list already covers
+ * this repeat, if malloc fails, or if the lower_only test rejects it.
+ */
+static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
+                   int lower_only, unsigned int w) {
+    rep_ele *el, *tmp, *prev;
+    char *cp1, *cp2, *cp_end;
+    int i;
+
+    // Already handled this in previous overlap?
+    if (*list) {
+       tmp = DL_TAIL(*list);
+       if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
+           return;
+    }
+
+    // Find current and last occurrence of repeated word.
+
+    // cp2 is the first character after the detection point.
+    cp2 = &cons[pos+1];
+    // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
+
+    // Walk cp1 backwards from pos, counting only non-pad characters, so that
+    // it ends up one repeat unit behind cp2 once pads are discounted.
+    for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads
+       if (*cp1 == '*')
+           continue;
+       else
+           i++;
+    while (*cp1 == '*')
+       cp1--;
+
+
+    // Scan ahead to see how much further it goes.
+    // Both pointers advance in lock step until a mismatch or the end of cons.
+    cp_end = &cons[clen];
+    while (cp2 < cp_end) {
+       if (*cp1 != *cp2)
+           break;
+
+       w<<=2;
+       w|=*cp2;
+       cp1++;
+       cp2++;
+    }
+
+    if (!(el = malloc(sizeof(*el))))
+       return;
+
+    // cp2 stopped one past the last matching character, so end is inclusive.
+    el->end   = pos + cp2-&cons[pos+1];
+    el->rep_len = rlen;
+    // Step back over two repeat units' worth of non-pad characters to find
+    // the start.  rlen is consumed by this loop; rep_len was saved above.
+    // NOTE(review): the --pos scans have no lower-bound check; this presumably
+    // relies on find_STR having seen at least 2*rlen non-pad characters up to
+    // pos -- confirm.
+    pos++;
+    while (rlen--) {
+       while (cons[--pos] == '*');
+       while (cons[--pos] == '*');
+    }
+    //pos++;
+    // Extend the start leftwards over any pads immediately before it.
+    while (pos > 1 && cons[pos-1] == '*') pos--;
+    el->start = pos;
+
+    // Check it meets the lower-case only criteria
+    if (lower_only) {
+       int lc = 0;
+       for (i = el->start; i <= el->end; i++) {
+           if (islower(cons[i])) {
+               lc = 1;
+               break;
+           }
+       }
+
+       if (!lc) {
+           free(el);
+           return;
+       }
+    }
+
+    // Remove any older items on the list that are entirely contained within el
+    // The walk runs from the tail towards the head and stops at the first
+    // element that ends before el starts.
+    if (*list) {
+       tmp = DL_TAIL(*list);
+       do {
+           // Fetch prev first: tmp may be freed below.
+           prev = tmp->prev;
+           if (tmp->end < el->start)
+               break;
+
+           if (tmp->start >= el->start) {
+               DL_DELETE(*list, tmp);
+               free(tmp);
+           }
+
+           // NOTE(review): tmp may already have been freed here; only its
+           // pointer value is compared, it is not dereferenced.
+           if (tmp == DL_HEAD(*list))
+               break;
+           tmp = prev;
+       } while (*list);
+    }
+
+    DL_APPEND(*list, el);
+
+    return;
+}
+
+/*
+ * Scans cons for short tandem repeats whose unit is 1 to 8 bases long.
+ *
+ * Each non-pad base is OR-ed into the bottom of a rolling word that is
+ * shifted left by two bits per base.  This presumes base codes 0-3; other
+ * values (e.g. N as 4) spill into neighbouring slots and may rarely give
+ * false hits.  '*' pad characters are skipped entirely.
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ *         NULL on failure.
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only) {
+    rep_ele *found = NULL;
+    uint32_t hash = 0;
+    int idx = 0, nbases = 0, unit;
+
+    // Warm-up over the first 15 non-pad bases: a unit length is only tested
+    // once two full copies fit in the bases seen so far, and every unit
+    // length that matches is reported.
+    while (idx < len && nbases < 15) {
+       if (cons[idx] != '*') {
+           hash <<= 2;
+           hash |= cons[idx];
+
+           for (unit = 1; unit <= 7; unit++) {
+               uint32_t mask = (1u << (2*unit)) - 1;
+
+               if (nbases >= 2*unit-1 &&
+                   (hash & mask) == ((hash >> (2*unit)) & mask))
+                   add_rep(&found, cons, len, idx, unit, lower_only, hash);
+           }
+
+           nbases++;
+       }
+       idx++;
+    }
+
+    // Steady state: the word now holds enough bases for every unit length,
+    // so try the longest unit first and report only the first one matching.
+    for (; idx < len; idx++) {
+       if (cons[idx] == '*')
+           continue;
+
+       hash <<= 2;
+       hash |= cons[idx];
+
+       for (unit = 8; unit >= 1; unit--) {
+           uint32_t mask = (1u << (2*unit)) - 1;
+
+           if ((hash & mask) == ((hash >> (2*unit)) & mask)) {
+               add_rep(&found, cons, len, idx, unit, lower_only, hash);
+               break;
+           }
+       }
+    }
+
+    return found;
+}
+
+/* -----------------------------------------------------------------------------
+ * Computes repeat regions in the consensus and then provides a bit mask
+ * indicating the extent of the STRs.
+ *
+ * The purpose of this is to identify where a read needs to span the entire
+ * region in order to validate how many copies of a repeat word are present.
+ * This only really has a major impact when indels are involved.
+ *
+ * For example, given this multiple alignment:
+ *
+ * S1 GATCGGACGAGAG
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              AGAGAGAGTCGGAC
+ *
+ * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
+ * (S1 and S6) sequences do not span and so we do not know which allele they
+ * match. Specifically as the pad is at the right hand end, the alignment of
+ * S6 gives incorrect weight to the consensus as it is stating AG when it
+ * may actually be ** at that point.
+ *
+ * By identifying the repeats we can soft clip as follows:
+ *
+ * S1 GATCGGACgagag
+ * S2 GATCGGACGAGAGAGAGAGAGT
+ * S3 GATCGGACGAGAGAGAGAG**TCGGAC
+ * S4     GGACGAGAGAGAGAGAGTCGGAC
+ * S5        CGAGAGAGAGAG**TCGGAC
+ * S6              agagagagTCGGAC
+ *
+ * Returns an array of STR vs no-STR values.
+ *         0  => non repetitive.
+ *         1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg:  AGGGGAGGAGAAGAC
+ *       1111  1111
+ *         2222222
+ *              444444
+ * =>   011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only) {
+    /*
+     * cons        consensus sequence, passed through to find_STR
+     * len         number of entries in cons and in the returned array
+     * lower_only  passed through to find_STR
+     *
+     * Returns a calloc()ed array of len mask bytes, or NULL if that
+     * allocation fails.  This function never frees the returned array.
+     */
+    rep_ele *reps, *elt, *tmp;
+    char *str;
+
+    // One mask byte per consensus position; calloc zero-fills it so that
+    // positions outside every repeat keep the value 0.
+    str = calloc(1, len);
+    if (!str)
+       return NULL; // No buffer to mark; stop before find_STR allocates a list
+
+    reps = find_STR(cons, len, lower_only);
+
+    DL_FOREACH_SAFE(reps, elt, tmp) {
+       int i, v = 0;
+
+       //printf("%2d .. %2d %.*s\n", elt->start, elt->end,
+       //       elt->end - elt->start+1, &cons[elt->start]);
+
+       // Gather the bits already used inside this repeat and on the one
+       // position either side of it (clamped to the array bounds).
+       for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
+           v |= str[i];
+
+       // Lowest bit number not yet used in that neighbourhood
+       for (i = 0; i < 8; i++) {
+           if (!(v&(1<<i)))
+               break;
+       }
+       // All eight bits taken: fall back to sharing bit 0
+       v = (i == 8) ? 1 : (1<<i);
+
+       // Add new if available, or just overload 1 if not
+       for (i = elt->start; i <= elt->end; i++)
+           str[i] |= v;
+
+       // The list element is no longer needed once its span is marked
+       DL_DELETE(reps, elt);
+       free(elt);
+    }
+
+    return str;
+}
diff --git a/bcftools/str_finder.h b/bcftools/str_finder.h

new file mode 100644 (file)

index 0000000..242f59e
--- /dev/null
+++ b/bcftools/str_finder.h
@@ -0,0 +1,64 @@
+/*  str_finder.h -- Short Tandem Repeat finder.
+    Originally from Crumble (https://github.com/jkbonfield/crumble)
+
+    Copyright (C) 2015-2016, 2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef _STR_FINDER_H_
+#define _STR_FINDER_H_
+
+#include "utlist.h"
+
+/*
+ * One repeat region reported by find_STR, held as a node of a utlist
+ * doubly-linked list (hence the prev and next members).
+ *
+ * start, end  indices into the consensus of the first and last position of
+ *             the region; end is inclusive
+ * rep_len     length of the repeat unit
+ */
+typedef struct rep_ele {
+    int start, end, rep_len;
+    struct rep_ele *prev;
+    struct rep_ele *next;
+} rep_ele;
+
+/*
+ * Finds repeated homopolymers up to 8-mers.
+ *
+ * If lower_only is true then it only adds STRs for regions that
+ * contain at least one lower-case base. This can be used as a marker
+ * for looking for specific types of repeats.
+ * (One use for this is to only mark STRs that overlap a heterozygous
+ * indel region.)
+ *
+ * Returns a list of rep_ele structs holding the start,end tuples of repeats;
+ *         NULL on failure.
+ */
+rep_ele *find_STR(char *cons, int len, int lower_only);
+
+/*
+ * Returns an array of STR vs no-STR values.
+ *         0  => non repetitive.
+ *         1+ => repeat with consecutive bit-number for repeat size.
+ *
+ * Eg:  AGGGGAGGAGAAGAC
+ *       1111  1111
+ *         2222222
+ *              444444
+ * =>   011331137754440
+ */
+char *cons_mark_STR(char *cons, int len, int lower_only);
+
+#endif /* _STR_FINDER_H_ */
diff --git a/bcftools/utlist.h b/bcftools/utlist.h

new file mode 100644 (file)

index 0000000..28cf8a3
--- /dev/null
+++ b/bcftools/utlist.h
@@ -0,0 +1,761 @@
+/*
+Copyright (c) 2007-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef UTLIST_H
+#define UTLIST_H
+
+#define UTLIST_VERSION 1.9.9
+
+#include <assert.h>
+
+/* 
+ * This file contains macros to manipulate singly and doubly-linked lists.
+ *
+ * 1. LL_ macros:  singly-linked lists.
+ * 2. DL_ macros:  doubly-linked lists.
+ * 3. CDL_ macros: circular doubly-linked lists.
+ *
+ * To use singly-linked lists, your structure must have a "next" pointer.
+ * To use doubly-linked lists, your structure must have "prev" and "next" pointers.
+ * Either way, the pointer to the head of the list must be initialized to NULL.
+ * 
+ * ----------------.EXAMPLE -------------------------
+ * struct item {
+ *      int id;
+ *      struct item *prev, *next;
+ * };
+ *
+ * struct item *list = NULL;
+ *
+ * int main() {
+ *      struct item *item;
+ *      ... allocate and populate item ...
+ *      DL_APPEND(list, item);
+ * }
+ * --------------------------------------------------
+ *
+ * For doubly-linked lists, the append and delete macros are O(1)
+ * For singly-linked lists, append and delete are O(n) but prepend is O(1)
+ * The sort macro is O(n log(n)) for all types of single/double/circular lists.
+ */
+
+/* These macros use decltype or the earlier __typeof GNU extension.
+   As decltype is only available in newer compilers (VS2010 or gcc 4.3+
+   when compiling c++ code), this code uses whatever method is needed
+   or, for VS2008 where neither is available, uses casting workarounds. */
+#ifdef _MSC_VER            /* MS compiler */
+#if _MSC_VER >= 1600 && defined(__cplusplus)  /* VS2010 or newer in C++ mode */
+#define LDECLTYPE(x) decltype(x)
+#else                     /* VS2008 or older (or VS2010 in C mode) */
+#define NO_DECLTYPE
+#define LDECLTYPE(x) char*
+#endif
+#elif defined(__ICCARM__)
+#define NO_DECLTYPE
+#define LDECLTYPE(x) char*
+#else                      /* GNU, Sun and other compilers */
+#define LDECLTYPE(x) __typeof(x)
+#endif
+
+/* for VS2008 we use some workarounds to get around the lack of decltype,
+ * namely, we always reassign our tmp variable to the list head if we need
+ * to dereference its prev/next pointers, and save/restore the real head.*/
+#ifdef NO_DECLTYPE
+/* Save the real list head in _tmp, then make list point at elt so that
+ * elt's members can be reached through list's own type. */
+#define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); }
+/* Read the next pointer through list (which _SV has aimed at elt). */
+#define _NEXT(elt,list,next) ((char*)((list)->next))
+/* Store to into the next pointer reached through list. */
+#define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); }
+/* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */
+/* Store to into the prev pointer reached through list. */
+#define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); }
+/* Restore the list head that _SV saved in _tmp. */
+#define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; }
+/* Assign b to a through a char* alias, sidestepping the type mismatch. */
+#define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); }
+#else 
+/* With decltype/__typeof available the temporaries have the element type,
+ * so no save/restore is needed and members are accessed via elt directly. */
+#define _SV(elt,list)
+#define _NEXT(elt,list,next) ((elt)->next)
+#define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to)
+/* #define _PREV(elt,list,prev) ((elt)->prev) */
+#define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to)
+#define _RS(list)
+#define _CASTASGN(a,b) (a)=(b)
+#endif
+
+/******************************************************************************
+ * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort    *
+ * Unwieldy variable names used here to avoid shadowing passed-in variables.  *
+ *****************************************************************************/
+/* Sort a singly-linked list in place using cmp; convenience form of
+ * LL_SORT2 for structures whose link member is named "next". */
+#define LL_SORT(list, cmp)                                                                     \
+    LL_SORT2(list, cmp, next)
+
+/* Merge sort of a singly-linked list, in place.
+ *
+ *   list  head pointer; rewritten to point at the sorted list
+ *   cmp   function or macro called as cmp(a,b); a result <= 0 emits a first
+ *   next  name of the forward-link member
+ *
+ * Each pass merges adjacent runs of _ls_insize elements; the run size
+ * doubles after every pass and the loop stops once a pass performs at most
+ * one merge.
+ *
+ * NOTE(review): under NO_DECLTYPE, _SV/_RS use a variable named _tmp, which
+ * this macro does not declare -- confirm whether such builds need it. */
+#define LL_SORT2(list, cmp, next)                                                              \
+do {                                                                                           \
+  LDECLTYPE(list) _ls_p;                                                                       \
+  LDECLTYPE(list) _ls_q;                                                                       \
+  LDECLTYPE(list) _ls_e;                                                                       \
+  LDECLTYPE(list) _ls_tail;                                                                    \
+  int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
+  if (list) {                                                                                  \
+    _ls_insize = 1;                                                                            \
+    _ls_looping = 1;                                                                           \
+    while (_ls_looping) {                                                                      \
+      _CASTASGN(_ls_p,list);                                                                   \
+      list = NULL;                                                                             \
+      _ls_tail = NULL;                                                                         \
+      _ls_nmerges = 0;                                                                         \
+      while (_ls_p) {                                                                          \
+        _ls_nmerges++;                                                                         \
+        _ls_q = _ls_p;                                                                         \
+        _ls_psize = 0;                                                                         \
+        for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
+          _ls_psize++;                                                                         \
+          _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list);                          \
+          if (!_ls_q) break;                                                                   \
+        }                                                                                      \
+        _ls_qsize = _ls_insize;                                                                \
+        while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) {                                    \
+          if (_ls_psize == 0) {                                                                \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+          } else if (_ls_qsize == 0 || !_ls_q) {                                               \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+          } else if (cmp(_ls_p,_ls_q) <= 0) {                                                  \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+          } else {                                                                             \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+          }                                                                                    \
+          if (_ls_tail) {                                                                      \
+            _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
+          } else {                                                                             \
+            _CASTASGN(list,_ls_e);                                                             \
+          }                                                                                    \
+          _ls_tail = _ls_e;                                                                    \
+        }                                                                                      \
+        _ls_p = _ls_q;                                                                         \
+      }                                                                                        \
+      if (_ls_tail) {                                                                          \
+        _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list);                     \
+      }                                                                                        \
+      if (_ls_nmerges <= 1) {                                                                  \
+        _ls_looping=0;                                                                         \
+      }                                                                                        \
+      _ls_insize *= 2;                                                                         \
+    }                                                                                          \
+  }                                                                                            \
+} while (0)
+
+
+/* Sort a doubly-linked list in place using cmp; convenience form of
+ * DL_SORT2 for structures whose link members are named "prev" and "next". */
+#define DL_SORT(list, cmp)                                                                     \
+    DL_SORT2(list, cmp, prev, next)
+
+/* Merge sort of a doubly-linked list, in place.
+ *
+ *   list  head pointer; rewritten to point at the sorted list
+ *   cmp   function or macro called as cmp(a,b); a result <= 0 emits a first
+ *   prev  name of the backward-link member
+ *   next  name of the forward-link member
+ *
+ * Same pass structure as LL_SORT2, with the prev link of every emitted
+ * element rebuilt as it is appended.  After each pass the head's prev is
+ * set to the last element and that element's next is set to NULL.
+ *
+ * NOTE(review): under NO_DECLTYPE, _SV/_RS use a variable named _tmp, which
+ * this macro does not declare -- confirm whether such builds need it. */
+#define DL_SORT2(list, cmp, prev, next)                                                        \
+do {                                                                                           \
+  LDECLTYPE(list) _ls_p;                                                                       \
+  LDECLTYPE(list) _ls_q;                                                                       \
+  LDECLTYPE(list) _ls_e;                                                                       \
+  LDECLTYPE(list) _ls_tail;                                                                    \
+  int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
+  if (list) {                                                                                  \
+    _ls_insize = 1;                                                                            \
+    _ls_looping = 1;                                                                           \
+    while (_ls_looping) {                                                                      \
+      _CASTASGN(_ls_p,list);                                                                   \
+      list = NULL;                                                                             \
+      _ls_tail = NULL;                                                                         \
+      _ls_nmerges = 0;                                                                         \
+      while (_ls_p) {                                                                          \
+        _ls_nmerges++;                                                                         \
+        _ls_q = _ls_p;                                                                         \
+        _ls_psize = 0;                                                                         \
+        for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
+          _ls_psize++;                                                                         \
+          _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list);                          \
+          if (!_ls_q) break;                                                                   \
+        }                                                                                      \
+        _ls_qsize = _ls_insize;                                                                \
+        while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) {                                    \
+          if (_ls_psize == 0) {                                                                \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+          } else if (_ls_qsize == 0 || !_ls_q) {                                               \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+          } else if (cmp(_ls_p,_ls_q) <= 0) {                                                  \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+          } else {                                                                             \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+          }                                                                                    \
+          if (_ls_tail) {                                                                      \
+            _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
+          } else {                                                                             \
+            _CASTASGN(list,_ls_e);                                                             \
+          }                                                                                    \
+          _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list);                     \
+          _ls_tail = _ls_e;                                                                    \
+        }                                                                                      \
+        _ls_p = _ls_q;                                                                         \
+      }                                                                                        \
+      _CASTASGN(list->prev, _ls_tail);                                                         \
+      _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list);                       \
+      if (_ls_nmerges <= 1) {                                                                  \
+        _ls_looping=0;                                                                         \
+      }                                                                                        \
+      _ls_insize *= 2;                                                                         \
+    }                                                                                          \
+  }                                                                                            \
+} while (0)
+
+
+/* First element of a doubly-linked list: the head pointer itself. */
+#define DL_HEAD(list) (list)
+/* Last element of a doubly-linked list, or NULL when the list is empty.
+ * Reads the head's prev member, which DL_SORT2 sets to the last element. */
+#define DL_TAIL(list) ((list) ? (list)->prev : NULL)
+
+#define CDL_SORT(list, cmp)                                                                    \
+    CDL_SORT2(list, cmp, prev, next)
+
+#define CDL_SORT2(list, cmp, prev, next)                                                       \
+do {                                                                                           \
+  LDECLTYPE(list) _ls_p;                                                                       \
+  LDECLTYPE(list) _ls_q;                                                                       \
+  LDECLTYPE(list) _ls_e;                                                                       \
+  LDECLTYPE(list) _ls_tail;                                                                    \
+  LDECLTYPE(list) _ls_oldhead;                                                                 \
+  LDECLTYPE(list) _tmp;                                                                        \
+  int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
+  if (list) {                                                                                  \
+    _ls_insize = 1;                                                                            \
+    _ls_looping = 1;                                                                           \
+    while (_ls_looping) {                                                                      \
+      _CASTASGN(_ls_p,list);                                                                   \
+      _CASTASGN(_ls_oldhead,list);                                                             \
+      list = NULL;                                                                             \
+      _ls_tail = NULL;                                                                         \
+      _ls_nmerges = 0;                                                                         \
+      while (_ls_p) {                                                                          \
+        _ls_nmerges++;                                                                         \
+        _ls_q = _ls_p;                                                                         \
+        _ls_psize = 0;                                                                         \
+        for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
+          _ls_psize++;                                                                         \
+          _SV(_ls_q,list);                                                                     \
+          if (_NEXT(_ls_q,list,next) == _ls_oldhead) {                                         \
+            _ls_q = NULL;                                                                      \
+          } else {                                                                             \
+            _ls_q = _NEXT(_ls_q,list,next);                                                    \
+          }                                                                                    \
+          _RS(list);                                                                           \
+          if (!_ls_q) break;                                                                   \
+        }                                                                                      \
+        _ls_qsize = _ls_insize;                                                                \
+        while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) {                                    \
+          if (_ls_psize == 0) {                                                                \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+            if (_ls_q == _ls_oldhead) { _ls_q = NULL; }                                        \
+          } else if (_ls_qsize == 0 || !_ls_q) {                                               \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+            if (_ls_p == _ls_oldhead) { _ls_p = NULL; }                                        \
+          } else if (cmp(_ls_p,_ls_q) <= 0) {                                                  \
+            _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
+              _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
+            if (_ls_p == _ls_oldhead) { _ls_p = NULL; }                                        \
+          } else {                                                                             \
+            _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
+              _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
+            if (_ls_q == _ls_oldhead) { _ls_q = NULL; }                                        \
+          }                                                                                    \
+          if (_ls_tail) {                                                                      \
+            _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
+          } else {                                                                             \
+            _CASTASGN(list,_ls_e);                                                             \
+          }                                                                                    \
+          _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list);                     \
+          _ls_tail = _ls_e;                                                                    \
+        }                                                                                      \
+        _ls_p = _ls_q;                                                                         \
+      }                                                                                        \
+      _CASTASGN(list->prev,_ls_tail);                                                          \
+      _CASTASGN(_tmp,list);                                                                    \
+      _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list);                       \
+      if (_ls_nmerges <= 1) {                                                                  \
+        _ls_looping=0;                                                                         \
+      }                                                                                        \
+      _ls_insize *= 2;                                                                         \
+    }                                                                                          \
+  }                                                                                            \
+} while (0)
+
+/******************************************************************************
+ * singly linked list macros (non-circular)                                   *
+ *****************************************************************************/
+#define LL_PREPEND(head,add)                                                                   \
+    LL_PREPEND2(head,add,next)
+
+#define LL_PREPEND2(head,add,next)                                                             \
+do {                                                                                           \
+  (add)->next = head;                                                                          \
+  head = add;                                                                                  \
+} while (0)
+
+#define LL_CONCAT(head1,head2)                                                                 \
+    LL_CONCAT2(head1,head2,next)
+
+#define LL_CONCAT2(head1,head2,next)                                                           \
+do {                                                                                           \
+  LDECLTYPE(head1) _tmp;                                                                       \
+  if (head1) {                                                                                 \
+    _tmp = head1;                                                                              \
+    while (_tmp->next) { _tmp = _tmp->next; }                                                  \
+    _tmp->next=(head2);                                                                        \
+  } else {                                                                                     \
+    (head1)=(head2);                                                                           \
+  }                                                                                            \
+} while (0)
+
+#define LL_APPEND(head,add)                                                                    \
+    LL_APPEND2(head,add,next)
+
+#define LL_APPEND2(head,add,next)                                                              \
+do {                                                                                           \
+  LDECLTYPE(head) _tmp;                                                                        \
+  (add)->next=NULL;                                                                            \
+  if (head) {                                                                                  \
+    _tmp = head;                                                                               \
+    while (_tmp->next) { _tmp = _tmp->next; }                                                  \
+    _tmp->next=(add);                                                                          \
+  } else {                                                                                     \
+    (head)=(add);                                                                              \
+  }                                                                                            \
+} while (0)
+
+#define LL_DELETE(head,del)                                                                    \
+    LL_DELETE2(head,del,next)
+
+#define LL_DELETE2(head,del,next)                                                              \
+do {                                                                                           \
+  LDECLTYPE(head) _tmp;                                                                        \
+  if ((head) == (del)) {                                                                       \
+    (head)=(head)->next;                                                                       \
+  } else {                                                                                     \
+    _tmp = head;                                                                               \
+    while (_tmp->next && (_tmp->next != (del))) {                                              \
+      _tmp = _tmp->next;                                                                       \
+    }                                                                                          \
+    if (_tmp->next) {                                                                          \
+      _tmp->next = ((del)->next);                                                              \
+    }                                                                                          \
+  }                                                                                            \
+} while (0)
+
+/* Here are VS2008 replacements for LL_APPEND and LL_DELETE */
+#define LL_APPEND_VS2008(head,add)            /* append using the default `next` field */     \
+    LL_APPEND2_VS2008(head,add,next)
+
+#define LL_APPEND2_VS2008(head,add,next)      /* append without declaring a typed temporary */ \
+do {                                                                                           \
+  if (head) {                                                                                  \
+    (add)->next = head;     /* use add->next as a temp variable */                             \
+    while ((add)->next->next) { (add)->next = (add)->next->next; } /* advance to last node */  \
+    (add)->next->next=(add);                  /* old last node now links to add */             \
+  } else {                                                                                     \
+    (head)=(add);                             /* empty list: add becomes the head */           \
+  }                                                                                            \
+  (add)->next=NULL;                           /* discard the cursor; add ends the list */      \
+} while (0)
+
+#define LL_DELETE_VS2008(head,del)            /* delete using the default `next` field */     \
+    LL_DELETE2_VS2008(head,del,next)
+
+#define LL_DELETE2_VS2008(head,del,next)      /* unlink del, using head itself as the cursor */ \
+do {                                                                                           \
+  if ((head) == (del)) {                                                                       \
+    (head)=(head)->next;                      /* del is the first element */                   \
+  } else {                                                                                     \
+    char *_tmp = (char*)(head);               /* remember the real head, untyped */            \
+    while ((head)->next && ((head)->next != (del))) {                                          \
+      head = (head)->next;                    /* move head forward towards del */              \
+    }                                                                                          \
+    if ((head)->next) {                       /* non-NULL here means the successor is del */   \
+      (head)->next = ((del)->next);                                                            \
+    }                                                                                          \
+    {                                                                                          \
+      char **_head_alias = (char**)&(head);   /* write the saved head back via a char** */     \
+      *_head_alias = _tmp;                                                                     \
+    }                                                                                          \
+  }                                                                                            \
+} while (0)
+#ifdef NO_DECLTYPE  /* remap the generic names onto the *_VS2008 variants defined above */
+#undef LL_APPEND
+#define LL_APPEND LL_APPEND_VS2008
+#undef LL_DELETE
+#define LL_DELETE LL_DELETE_VS2008
+#undef LL_DELETE2
+#define LL_DELETE2 LL_DELETE2_VS2008
+#undef LL_APPEND2
+#define LL_APPEND2 LL_APPEND2_VS2008
+#undef LL_CONCAT /* no LL_CONCAT_VS2008 */
+#undef DL_CONCAT /* no DL_CONCAT_VS2008; NOTE(review): DL_CONCAT is #defined further down this header, so this #undef only has an effect if it was also defined earlier -- confirm */
+#endif
+/* end VS2008 replacements */
+
+#define LL_COUNT(head,el,counter)                                                              \
+    LL_COUNT2(head,el,counter,next)                                                            \
+
+#define LL_COUNT2(head,el,counter,next)                                                        \
+{                                                                                              \
+    counter = 0;                                                                               \
+    LL_FOREACH2(head,el,next){ ++counter; }                                                    \
+}
+
+#define LL_FOREACH(head,el)                                                                    \
+    LL_FOREACH2(head,el,next)
+
+#define LL_FOREACH2(head,el,next)                                                              \
+    for(el=head;el;el=(el)->next)
+
+#define LL_FOREACH_SAFE(head,el,tmp)                                                           \
+    LL_FOREACH_SAFE2(head,el,tmp,next)
+
+#define LL_FOREACH_SAFE2(head,el,tmp,next)                                                     \
+  for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp)
+
+#define LL_SEARCH_SCALAR(head,out,field,val)                                                   \
+    LL_SEARCH_SCALAR2(head,out,field,val,next)
+
+#define LL_SEARCH_SCALAR2(head,out,field,val,next)                                             \
+do {                                                                                           \
+    LL_FOREACH2(head,out,next) {                                                               \
+      if ((out)->field == (val)) break;                                                        \
+    }                                                                                          \
+} while(0) 
+
+#define LL_SEARCH(head,out,elt,cmp)                                                            \
+    LL_SEARCH2(head,out,elt,cmp,next)
+
+#define LL_SEARCH2(head,out,elt,cmp,next)                                                      \
+do {                                                                                           \
+    LL_FOREACH2(head,out,next) {                                                               \
+      if ((cmp(out,elt))==0) break;                                                            \
+    }                                                                                          \
+} while(0) 
+
+#define LL_REPLACE_ELEM(head, el, add)                                                         \
+do {                                                                                           \
+ LDECLTYPE(head) _tmp;                                                                         \
+ assert(head != NULL);                                                                         \
+ assert(el != NULL);                                                                           \
+ assert(add != NULL);                                                                          \
+ (add)->next = (el)->next;                                                                     \
+ if ((head) == (el)) {                                                                         \
+  (head) = (add);                                                                              \
+ } else {                                                                                      \
+  _tmp = head;                                                                                 \
+  while (_tmp->next && (_tmp->next != (el))) {                                                 \
+   _tmp = _tmp->next;                                                                          \
+  }                                                                                            \
+  if (_tmp->next) {                                                                            \
+    _tmp->next = (add);                                                                        \
+  }                                                                                            \
+ }                                                                                             \
+} while (0)
+
+#define LL_PREPEND_ELEM(head, el, add)                                                         \
+do {                                                                                           \
+ LDECLTYPE(head) _tmp;                                                                         \
+ assert(head != NULL);                                                                         \
+ assert(el != NULL);                                                                           \
+ assert(add != NULL);                                                                          \
+ (add)->next = (el);                                                                           \
+ if ((head) == (el)) {                                                                         \
+  (head) = (add);                                                                              \
+ } else {                                                                                      \
+  _tmp = head;                                                                                 \
+  while (_tmp->next && (_tmp->next != (el))) {                                                 \
+   _tmp = _tmp->next;                                                                          \
+  }                                                                                            \
+  if (_tmp->next) {                                                                            \
+    _tmp->next = (add);                                                                        \
+  }                                                                                            \
+ }                                                                                             \
+} while (0)                                                                                    \
+
+
+/******************************************************************************
+ * doubly linked list macros (non-circular)                                   *
+ *****************************************************************************/
+#define DL_PREPEND(head,add)                                                                   \
+    DL_PREPEND2(head,add,prev,next)
+
+#define DL_PREPEND2(head,add,prev,next)                                                        \
+do {                                                                                           \
+ (add)->next = head;                                                                           \
+ if (head) {                                                                                   \
+   (add)->prev = (head)->prev;                                                                 \
+   (head)->prev = (add);                                                                       \
+ } else {                                                                                      \
+   (add)->prev = (add);                                                                        \
+ }                                                                                             \
+ (head) = (add);                                                                               \
+} while (0)
+
+#define DL_APPEND(head,add)                                                                    \
+    DL_APPEND2(head,add,prev,next)
+
+#define DL_APPEND2(head,add,prev,next)                                                         \
+do {                                                                                           \
+  if (head) {                                                                                  \
+      (add)->prev = (head)->prev;                                                              \
+      (head)->prev->next = (add);                                                              \
+      (head)->prev = (add);                                                                    \
+      (add)->next = NULL;                                                                      \
+  } else {                                                                                     \
+      (head)=(add);                                                                            \
+      (head)->prev = (head);                                                                   \
+      (head)->next = NULL;                                                                     \
+  }                                                                                            \
+} while (0) 
+
+#define DL_CONCAT(head1,head2)                                                                 \
+    DL_CONCAT2(head1,head2,prev,next)
+
+#define DL_CONCAT2(head1,head2,prev,next)                                                      \
+do {                                                                                           \
+  LDECLTYPE(head1) _tmp;                                                                       \
+  if (head2) {                                                                                 \
+    if (head1) {                                                                               \
+        _tmp = (head2)->prev;                                                                  \
+        (head2)->prev = (head1)->prev;                                                         \
+        (head1)->prev->next = (head2);                                                         \
+        (head1)->prev = _tmp;                                                                  \
+    } else {                                                                                   \
+        (head1)=(head2);                                                                       \
+    }                                                                                          \
+  }                                                                                            \
+} while (0) 
+
+#define DL_DELETE(head,del)                                                                    \
+    DL_DELETE2(head,del,prev,next)
+
+#define DL_DELETE2(head,del,prev,next)                                                         \
+do {                                                                                           \
+  assert((del)->prev != NULL);                                                                 \
+  if ((del)->prev == (del)) {                                                                  \
+      (head)=NULL;                                                                             \
+  } else if ((del)==(head)) {                                                                  \
+      (del)->next->prev = (del)->prev;                                                         \
+      (head) = (del)->next;                                                                    \
+  } else {                                                                                     \
+      (del)->prev->next = (del)->next;                                                         \
+      if ((del)->next) {                                                                       \
+          (del)->next->prev = (del)->prev;                                                     \
+      } else {                                                                                 \
+          (head)->prev = (del)->prev;                                                          \
+      }                                                                                        \
+  }                                                                                            \
+} while (0) 
+
+#define DL_COUNT(head,el,counter)                                                              \
+    DL_COUNT2(head,el,counter,next)                                                            \
+
+#define DL_COUNT2(head,el,counter,next)                                                        \
+{                                                                                              \
+    counter = 0;                                                                               \
+    DL_FOREACH2(head,el,next){ ++counter; }                                                    \
+}
+
+#define DL_FOREACH(head,el)                                                                    \
+    DL_FOREACH2(head,el,next)
+
+#define DL_FOREACH2(head,el,next)                                                              \
+    for(el=head;el;el=(el)->next)
+
+/* this version is safe for deleting the elements during iteration */
+#define DL_FOREACH_SAFE(head,el,tmp)                                                           \
+    DL_FOREACH_SAFE2(head,el,tmp,next)
+
+#define DL_FOREACH_SAFE2(head,el,tmp,next)                                                     \
+  for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp)
+
+/* The doubly linked search macros are plain aliases of their singly linked
+ * counterparts; the argument lists are therefore the same. */
+#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR
+#define DL_SEARCH LL_SEARCH
+#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2
+#define DL_SEARCH2 LL_SEARCH2
+
+#define DL_REPLACE_ELEM(head, el, add)                                                         \
+do {                                                                                           \
+ assert(head != NULL);                                                                         \
+ assert(el != NULL);                                                                           \
+ assert(add != NULL);                                                                          \
+ if ((head) == (el)) {                                                                         \
+  (head) = (add);                                                                              \
+  (add)->next = (el)->next;                                                                    \
+  if ((el)->next == NULL) {                                                                    \
+   (add)->prev = (add);                                                                        \
+  } else {                                                                                     \
+   (add)->prev = (el)->prev;                                                                   \
+   (add)->next->prev = (add);                                                                  \
+  }                                                                                            \
+ } else {                                                                                      \
+  (add)->next = (el)->next;                                                                    \
+  (add)->prev = (el)->prev;                                                                    \
+  (add)->prev->next = (add);                                                                   \
+  if ((el)->next == NULL) {                                                                    \
+   (head)->prev = (add);                                                                       \
+  } else {                                                                                     \
+   (add)->next->prev = (add);                                                                  \
+  }                                                                                            \
+ }                                                                                             \
+} while (0)
+
+#define DL_PREPEND_ELEM(head, el, add)                                                         \
+do {                                                                                           \
+ assert(head != NULL);                                                                         \
+ assert(el != NULL);                                                                           \
+ assert(add != NULL);                                                                          \
+ (add)->next = (el);                                                                           \
+ (add)->prev = (el)->prev;                                                                     \
+ (el)->prev = (add);                                                                           \
+ if ((head) == (el)) {                                                                         \
+  (head) = (add);                                                                              \
+ } else {                                                                                      \
+  (add)->prev->next = (add);                                                                   \
+ }                                                                                             \
+} while (0)                                                                                    \
+
+
+/******************************************************************************
+ * circular doubly linked list macros                                         *
+ *****************************************************************************/
+#define CDL_PREPEND(head,add)                                                                  \
+    CDL_PREPEND2(head,add,prev,next)
+
+#define CDL_PREPEND2(head,add,prev,next)                                                       \
+do {                                                                                           \
+ if (head) {                                                                                   \
+   (add)->prev = (head)->prev;                                                                 \
+   (add)->next = (head);                                                                       \
+   (head)->prev = (add);                                                                       \
+   (add)->prev->next = (add);                                                                  \
+ } else {                                                                                      \
+   (add)->prev = (add);                                                                        \
+   (add)->next = (add);                                                                        \
+ }                                                                                             \
+(head)=(add);                                                                                  \
+} while (0)
+
+#define CDL_DELETE(head,del)                                                                   \
+    CDL_DELETE2(head,del,prev,next)
+
+#define CDL_DELETE2(head,del,prev,next)                                                        \
+do {                                                                                           \
+  if ( ((head)==(del)) && ((head)->next == (head))) {                                          \
+      (head) = 0L;                                                                             \
+  } else {                                                                                     \
+     (del)->next->prev = (del)->prev;                                                          \
+     (del)->prev->next = (del)->next;                                                          \
+     if ((del) == (head)) (head)=(del)->next;                                                  \
+  }                                                                                            \
+} while (0) 
+
+#define CDL_COUNT(head,el,counter)                                                             \
+    CDL_COUNT2(head,el,counter,next)                                                           \
+
+#define CDL_COUNT2(head, el, counter,next)                                                     \
+{                                                                                              \
+    counter = 0;                                                                               \
+    CDL_FOREACH2(head,el,next){ ++counter; }                                                   \
+}
+
+#define CDL_FOREACH(head,el)                  /* iterate using the default `next` field */    \
+    CDL_FOREACH2(head,el,next)
+
+#define CDL_FOREACH2(head,el,next)            /* el becomes 0 when its next link is head */   \
+    for(el=head;el;el=((el)->next==head ? 0L : (el)->next)) 
+
+#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2)   /* variant that saves el's successor in tmp2 */ \
+    CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next)
+
+#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next)                                         \
+  for((el)=(head), ((tmp1)=(head)?((head)->prev):NULL);  /* tmp1 = head's prev, or NULL */    \
+      (el) && ((tmp2)=(el)->next, 1);                    /* save next before the body */      \
+      ((el) = (((el)==(tmp1)) ? 0L : (tmp2))))           /* stop once el was tmp1 */
+
+#define CDL_SEARCH_SCALAR(head,out,field,val)       /* search along the next pointers */       \
+    CDL_SEARCH_SCALAR2(head,out,field,val,next)
+
+#define CDL_SEARCH_SCALAR2(head,out,field,val,next)                                            \
+do {                                                                                           \
+    CDL_FOREACH2(head,out,next) {        /* out is the cursor; 0 when the walk completes */    \
+      if ((out)->field == (val)) break;  /* stop on the first node whose field equals val */   \
+    }                                                                                          \
+} while(0)
+
+#define CDL_SEARCH(head,out,elt,cmp)                /* search along the next pointers */       \
+    CDL_SEARCH2(head,out,elt,cmp,next)
+
+#define CDL_SEARCH2(head,out,elt,cmp,next)                                                     \
+do {                                                                                           \
+    CDL_FOREACH2(head,out,next) {        /* out is the cursor; 0 when the walk completes */    \
+      if ((cmp(out,elt))==0) break;      /* stop on the first node for which cmp gives 0 */    \
+    }                                                                                          \
+} while(0)
+
+#define CDL_REPLACE_ELEM(head, el, add)   /* put add in el's place; el's links are untouched */ \
+do {                                                                                           \
+ assert((head) != NULL);                                                                       \
+ assert((el) != NULL);                                                                         \
+ assert((add) != NULL);                                                                        \
+ if ((el)->next == (el)) {                 /* el is the only node: add links to itself */      \
+  (add)->next = (add);                                                                         \
+  (add)->prev = (add);                                                                         \
+  (head) = (add);                                                                              \
+ } else {                                  /* splice add between el's two neighbours */        \
+  (add)->next = (el)->next;                                                                    \
+  (add)->prev = (el)->prev;                                                                    \
+  (add)->next->prev = (add);                                                                   \
+  (add)->prev->next = (add);                                                                   \
+  if ((head) == (el)) {                    /* keep head valid when the head was replaced */    \
+   (head) = (add);                                                                             \
+  }                                                                                            \
+ }                                                                                             \
+} while (0)
+
+#define CDL_PREPEND_ELEM(head, el, add)   /* insert add immediately before el */               \
+do {                                                                                           \
+ assert((head) != NULL);                                                                       \
+ assert((el) != NULL);                                                                         \
+ assert((add) != NULL);                                                                        \
+ (add)->next = (el);                                                                           \
+ (add)->prev = (el)->prev;                                                                     \
+ (el)->prev = (add);                                                                           \
+ (add)->prev->next = (add);                                                                    \
+ if ((head) == (el)) {                     /* inserting before the head makes add the head */  \
+  (head) = (add);                                                                              \
+ }                                                                                             \
+} while (0)
+
+#endif /* UTLIST_H */
+
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c

index 369784795129d7bcf0e7c7ad815e595ad61ff46d..0976fe3984680b8a7c9d236a397cc7bc3e6839b7 100644 (file)
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -1,6 +1,6 @@
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -26,6 +26,7 @@ THE SOFTWARE.  */
  #include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -70,6 +71,7 @@ annot_line_t;
  #define REPLACE_ALL      1      // replace both missing and existing values
  #define REPLACE_NON_MISSING 2   // replace only if tgt is not missing
  #define SET_OR_APPEND    3      // set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE      4      // do not set, just match the value -c ~ID
  #define MM_FIRST   0    // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
  #define MM_APPEND  1    // append, possibly multiple times
  #define MM_UNIQUE  2    // append, only unique values
@@ -77,19 +79,26 @@ annot_line_t;
  #define MM_AVG     4
  #define MM_MIN     5
  #define MM_MAX     6
+#define MM_APPEND_MISSING 7     // missing values will be transferred as well
  typedef struct _annot_col_t
  {
      int icol, replace, number;  // number: one of BCF_VL_* types
      char *hdr_key_src, *hdr_key_dst;
-    int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+    // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
+    // or 1 on (repeated partial updates) concluded with a src=NULL call
+    int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
+    int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
      int merge_method;               // one of the MM_* defines
      khash_t(str2int) *mm_str_hash;  // lookup table to ensure uniqueness of added string values
      kstring_t mm_kstr;
-    double
+    size_t
          mm_dbl_nalloc,  // the allocated size --merge-logic values array
          mm_dbl_nused,   // the number of used elements in the mm_dbl array
-        mm_dbl_ndat,    // the number of merged rows (for calculating the average)
+        mm_dbl_ndat;    // the number of merged rows (for calculating the average)
+    double
          *mm_dbl;
+    void *ptr;
+    int mptr, done;
  }
  annot_col_t;
  
@@ -103,12 +112,12 @@ annot_col_t;
  typedef struct _args_t
  {
      bcf_srs_t *files;
-    bcf_hdr_t *hdr, *hdr_out;
+    bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
      htsFile *out_fh;
      int output_type, n_threads;
      bcf_sr_regions_t *tgts;
  
-    regidx_t *tgt_idx;
+    regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
      regitr_t *tgt_itr;
      int tgt_is_bed;
  
@@ -123,10 +132,13 @@ typedef struct _args_t
  
      vcmp_t *vcmp;           // for matching annotation and VCF lines by allele
      annot_line_t *alines;   // buffered annotation lines
-    int nalines, malines;
+    annot_line_t *aline_missing;
+    uint32_t *srt_alines;   // sorted indexes (iALT<<16 || iAline)
+    int nalines, malines, nsrt_alines, msrt_alines;
      int ref_idx, alt_idx, chr_idx, beg_idx, end_idx;   // -1 if not present
      annot_col_t *cols;      // column indexes and setters
      int ncols;
+    int match_id;           // set iff `-c ~ID` given
  
      char *set_ids_fmt;
      convert_t *set_ids;
@@ -144,9 +156,10 @@ typedef struct _args_t
      kstring_t tmpks;
  
      char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
-    char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
-    char *merge_method_str;
+    char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
+    kstring_t merge_method_str;
      int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
+    int columns_is_file, has_append_mode;
  }
  args_t;
  
@@ -195,6 +208,8 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
      for (i=0; i<line->n_info; i++)
      {
          bcf_info_t *inf = &line->d.info[i];
+        if (  !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
+            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
          if ( inf->vptr_free )
          {
              free(inf->vptr - inf->vptr_off);
@@ -374,6 +389,10 @@ static void init_remove_annots(args_t *args)
          }
          else if ( str.l )
          {
+            int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
+            if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
+            if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
+
              if ( !args->keep_sites )
              {
                  if ( str.s[0]=='#' && str.s[1]=='#' )
@@ -441,6 +460,42 @@ static void init_header_lines(args_t *args)
      if (bcf_hdr_sync(args->hdr) < 0)
          error_errno("[%s] Failed to update input header", __func__);
  }
+static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr);   // string INFO tag hdr_key_src of rec, looked up via tgts_hdr; ptr/mptr and the status are passed straight through
+}
+static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    char *str = *((char**)ptr);     // caller-owned buffer of capacity *mptr, reused between calls
+    int len = strlen(rec->d.id);
+    if ( !str || len >= *mptr ) { str = realloc(str, len+1); *mptr = len+1; }   // grow only; otherwise keep the true capacity
+    if ( !str ) error("Error: could not allocate %d bytes for the ID string\n", len+1);    // never strcpy into a failed allocation
+    strcpy(str, rec->d.id);
+    *((char**)ptr) = str;
+    return len;     // length of the copied ID, excluding the terminating NUL
+}
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)
+{
+    kstring_t buf;                  // wraps the caller's buffer so that kputc/kputs can append to it
+    buf.l = 0;
+    buf.m = *mptr;
+    buf.s = *((char**)ptr);
+
+    if ( !rec->d.n_flt ) kputc('.', &buf);      // no filter IDs on the record
+    else
+    {
+        int iflt;
+        for (iflt=0; iflt<rec->d.n_flt; iflt++)
+        {
+            if ( iflt>0 ) kputc(';', &buf);     // semicolon-separated list of filter names
+            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[iflt]), &buf);
+        }
+    }
+
+    *((char**)ptr) = buf.s;         // hand the buffer and its capacity back to the caller
+    *mptr = buf.m;
+    return buf.l;                   // number of characters written
+}
  static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
@@ -450,24 +505,24 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat
      if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
      hts_expand(int,1,args->mtmpi,args->tmpi);
      args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
-    if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
-    if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+    if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
+    if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
      if ( col->replace!=REPLACE_MISSING )
      {
          bcf_update_filter(args->hdr_out,line,NULL,0);
-        bcf_update_filter(args->hdr_out,line,args->tmpi,1); 
-        return 0; 
+        return bcf_update_filter(args->hdr_out,line,args->tmpi,1); 
      }
      
      // only update missing FILTER
      if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
      if ( !line->d.n_flt )
-        bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+        return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+
      return 0;
  }
  static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
-    int i;
+    int i, ret = 0;
      bcf1_t *rec = (bcf1_t*) data;
      if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
      if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
@@ -478,9 +533,9 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void
          for (i=0; i<rec->d.n_flt; i++)
          {
              const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
-            bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+            if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
          }
-        return 0;
+        return ret;
      }
      hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
      for (i=0; i<rec->d.n_flt; i++)
@@ -489,12 +544,12 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void
          args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
      }
      bcf_update_filter(args->hdr_out,line,NULL,0);
-    bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
-    return 0;
+    return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
  }
  static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
+    if ( col->replace==MATCH_VALUE ) return 0;
  
      // possible cases:
      //      IN  ANNOT   OUT     ACHIEVED_BY
@@ -517,14 +572,28 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  }
  static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
+    if ( col->replace==MATCH_VALUE ) return 0;
+
      bcf1_t *rec = (bcf1_t*) data;
-    if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0;    // don't replace with "."
-    if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
-    if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+    char *id;
+    if ( col->getter )
+    {
+        int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
+        id = (char*) col->ptr;
+        if ( nret<=0 || (nret==1 && *id=='.') ) return 0;   // don't replace with "."
+    }
+    else
+    {
+        if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0;    // don't replace with "."
+        id = rec->d.id;
+    }
+    if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+    if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
  
      // running with +ID, only update missing ids
      if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
-        return bcf_update_id(args->hdr_out,line,rec->d.id);
+        return bcf_update_id(args->hdr_out,line,id);
      return 0;
  }
  static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -535,9 +604,9 @@ static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *da
      als[0] = rec->d.allele[0];
      int i;
      for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
-    bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
+    int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
      free(als);
-    return 0;
+    return ret;
  }
  static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -551,9 +620,9 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da
      const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
      als[0] = line->d.allele[0];
      for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
-    bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
+    int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
      free(als);
-    return 0;
+    return ret;
  }
  static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -627,34 +696,51 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int
  
          args->tmpi2[i] = args->tmpi[ map[i] ];
      }
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
  }
  static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      annot_line_t *tab = (annot_line_t*) data;
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:APPEND for integers
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
      if ( !tab )
      {
-        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
-            error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n");
+        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+             col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+             col->merge_method!=MM_APPEND && 
+             col->merge_method!=MM_APPEND_MISSING )
+            error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
      }
  
      int i,ntmpi = 0;
-    if ( tab )
+    if ( tab )  // has data, not flushing yet
      {
          char *str = tab->cols[col->icol], *end = str;
-        if ( str[0]=='.' && str[1]==0 ) return 0;
+        if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
  
          while ( *end )
          {
-            int val = strtol(str, &end, 10); 
-            if ( end==str )
-                error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
              ntmpi++;
              hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
-            args->tmpi[ntmpi-1] = val;
-            str = end+1;
+            if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+            {
+                if ( col->merge_method==MM_APPEND_MISSING )
+                    args->tmpi[ntmpi-1] = bcf_int32_missing;
+                else
+                    ntmpi--;
+                if ( str[1]==0 ) end = str+1;
+                str += 2;
+            }
+            else
+            {
+                args->tmpi[ntmpi-1] = strtol(str, &end, 10); 
+                if ( end==str )
+                    error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+                str = end+1;
+            }
          }
          if ( col->merge_method!=MM_FIRST )
          {
@@ -667,7 +753,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
              }
              else
              {
-                if ( col->merge_method==MM_APPEND )
+                if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
                  {
                      int nori = col->mm_dbl_nused;
                      col->mm_dbl_nused += ntmpi;
@@ -687,9 +773,10 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
                  }
              }
              col->mm_dbl_ndat++;
+            return 1;
          }
      }
-    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
      {
          ntmpi = col->mm_dbl_nused;
          hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
@@ -713,8 +800,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
      }
  
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
  }
  static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -731,8 +817,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
          if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
      }
  
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
  }
  static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
  {
@@ -763,34 +848,51 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int
  
          args->tmpf2[i] = args->tmpf[ map[i] ];
      }
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
  }
  static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      annot_line_t *tab = (annot_line_t*) data;
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:APPEND for floats
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
      if ( !tab )
      {
-        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
-            error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n");
+        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+             col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+             col->merge_method!=MM_APPEND &&
+             col->merge_method!=MM_APPEND_MISSING )
+            error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
      }
  
      int i,ntmpf = 0;
      if ( tab )
      {
          char *str = tab->cols[col->icol], *end = str;
-        if ( str[0]=='.' && str[1]==0 ) return 0;
+        if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
  
          while ( *end )
          {
-            double val = strtod(str, &end);
-            if ( end==str )
-                error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
              ntmpf++;
              hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
-            args->tmpf[ntmpf-1] = val;
-            str = end+1;
+            if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+            {
+                if ( col->merge_method==MM_APPEND_MISSING ) 
+                    bcf_float_set_missing(args->tmpf[ntmpf-1]);
+                else
+                    ntmpf--;
+                if ( str[1]==0 ) end = str+1;
+                str += 2;
+            }
+            else
+            {
+                args->tmpf[ntmpf-1] = strtod(str, &end);
+                if ( end==str )
+                    error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+                str = end+1;
+            }
          }
          if ( col->merge_method!=MM_FIRST )
          {
@@ -799,17 +901,27 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
                  col->mm_dbl_nused = ntmpf;
                  hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
                  for (i=0; i<ntmpf; i++)
-                    col->mm_dbl[i] = args->tmpf[i];
+                {
+                    if ( bcf_float_is_missing(args->tmpf[i]) )
+                        bcf_double_set_missing(col->mm_dbl[i]);
+                    else
+                        col->mm_dbl[i] = args->tmpf[i];
+                }
              }
              else
              {
-                if ( col->merge_method==MM_APPEND )
+                if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
                  {
                      int nori = col->mm_dbl_nused;
                      col->mm_dbl_nused += ntmpf;
                      hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
                      for (i=0; i<ntmpf; i++)
-                        col->mm_dbl[i+nori] = args->tmpf[i];
+                    {
+                        if ( bcf_float_is_missing(args->tmpf[i]) )
+                            bcf_double_set_missing(col->mm_dbl[i+nori]);
+                        else
+                            col->mm_dbl[i+nori] = args->tmpf[i];
+                    }
                  }
                  else
                  {
@@ -823,13 +935,20 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
                  }
              }
              col->mm_dbl_ndat++;
+            return 1;
          }
      }
-    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
      {
          ntmpf = col->mm_dbl_nused;
          hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
-        for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i];
+        for (i=0; i<ntmpf; i++)
+        {
+            if ( bcf_double_is_missing(col->mm_dbl[i]) )
+                bcf_float_set_missing(args->tmpf[i]);
+            else
+                args->tmpf[i] = col->mm_dbl[i];
+        }
          col->mm_dbl_nused = col->mm_dbl_ndat = 0;
      }
      else if ( col->merge_method==MM_AVG )
@@ -849,8 +968,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
          if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
      }
  
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
  }
  static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -867,8 +985,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo
          if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
      }
  
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
  }
  int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
  static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
@@ -923,10 +1040,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
              if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue;  // value already set
          }
          int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
-        assert( ret==0 );
+        if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n",  __FILE__,__LINE__,__func__);
      }
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
  }
  void khash_str2int_clear_free(void *_hash)
  {
@@ -945,14 +1061,18 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
      }
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:unique for strings
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+
      annot_line_t *tab = (annot_line_t*) data;
-    
+
      int len = 0;
      if ( tab )
      {
          len = strlen(tab->cols[col->icol]);
          if ( !len ) return 0;
-        if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0;
+        if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
      }
  
      if ( col->merge_method!=MM_FIRST )
@@ -962,17 +1082,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
  
          if ( data )
          {
-            assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE );
+            assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
              if ( col->merge_method==MM_UNIQUE )
              {
                  if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
-                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0;
+                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
                  khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
              }
  
              if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
              kputs(tab->cols[col->icol], &col->mm_kstr);
-            return 0;
+            return 1;
          }
  
          if ( col->mm_kstr.l )
@@ -983,12 +1103,10 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          else
              return 0;
  
-        if ( !data )    // flush the line
-        {
-            if ( col->merge_method==MM_UNIQUE )
-                khash_str2int_clear_free(col->mm_str_hash);
-            col->mm_kstr.l = 0;
-        }
+        // flush the line
+        if ( col->merge_method==MM_UNIQUE )
+            khash_str2int_clear_free(col->mm_str_hash);
+        col->mm_kstr.l = 0;
      }
      else
      {
@@ -1000,14 +1118,19 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
              return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
      }
  
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
  }
  static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      bcf1_t *rec = (bcf1_t*) data;
-    int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
-    if ( ntmps < 0 ) return 0;    // nothing to add
+
+    if ( col->getter )
+        col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
+    else
+    {
+        int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
+        if ( ntmps < 0 ) return 0;    // nothing to add
+    }
  
      if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) 
          return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
@@ -1018,8 +1141,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi
          if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
      }
  
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
  }
  static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
  {
@@ -1689,7 +1811,6 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col,
          }
      }
      return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
-
  }
  
  static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -1771,17 +1892,12 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
          // tab annotation file, expecting that all samples are present: sample map not needed
          if ( !src ) return 0;
  
-        int nmatch = 0, order_ok = 1;
+        int nmatch = 0;
          for (i=0; i<bcf_hdr_nsamples(src); i++)
          {
              int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
-            if ( id!=-1 ) 
-            {
-                nmatch++;
-                if ( i!=id ) order_ok = 0;
-            }
+            if ( id!=-1 ) nmatch++;
          }
-        if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0;  // not needed
          if ( !nmatch ) return -1;   // No matching samples found in the source and the destination file
  
          args->nsample_map = bcf_hdr_nsamples(dst);
@@ -1900,11 +2016,45 @@ static void init_columns(args_t *args)
      int need_sample_map = 0;
      int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
  
+    kstring_t tmp = {0,0,0};
+    if ( args->columns_is_file )
+    {
+        int i,n;
+        char **str = hts_readlist(args->columns, args->columns_is_file, &n);
+        if ( !str ) error("Could not parse %s\n", args->columns);
+        for (i=0; i<n; i++)
+        {
+            char *ptr = str[i];
+            while ( *ptr && !isspace(*ptr) ) ptr++;
+            if ( *ptr )
+            {
+                *ptr = 0;
+                ptr++;
+                while ( *ptr && isspace(*ptr) ) ptr++;
+                if ( *ptr )
+                {
+                    if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+                    kputs(str[i],&args->merge_method_str);
+                    kputc(':',&args->merge_method_str);
+                    kputs(ptr,&args->merge_method_str);
+                }
+            }
+            if ( tmp.l ) kputc(',',&tmp);
+            kputs(str[i],&tmp);
+            free(str[i]);
+        }
+        free(str);
+        free(args->columns);
+        args->columns = tmp.s;
+        tmp.l = tmp.m = 0;
+        tmp.s = NULL;
+    }
+
      void *skip_fmt = NULL, *skip_info = NULL;
      if ( args->tgts_is_vcf )
          args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
  
-    kstring_t str = {0,0,0}, tmp = {0,0,0};
+    kstring_t str = {0,0,0};
      char *ss = args->columns, *se = ss;
      args->ncols = 0;
      int icol = -1, has_fmt_str = 0;
@@ -1929,6 +2079,7 @@ static void init_columns(args_t *args)
              {
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->setter = vcf_setter_ref;
                  col->hdr_key_src = strdup(str.s);
                  col->hdr_key_dst = strdup(str.s);
@@ -1941,28 +2092,54 @@ static void init_columns(args_t *args)
              {
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->setter = vcf_setter_alt;
                  col->hdr_key_src = strdup(str.s);
                  col->hdr_key_dst = strdup(str.s);
              }
              else args->alt_idx = icol;
          }
-        else if ( !strcasecmp("ID",str.s) )
+        else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+            if ( str.s[0]=='~' ) replace = MATCH_VALUE;
+            if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
              col->hdr_key_src = strdup(str.s);
              col->hdr_key_dst = strdup(str.s);
+            if ( replace==MATCH_VALUE ) args->match_id = icol;
+        }
+        else if ( !strncasecmp("ID:=",str.s,4) )    // transfer a tag from INFO to ID column
+        {
+            if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
+            if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+            col->setter = vcf_setter_id;
+            col->getter = vcf_getter_info_str2str;
+            str.s[2] = 0;
+            col->hdr_key_dst = strdup(str.s);
+            col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
+            int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
+            if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) 
+                error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
+            if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
+                error("Only Type=String tags can be used to annotate the ID column\n");
          }
          else if ( !strcasecmp("FILTER",str.s) )
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
@@ -1977,7 +2154,7 @@ static void init_columns(args_t *args)
                      bcf_hrec_t *hrec = tgts_hdr->hrec[j];
                      if ( hrec->type!=BCF_HL_FLT ) continue;
                      int k = bcf_hrec_find_key(hrec,"ID");
-                    assert( k>=0 ); // this should always be true for valid VCFs
+                    if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
                      tmp.l = 0;
                      bcf_hrec_format(hrec, &tmp);
                      bcf_hdr_append(args->hdr_out, tmp.s);
@@ -1992,6 +2169,7 @@ static void init_columns(args_t *args)
              if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
@@ -2001,7 +2179,7 @@ static void init_columns(args_t *args)
          else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
-            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
              bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
              int j;
              for (j=0; j<tgts_hdr->nhrec; j++)
@@ -2019,6 +2197,7 @@ static void init_columns(args_t *args)
                  int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->icol = -1;
                  col->replace = replace;
                  col->hdr_key_src = strdup(hrec->vals[k]);
@@ -2054,11 +2233,16 @@ static void init_columns(args_t *args)
                  int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->icol = -1;
                  col->replace = replace;
                  col->hdr_key_src = strdup(hrec->vals[k]);
                  col->hdr_key_dst = strdup(hrec->vals[k]);
-                if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
+                if ( !strcasecmp("GT",col->hdr_key_src) )
+                {
+                    if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+                    col->setter = vcf_setter_format_gt;
+                }
                  else
                      switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
                      {
@@ -2097,9 +2281,10 @@ static void init_columns(args_t *args)
              }
              int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
              if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
-                error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+                error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              if ( !args->tgts_is_vcf )
              {
                  col->icol = icol;
@@ -2110,7 +2295,11 @@ static void init_columns(args_t *args)
              col->replace = replace;
              col->hdr_key_src = strdup(key_src);
              col->hdr_key_dst = strdup(key_dst);
-            if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
+            if ( !strcasecmp("GT",key_src) )
+            {
+                if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+                col->setter = vcf_setter_format_gt;
+            }
              else
                  switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
                  {
@@ -2129,13 +2318,20 @@ static void init_columns(args_t *args)
          else
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
-            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
-            int explicit_info = 0;
+            if ( replace==SET_OR_APPEND )
+            {
+                if ( args->tgts_is_vcf )
+                    error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
+                          "       (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
+                fprintf(stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
+            }
+            int explicit_src_info = 0;
+            int explicit_dst_info = 0;
              char *key_dst;
              if ( !strncasecmp("INFO/",str.s,5) )
              {
                  key_dst = str.s + 5;
-                explicit_info = 1;
+                explicit_dst_info = 1;
              }
              else
                  key_dst = str.s;
@@ -2147,7 +2343,7 @@ static void init_columns(args_t *args)
                  if ( !strncasecmp("INFO/",key_src,5) )
                  {
                      key_src += 5;
-                    explicit_info = 1;
+                    explicit_src_info = 1;
                  }
                  else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
                  {
@@ -2157,38 +2353,65 @@ static void init_columns(args_t *args)
              }
              else
                  key_src = key_dst;
+
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+            col->hdr_key_src = strdup(key_src);
+            col->hdr_key_dst = strdup(key_dst);
+
              int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
              if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
              {
                  if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
                  {
-                    bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
-                    if ( !hrec )
+                    if ( !strcasecmp("ID",key_src) && !explicit_src_info )
                      {
-                        if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
-                            error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
-                    fprintf(stderr,"[%s] %d\n",key_src,explicit_info);
-                        error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname);
+                        // transferring ID column into a new INFO tag
+                        tmp.l = 0;
+                        ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
+                    }
+                    else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
+                    {
+                        // transferring FILTER column into a new INFO tag
+                        tmp.l = 0;
+                        ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
+                    }
+                    else
+                    {
+                        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
+                        if ( !hrec )
+                        {
+                            if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
+                                error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
+                            char *ptr = strchr(key_src,'=');
+                            if ( ptr )
+                            {
+                                *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
+                                error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+                            }
+                            error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+                        }
+                        tmp.l = 0;
+                        bcf_hrec_format_rename(hrec, key_dst, &tmp);
                      }
-                    tmp.l = 0;
-                    bcf_hrec_format_rename(hrec, key_dst, &tmp);
                      bcf_hdr_append(args->hdr_out, tmp.s);
                      if (bcf_hdr_sync(args->hdr_out) < 0)
                          error_errno("[%s] Failed to update header", __func__);
                      hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
                  }
                  else
-                    error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
+                    error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
                  assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
              }
-
-            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
-            annot_col_t *col = &args->cols[args->ncols-1];
-            col->icol = icol;
-            col->replace = replace;
-            col->hdr_key_src = strdup(key_src);
-            col->hdr_key_dst = strdup(key_dst);
-            col->number  = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+            if  ( args->tgts_is_vcf )
+            {
+                if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
+                else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
+            }
+            col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
              switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
              {
                  case BCF_HT_FLAG:   col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
@@ -2197,6 +2420,18 @@ static void init_columns(args_t *args)
                  case BCF_HT_STR:    col->setter = args->tgts_is_vcf ? vcf_setter_info_str  : setter_info_str; break;
                  default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
              }
+            if ( replace==SET_OR_APPEND )   // change to Number=.
+            {
+                bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
+                if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
+                hrec = bcf_hrec_dup(hrec);
+                int j = bcf_hrec_find_key(hrec, "Number");
+                if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
+                free(hrec->vals[j]);
+                hrec->vals[j] = strdup(".");
+                bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
+                bcf_hdr_add_hrec(args->hdr_out, hrec);
+            }
          }
          if ( !*se ) break;
          ss = ++se;
@@ -2232,10 +2467,10 @@ static void init_merge_method(args_t *args)
          args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
          memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
      }
-    if ( !args->merge_method_str ) return;
+    if ( !args->merge_method_str.l ) return;
      if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
-    if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n");
-    char *sb = args->merge_method_str;
+    if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
+    char *sb = args->merge_method_str.s;
      while ( *sb )
      {
          char *se = sb;
@@ -2246,21 +2481,27 @@ static void init_merge_method(args_t *args)
          char *mm_type_str = args->tmpks.s + args->tmpks.l;
          while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
          if ( *mm_type_str!=':' )
-            error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str);
+            error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
          *mm_type_str = 0;
          mm_type_str++;
          int mm_type = MM_FIRST;
          if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
+        else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
          else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
+        else if ( !strcasecmp("append-missing",mm_type_str) )
+        {
+            mm_type = MM_APPEND_MISSING;
+            if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
+        }
          else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
          else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
          else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
          else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
-        else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str);
+        else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
          for (i=0; i<args->ncols; i++)
          {
              if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
-            if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR )
+            if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
                  error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
              args->cols[i].merge_method = mm_type;
              break;
@@ -2268,6 +2509,20 @@ static void init_merge_method(args_t *args)
          if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
          sb = *se ? se + 1 : se;
      }
+    if ( args->has_append_mode )
+    {
+        // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
+        args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
+        int ncol = 0;
+        for (i=0; i<args->ncols; i++)
+            if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
+        if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
+        args->aline_missing->mcols = ncol;
+        args->aline_missing->ncols = ncol;
+        args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
+        for (i=0; i<ncol; i++)
+            args->aline_missing->cols[i] = strdup(".");
+    }
  }
  
  static void rename_chrs(args_t *args, char *fname)
@@ -2299,6 +2554,42 @@ static void rename_chrs(args_t *args, char *fname)
      free(map);
  }
  
+static void rename_annots(args_t *args, char *fname)
+{
+    int n, i;
+    char **map = hts_readlist(fname, 1, &n);
+    if ( !map ) error("Could not read: %s\n", fname);
+    for (i=0; i<n; i++)
+    {
+        char *sb = NULL, *ss = map[i];
+        while ( *ss && !isspace(*ss) ) ss++;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+        *ss = 0;
+        int type;
+        if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
+        else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
+        else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
+        else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
+        else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
+        int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
+        if ( id<0 ) continue;
+        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
+        if ( !hrec ) continue;  // the sequence not present
+        int j = bcf_hrec_find_key(hrec, "ID");
+        assert( j>=0 );
+        free(hrec->vals[j]);
+        ss++;
+        while ( *ss && isspace(*ss) ) ss++;
+        char *se = ss;
+        while ( *se && !isspace(*se) ) se++;
+        *se = 0;
+        hrec->vals[j] = strdup(ss);
+        args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
+    }
+    for (i=0; i<n; i++) free(map[i]);
+    free(map);
+}
+
  static void init_data(args_t *args)
  {
      args->hdr = args->files->readers[0].header;
@@ -2311,6 +2602,7 @@ static void init_data(args_t *args)
          // reading annots from a VCF
          if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
              error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+        args->tgts_hdr = args->files->readers[1].header;
      }
      if ( args->columns ) init_columns(args);
      if ( args->targets_fname && !args->tgts_is_vcf )
@@ -2318,8 +2610,8 @@ static void init_data(args_t *args)
          if ( !args->columns ) error("The -c option not given\n");
          if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
          if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
-        if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
-        if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) )
+        if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
+        if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
          {
              args->end_idx = -args->beg_idx - 1;
              args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
@@ -2363,8 +2655,9 @@ static void init_data(args_t *args)
      if ( !args->drop_header )
      {
          if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+        if ( args->rename_annots ) rename_annots(args, args->rename_annots);
  
-        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
          if ( args->n_threads )
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
@@ -2386,8 +2679,15 @@ static void destroy_data(args_t *args)
          free(args->cols[i].mm_kstr.s);
          if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
          free(args->cols[i].mm_dbl);
+        free(args->cols[i].ptr);
      }
      free(args->cols);
+    if ( args->aline_missing )
+    {
+        for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
+        free(args->aline_missing->cols);
+        free(args->aline_missing);
+    }
      for (i=0; i<args->malines; i++)
      {
          free(args->alines[i].cols);
@@ -2395,6 +2695,7 @@ static void destroy_data(args_t *args)
          free(args->alines[i].line.s);
      }
      free(args->alines);
+    free(args->srt_alines);
      if ( args->tgt_idx )
      {
          regidx_destroy(args->tgt_idx);
@@ -2420,6 +2721,7 @@ static void destroy_data(args_t *args)
          filter_destroy(args->filter);
      if (args->out_fh) hts_close(args->out_fh);
      free(args->sample_map);
+    free(args->merge_method_str.s);
  }
  
  static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
@@ -2483,7 +2785,6 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
          }
          else i++;
      }
-
      if ( args->ref_idx==-1 && args->nalines ) return;
  
      while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
@@ -2504,6 +2805,36 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
      }
  }
  
+// search string in semicolon separated strings (xx vs aa;bb)
+static int str_match(char *needle, char *haystack)
+{
+    int len = strlen(needle);
+    char *ptr = haystack;
+    while ( *ptr && (ptr=strstr(ptr,needle)) )
+    {
+        if ( ptr[len]!=0 && ptr[len]!=';' ) ptr++;          // a prefix, not a match
+        else if ( ptr==haystack || ptr[-1]==';' ) return 1; // a match
+        ptr++;  // a suffix, not a match
+    }
+    return 0;
+}
+// search common string in semicolon separated strings (xx;yy;zz vs aa;bb)
+static int strstr_match(char *a, char *b)
+{
+    char *beg = a;
+    while ( *beg )
+    {
+        char *end = beg;
+        while ( *end && *end!=';' ) end++;
+        char tmp = *end;
+        if ( *end==';' ) *end = 0;
+        int ret = str_match(beg,b);
+        *end = tmp;
+        if ( ret || !*end ) return ret;
+        beg = end + 1;
+    }
+    return 0;
+}
  static void annotate(args_t *args, bcf1_t *line)
  {
      int i, j;
@@ -2511,9 +2842,9 @@ static void annotate(args_t *args, bcf1_t *line)
          args->rm[i].handler(args, line, &args->rm[i]);
  
      int has_overlap = 0;
-
      if ( args->tgt_idx )
      {
+        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
          if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
          {
              while ( regitr_overlap(args->tgt_itr) )
@@ -2524,49 +2855,145 @@ static void annotate(args_t *args, bcf1_t *line)
                  tmp->end   = args->tgt_itr->end;
                  parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
                  for (j=0; j<args->ncols; j++)
-                    if ( args->cols[j].setter(args,line,&args->cols[j],tmp) )
+                {
+                    if ( args->cols[j].done==1 ) continue;
+                    int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+                    if ( ret < 0 )
                          error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                    if ( ret==0 )
+                        args->cols[j].done = 1;
+                }
              }
              has_overlap = 1;
          }
          for (j=0; j<args->ncols; j++)
-            if ( args->cols[j].merge_method != MM_FIRST )
-                args->cols[j].setter(args,line,&args->cols[j],NULL);
+        {
+            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+            if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+        }
      }
      else if ( args->tgts )
      {
-        // Buffer annotation lines. When multiple ALT alleles are present in the
-        // annotation file, at least one must match one of the VCF alleles.
-        int len = 0;
-        bcf_get_variant_types(line);
-        for (i=1; i<line->n_allele; i++)
-            if ( len > line->d.var[i].n ) len = line->d.var[i].n;
-        int end_pos = len<0 ? line->pos - len : line->pos;
+        // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+        // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+        // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+        // for an ALT, missing value is appended instead.
+        int end_pos = line->pos + line->rlen - 1;
          buffer_annot_lines(args, line, line->pos, end_pos);
+
+        args->nsrt_alines = 0;
+        hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+        if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+            error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+
+        // Find matching lines
          for (i=0; i<args->nalines; i++)
          {
              if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
-            if ( args->ref_idx != -1 )
+            if ( args->ref_idx != -1 )  // REF+ALT matching requested
              {
-                if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs not compatible
+                if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
                  for (j=1; j<args->alines[i].nals; j++)
                  {
-                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break;   // no ALT allele in VCF and annot file has "."
-                    if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+                    int ialt;
+                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
+                        ialt = 0;
+                    else
+                    {
+                        ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+                        if ( ialt < 0 ) continue;
+                        ialt++;
+                    }
+                    if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+                    args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+                    has_overlap = 1;
+                    break;
                  }
-                if ( j==args->alines[i].nals ) continue;    // none of the annot alleles present in VCF's ALT
              }
-            break;
+            else    // overlap, REF+ALT matching not requested
+            {
+                args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+                has_overlap = 1;
+            }
          }
-
-        if ( i<args->nalines )
+        // Sort lines if needed
+        if ( args->has_append_mode )
+        {
+            // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+            uint32_t tmp;
+            for (i=1; i<args->nsrt_alines; i++)
+                for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+                    tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+        }
+        // Annotate
+        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+        int ialt_exp = 1;
+        for (i=0; i<args->nsrt_alines; i++)
          {
-            // there is a matching line
+            int ialt = args->srt_alines[i] >> 16;
+            int ilin = args->srt_alines[i] & 0xffff;
+            if ( args->has_append_mode )
+            {
+                if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
+                if ( ialt_exp < ialt )
+                {
+                    // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+                    while ( ialt_exp++ < ialt )
+                    {
+                        for (j=0; j<args->ncols; j++)
+                        {
+                            if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                            if ( args->cols[j].done==1 ) continue;
+                            int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                            if ( ret < 0 )
+                                error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                            if ( ret==0 )
+                                args->cols[j].done = 1;
+                        }
+                    }
+                }
+            }
              for (j=0; j<args->ncols; j++)
-                if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+            {
+                if ( args->cols[j].done==1 ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+                if ( ret < 0 )
                      error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                if ( ret==0 )
+                    args->cols[j].done = 1;
+            }
+            ialt_exp = ialt + 1;
+        }
+        if ( args->nsrt_alines )
+        {
+            // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+            // record was found. Otherwise the row will be left without annotation.
+            if ( args->has_append_mode && ialt_exp < line->n_allele )
+            {
+                while ( ialt_exp++ < line->n_allele )
+                {
+                    for (j=0; j<args->ncols; j++)
+                    {
+                        if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                        if ( args->cols[j].done==1 ) continue;
+                        int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                        if ( ret < 0 )
+                            error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                        if ( ret==0 )
+                            args->cols[j].done = 1;
+                    }
+                }
+            }
+            // Flush
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+                if ( ret < 0 )
+                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            }
          }
-        has_overlap = i<args->nalines ? 1 : 0;
      }
      else if ( args->files->nreaders == 2 )
      {
@@ -2611,28 +3038,30 @@ static void usage(args_t *args)
      fprintf(stderr, "Usage:   bcftools annotate [options] <in.vcf.gz>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "   -a, --annotations <file>       VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
-    fprintf(stderr, "       --collapse <string>        matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
-    fprintf(stderr, "   -c, --columns <list>           list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
-    fprintf(stderr, "   -e, --exclude <expr>           exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "       --force                    continue despite parsing error (at your own risk!)\n");
-    fprintf(stderr, "   -h, --header-lines <file>      lines which should be appended to the VCF header\n");
-    fprintf(stderr, "   -I, --set-id [+]<format>       set ID column, see man page for details\n");
-    fprintf(stderr, "   -i, --include <expr>           select sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "   -k, --keep-sites               leave -i/-e sites unchanged instead of discarding them\n");
-    fprintf(stderr, "   -l, --merge-logic <tag:type>   merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
-    fprintf(stderr, "   -m, --mark-sites [+-]<tag>     add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
-    fprintf(stderr, "       --no-version               do not append version and command line to the header\n");
-    fprintf(stderr, "   -o, --output <file>            write output to a file [standard output]\n");
-    fprintf(stderr, "   -O, --output-type <b|u|z|v>    b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
-    fprintf(stderr, "   -r, --regions <region>         restrict to comma-separated list of regions\n");
-    fprintf(stderr, "   -R, --regions-file <file>      restrict to regions listed in a file\n");
-    fprintf(stderr, "       --rename-chrs <file>       rename sequences according to map file: from\\tto\n");
-    fprintf(stderr, "   -s, --samples [^]<list>        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(stderr, "   -S, --samples-file [^]<file>   file of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(stderr, "       --single-overlaps          keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
-    fprintf(stderr, "   -x, --remove <list>            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
-    fprintf(stderr, "       --threads <int>            number of extra output compression threads [0]\n");
+    fprintf(stderr, "   -a, --annotations FILE       VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+    fprintf(stderr, "       --collapse STR           matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+    fprintf(stderr, "   -c, --columns LIST           list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+    fprintf(stderr, "   -C, --columns-file FILE      read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+    fprintf(stderr, "   -e, --exclude EXPR           exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "       --force                  continue despite parsing error (at your own risk!)\n");
+    fprintf(stderr, "   -h, --header-lines FILE      lines which should be appended to the VCF header\n");
+    fprintf(stderr, "   -I, --set-id [+]FORMAT       set ID column using a `bcftools query`-like expression, see man page for details\n");
+    fprintf(stderr, "   -i, --include EXPR           select sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "   -k, --keep-sites             leave -i/-e sites unchanged instead of discarding them\n");
+    fprintf(stderr, "   -l, --merge-logic TAG:TYPE   merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+    fprintf(stderr, "   -m, --mark-sites [+-]TAG     add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+    fprintf(stderr, "       --no-version             do not append version and command line to the header\n");
+    fprintf(stderr, "   -o, --output FILE            write output to a file [standard output]\n");
+    fprintf(stderr, "   -O, --output-type [b|u|z|v]  b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+    fprintf(stderr, "   -r, --regions REGION         restrict to comma-separated list of regions\n");
+    fprintf(stderr, "   -R, --regions-file FILE      restrict to regions listed in FILE\n");
+    fprintf(stderr, "       --rename-annots FILE     rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+    fprintf(stderr, "       --rename-chrs FILE       rename sequences according to the mapping: old\\tnew\n");
+    fprintf(stderr, "   -s, --samples [^]LIST        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+    fprintf(stderr, "   -S, --samples-file [^]FILE   file of samples to annotate (or exclude with \"^\" prefix)\n");
+    fprintf(stderr, "       --single-overlaps        keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+    fprintf(stderr, "   -x, --remove LIST            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+    fprintf(stderr, "       --threads INT            number of extra output compression threads [0]\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -2649,6 +3078,7 @@ int main_vcfannotate(int argc, char *argv[])
      args->record_cmd_line = 1;
      args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
      args->set_ids_replace = 1;
+    args->match_id = -1;
      int regions_is_file = 0, collapse = 0;
  
      static struct option loptions[] =
@@ -2667,7 +3097,9 @@ int main_vcfannotate(int argc, char *argv[])
          {"regions",required_argument,NULL,'r'},
          {"regions-file",required_argument,NULL,'R'},
          {"remove",required_argument,NULL,'x'},
+        {"columns-file",required_argument,NULL,'C'},
          {"columns",required_argument,NULL,'c'},
+        {"rename-annots",required_argument,NULL,11},
          {"rename-chrs",required_argument,NULL,1},
          {"header-lines",required_argument,NULL,'h'},
          {"samples",required_argument,NULL,'s'},
@@ -2677,7 +3109,7 @@ int main_vcfannotate(int argc, char *argv[])
          {"force",no_argument,NULL,'f'},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
      {
          switch (c) {
              case 'f': args->force = 1; break;
@@ -2688,11 +3120,15 @@ int main_vcfannotate(int argc, char *argv[])
                  else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
                  else args->mark_sites = optarg; 
                  break;
-            case 'l': args->merge_method_str = optarg; break;
+            case 'l': 
+                if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+                kputs(optarg,&args->merge_method_str);
+                break;
              case 'I': args->set_ids_fmt = optarg; break;
              case 's': args->sample_names = optarg; break;
              case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
              case 'c': args->columns = strdup(optarg); break;
+            case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  switch (optarg[0]) {
@@ -2703,8 +3139,12 @@ int main_vcfannotate(int argc, char *argv[])
                      default: error("The output type \"%s\" not recognised\n", optarg);
                  };
                  break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'x': args->remove_annots = optarg; break;
              case 'a': args->targets_fname = optarg; break;
              case 'r': args->regions_list = optarg; break;
@@ -2724,6 +3164,7 @@ int main_vcfannotate(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case 10 : args->single_overlaps = 1; break;
+            case 11 : args->rename_annots = optarg; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c

index e9d31bf636ad24e1b4d1aac4601e109c02ff3288..b7e707bfa7a31865d0773da1ef227913e8dc370a 100644 (file)
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -28,6 +28,7 @@ THE SOFTWARE.  */
  #include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -72,6 +73,7 @@ annot_line_t;
  #define REPLACE_ALL      1      // replace both missing and existing values
  #define REPLACE_NON_MISSING 2   // replace only if tgt is not missing
  #define SET_OR_APPEND    3      // set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE      4      // do not set, just match the value -c ~ID
  #define MM_FIRST   0    // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
  #define MM_APPEND  1    // append, possibly multiple times
  #define MM_UNIQUE  2    // append, only unique values
@@ -79,19 +81,26 @@ annot_line_t;
  #define MM_AVG     4
  #define MM_MIN     5
  #define MM_MAX     6
+#define MM_APPEND_MISSING 7     // missing values will be transferred as well
  typedef struct _annot_col_t
  {
      int icol, replace, number;  // number: one of BCF_VL_* types
      char *hdr_key_src, *hdr_key_dst;
-    int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
+    // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors,
+    // or 1 on (repeated partial updates) concluded with a src=NULL call
+    int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t
+    int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr);
      int merge_method;               // one of the MM_* defines
      khash_t(str2int) *mm_str_hash;  // lookup table to ensure uniqueness of added string values
      kstring_t mm_kstr;
-    double
+    size_t
          mm_dbl_nalloc,  // the allocated size --merge-logic values array
          mm_dbl_nused,   // the number of used elements in the mm_dbl array
-        mm_dbl_ndat,    // the number of merged rows (for calculating the average)
+        mm_dbl_ndat;    // the number of merged rows (for calculating the average)
+    double
          *mm_dbl;
+    void *ptr;
+    int mptr, done;
  }
  annot_col_t;
  
@@ -105,12 +114,12 @@ annot_col_t;
  typedef struct _args_t
  {
      bcf_srs_t *files;
-    bcf_hdr_t *hdr, *hdr_out;
+    bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
      htsFile *out_fh;
      int output_type, n_threads;
      bcf_sr_regions_t *tgts;
  
-    regidx_t *tgt_idx;
+    regidx_t *tgt_idx;  // keep everything in memory only with .tab annotation file and -c BEG,END columns
      regitr_t *tgt_itr;
      int tgt_is_bed;
  
@@ -125,10 +134,13 @@ typedef struct _args_t
  
      vcmp_t *vcmp;           // for matching annotation and VCF lines by allele
      annot_line_t *alines;   // buffered annotation lines
-    int nalines, malines;
+    annot_line_t *aline_missing;
+    uint32_t *srt_alines;   // sorted indexes ((iALT<<16) | iAline)
+    int nalines, malines, nsrt_alines, msrt_alines;
      int ref_idx, alt_idx, chr_idx, beg_idx, end_idx;   // -1 if not present
      annot_col_t *cols;      // column indexes and setters
      int ncols;
+    int match_id;           // set iff `-c ~ID` given
  
      char *set_ids_fmt;
      convert_t *set_ids;
@@ -146,9 +158,10 @@ typedef struct _args_t
      kstring_t tmpks;
  
      char **argv, *output_fname, *targets_fname, *regions_list, *header_fname;
-    char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites;
-    char *merge_method_str;
+    char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites;
+    kstring_t merge_method_str;
      int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps;
+    int columns_is_file, has_append_mode;
  }
  args_t;
  
@@ -197,6 +210,8 @@ void remove_info(args_t *args, bcf1_t *line, rm_tag_t *tag)
      for (i=0; i<line->n_info; i++)
      {
          bcf_info_t *inf = &line->d.info[i];
+        if (  !strcmp("END",bcf_hdr_int2id(args->hdr,BCF_DT_ID,inf->key)) )
+            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
          if ( inf->vptr_free )
          {
              free(inf->vptr - inf->vptr_off);
@@ -376,6 +391,10 @@ static void init_remove_annots(args_t *args)
          }
          else if ( str.l )
          {
+            int id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, str.s);
+            if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,id) ) error("Error: did you mean INFO/%s?\n",str.s);
+            if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: did you mean FORMAT/%s?\n",str.s);
+
              if ( !args->keep_sites )
              {
                  if ( str.s[0]=='#' && str.s[1]=='#' )
@@ -443,6 +462,42 @@ static void init_header_lines(args_t *args)
      if (bcf_hdr_sync(args->hdr) < 0)
          error_errno("[%s] Failed to update input header", __func__);
  }
+static int vcf_getter_info_str2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)  // getter: fetch INFO/col->hdr_key_src of rec as a string into *ptr (capacity *mptr)
+{
+    return bcf_get_info_string(args->tgts_hdr,rec,col->hdr_key_src,ptr,mptr); 
+}
+static int vcf_getter_id2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)  // getter: copy rec->d.id into the buffer *ptr, returns the string length
+{
+    char *str = *((char**)ptr);
+    int len = strlen(rec->d.id);
+    if ( len >= *mptr ) str = realloc(str, len+1);  // grow when the ID plus its NUL does not fit; NOTE(review): the realloc result is not checked
+    strcpy(str, rec->d.id);
+    *((char**)ptr) = str;
+    *mptr = len+1;  // overwritten on every call, also when no realloc took place
+    return len;
+}
+static int vcf_getter_filter2str(args_t *args, bcf1_t *rec, annot_col_t *col, void **ptr, int *mptr)  // getter: write the FILTER names of rec, joined by ';', into *ptr; returns the string length
+{
+    kstring_t str;      // wraps the caller's buffer so that kputc/kputs can append to it
+    str.s = *((char**)ptr);
+    str.m = *mptr;
+    str.l = 0;          // start writing at the beginning of the buffer
+
+    int i;
+    if ( rec->d.n_flt )
+    {
+        for (i=0; i<rec->d.n_flt; i++)
+        {
+            if (i) kputc(';', &str);
+            kputs(bcf_hdr_int2id(args->tgts_hdr,BCF_DT_ID,rec->d.flt[i]), &str);    // filter id translated to its name via the annotation file header
+        }
+    }
+    else kputc('.', &str);  // no filters set in rec: emit "."
+
+    *((char**)ptr) = str.s;     // hand the buffer and its size back to the caller
+    *mptr = str.m;
+    return str.l;
+}
  static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n");
@@ -452,24 +507,24 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat
      if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
      hts_expand(int,1,args->mtmpi,args->tmpi);
      args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
-    if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]);
-    if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; }
+    if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
+    if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
      if ( col->replace!=REPLACE_MISSING )
      {
          bcf_update_filter(args->hdr_out,line,NULL,0);
-        bcf_update_filter(args->hdr_out,line,args->tmpi,1); 
-        return 0; 
+        return bcf_update_filter(args->hdr_out,line,args->tmpi,1); 
      }
      
      // only update missing FILTER
      if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
      if ( !line->d.n_flt )
-        bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+        return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
+
      return 0;
  }
  static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
-    int i;
+    int i, ret = 0;
      bcf1_t *rec = (bcf1_t*) data;
      if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
      if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
@@ -480,9 +535,9 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void
          for (i=0; i<rec->d.n_flt; i++)
          {
              const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
-            bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt));
+            if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1;
          }
-        return 0;
+        return ret;
      }
      hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi);
      for (i=0; i<rec->d.n_flt; i++)
@@ -491,12 +546,12 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void
          args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt);
      }
      bcf_update_filter(args->hdr_out,line,NULL,0);
-    bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
-    return 0;
+    return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
  }
  static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
+    if ( col->replace==MATCH_VALUE ) return 0;
  
      // possible cases:
      //      IN  ANNOT   OUT     ACHIEVED_BY
@@ -519,14 +574,28 @@ static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  }
  static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
+    if ( col->replace==MATCH_VALUE ) return 0;
+
      bcf1_t *rec = (bcf1_t*) data;
-    if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0;    // don't replace with "."
-    if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,rec->d.id);
-    if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,rec->d.id);
+
+    char *id;
+    if ( col->getter )
+    {
+        int nret = col->getter(args,rec,col,&col->ptr,&col->mptr);
+        id = (char*) col->ptr;
+        if ( nret<=0 || (nret==1 && *id=='.') ) return 0;   // don't replace with "."
+    }
+    else
+    {
+        if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0;    // don't replace with "."
+        id = rec->d.id;
+    }
+    if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+    if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
  
      // running with +ID, only update missing ids
      if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
-        return bcf_update_id(args->hdr_out,line,rec->d.id);
+        return bcf_update_id(args->hdr_out,line,id);
      return 0;
  }
  static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -537,9 +606,9 @@ static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *da
      als[0] = rec->d.allele[0];
      int i;
      for (i=1; i<line->n_allele; i++) als[i] = line->d.allele[i];
-    bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
+    int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele);
      free(als);
-    return 0;
+    return ret;
  }
  static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -553,9 +622,9 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da
      const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele);
      als[0] = line->d.allele[0];
      for (i=1; i<rec->n_allele; i++) als[i] = rec->d.allele[i];
-    bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
+    int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele);
      free(als);
-    return 0;
+    return ret;
  }
  static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -629,34 +698,51 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int
  
          args->tmpi2[i] = args->tmpi[ map[i] ];
      }
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
  }
  static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      annot_line_t *tab = (annot_line_t*) data;
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:APPEND for integers
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
      if ( !tab )
      {
-        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
-            error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n");
+        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+             col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+             col->merge_method!=MM_APPEND && 
+             col->merge_method!=MM_APPEND_MISSING )
+            error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n");
      }
  
      int i,ntmpi = 0;
-    if ( tab )
+    if ( tab )  // has data, not flushing yet
      {
          char *str = tab->cols[col->icol], *end = str;
-        if ( str[0]=='.' && str[1]==0 ) return 0;
+        if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
  
          while ( *end )
          {
-            int val = strtol(str, &end, 10); 
-            if ( end==str )
-                error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
              ntmpi++;
              hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
-            args->tmpi[ntmpi-1] = val;
-            str = end+1;
+            if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+            {
+                if ( col->merge_method==MM_APPEND_MISSING )
+                    args->tmpi[ntmpi-1] = bcf_int32_missing;
+                else
+                    ntmpi--;
+                if ( str[1]==0 ) end = str+1;
+                str += 2;
+            }
+            else
+            {
+                args->tmpi[ntmpi-1] = strtol(str, &end, 10); 
+                if ( end==str )
+                    error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+                str = end+1;
+            }
          }
          if ( col->merge_method!=MM_FIRST )
          {
@@ -669,7 +755,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
              }
              else
              {
-                if ( col->merge_method==MM_APPEND )
+                if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
                  {
                      int nori = col->mm_dbl_nused;
                      col->mm_dbl_nused += ntmpi;
@@ -689,9 +775,10 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
                  }
              }
              col->mm_dbl_ndat++;
+            return 1;
          }
      }
-    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
      {
          ntmpi = col->mm_dbl_nused;
          hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
@@ -715,8 +802,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
      }
  
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
  }
  static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -733,8 +819,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi
          if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
      }
  
-    bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
-    return 0;
+    return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
  }
  static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
  {
@@ -765,34 +850,51 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int
  
          args->tmpf2[i] = args->tmpf[ map[i] ];
      }
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
  }
  static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      annot_line_t *tab = (annot_line_t*) data;
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:APPEND for floats
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+
      if ( !tab )
      {
-        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND )
-            error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n");
+        if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG &&
+             col->merge_method!=MM_MIN && col->merge_method!=MM_MAX &&
+             col->merge_method!=MM_APPEND &&
+             col->merge_method!=MM_APPEND_MISSING )
+            error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n");
      }
  
      int i,ntmpf = 0;
      if ( tab )
      {
          char *str = tab->cols[col->icol], *end = str;
-        if ( str[0]=='.' && str[1]==0 ) return 0;
+        if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
  
          while ( *end )
          {
-            double val = strtod(str, &end);
-            if ( end==str )
-                error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
              ntmpf++;
              hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
-            args->tmpf[ntmpf-1] = val;
-            str = end+1;
+            if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
+            {
+                if ( col->merge_method==MM_APPEND_MISSING ) 
+                    bcf_float_set_missing(args->tmpf[ntmpf-1]);
+                else
+                    ntmpf--;
+                if ( str[1]==0 ) end = str+1;
+                str += 2;
+            }
+            else
+            {
+                args->tmpf[ntmpf-1] = strtod(str, &end);
+                if ( end==str )
+                    error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]);
+                str = end+1;
+            }
          }
          if ( col->merge_method!=MM_FIRST )
          {
@@ -801,17 +903,27 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
                  col->mm_dbl_nused = ntmpf;
                  hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
                  for (i=0; i<ntmpf; i++)
-                    col->mm_dbl[i] = args->tmpf[i];
+                {
+                    if ( bcf_float_is_missing(args->tmpf[i]) )
+                        bcf_double_set_missing(col->mm_dbl[i]);
+                    else
+                        col->mm_dbl[i] = args->tmpf[i];
+                }
              }
              else
              {
-                if ( col->merge_method==MM_APPEND )
+                if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
                  {
                      int nori = col->mm_dbl_nused;
                      col->mm_dbl_nused += ntmpf;
                      hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
                      for (i=0; i<ntmpf; i++)
-                        col->mm_dbl[i+nori] = args->tmpf[i];
+                    {
+                        if ( bcf_float_is_missing(args->tmpf[i]) )
+                            bcf_double_set_missing(col->mm_dbl[i+nori]);
+                        else
+                            col->mm_dbl[i+nori] = args->tmpf[i];
+                    }
                  }
                  else
                  {
@@ -825,13 +937,20 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
                  }
              }
              col->mm_dbl_ndat++;
+            return 1;
          }
      }
-    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND )
+    else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING )
      {
          ntmpf = col->mm_dbl_nused;
          hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf);
-        for (i=0; i<ntmpf; i++) args->tmpf[i] = col->mm_dbl[i];
+        for (i=0; i<ntmpf; i++)
+        {
+            if ( bcf_double_is_missing(col->mm_dbl[i]) )
+                bcf_float_set_missing(args->tmpf[i]);
+            else
+                args->tmpf[i] = col->mm_dbl[i];
+        }
          col->mm_dbl_nused = col->mm_dbl_ndat = 0;
      }
      else if ( col->merge_method==MM_AVG )
@@ -851,8 +970,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *
          if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
      }
  
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
  }
  static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
@@ -869,8 +987,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo
          if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
      }
  
-    bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
-    return 0;
+    return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
  }
  int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
  static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als)
@@ -925,10 +1042,9 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in
              if ( str[0]!='.' || (str[1]!=',' && str[1]!=0) ) continue;  // value already set
          }
          int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
-        assert( ret==0 );
+        if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n",  __FILE__,__LINE__,__func__);
      }
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
  }
  void khash_str2int_clear_free(void *_hash)
  {
@@ -947,14 +1063,18 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
      }
  
+    // This is a bit hacky, only to reuse existing code with minimal changes:
+    //      -c =TAG will now behave as -l TAG:unique for strings
+    if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+
      annot_line_t *tab = (annot_line_t*) data;
-    
+
      int len = 0;
      if ( tab )
      {
          len = strlen(tab->cols[col->icol]);
          if ( !len ) return 0;
-        if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0;
+        if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
      }
  
      if ( col->merge_method!=MM_FIRST )
@@ -964,17 +1084,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
  
          if ( data )
          {
-            assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE );
+            assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE );
              if ( col->merge_method==MM_UNIQUE )
              {
                  if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init();
-                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0;
+                if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1;
                  khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
              }
  
              if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
              kputs(tab->cols[col->icol], &col->mm_kstr);
-            return 0;
+            return 1;
          }
  
          if ( col->mm_kstr.l )
@@ -985,12 +1105,10 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
          else
              return 0;
  
-        if ( !data )    // flush the line
-        {
-            if ( col->merge_method==MM_UNIQUE )
-                khash_str2int_clear_free(col->mm_str_hash);
-            col->mm_kstr.l = 0;
-        }
+        // flush the line
+        if ( col->merge_method==MM_UNIQUE )
+            khash_str2int_clear_free(col->mm_str_hash);
+        col->mm_kstr.l = 0;
      }
      else
      {
@@ -1002,14 +1120,19 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d
              return setter_ARinfo_string(args,line,col,tab->nals,tab->als);
      }
  
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
  }
  static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
  {
      bcf1_t *rec = (bcf1_t*) data;
-    int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
-    if ( ntmps < 0 ) return 0;    // nothing to add
+
+    if ( col->getter )
+        col->getter(args,rec,col,(void**)&args->tmps, &args->mtmps);
+    else
+    {
+        int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
+        if ( ntmps < 0 ) return 0;    // nothing to add
+    }
  
      if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) 
          return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
@@ -1020,8 +1143,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi
          if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
      }
  
-    bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
-    return 0;
+    return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
  }
  static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str)
  {
@@ -1691,7 +1813,6 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col,
          }
      }
      return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
-
  }
  
  static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -1773,17 +1894,12 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
          // tab annotation file, expecting that all samples are present: sample map not needed
          if ( !src ) return 0;
  
-        int nmatch = 0, order_ok = 1;
+        int nmatch = 0;
          for (i=0; i<bcf_hdr_nsamples(src); i++)
          {
              int id = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, src->samples[i]);
-            if ( id!=-1 ) 
-            {
-                nmatch++;
-                if ( i!=id ) order_ok = 0;
-            }
+            if ( id!=-1 ) nmatch++;
          }
-        if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0;  // not needed
          if ( !nmatch ) return -1;   // No matching samples found in the source and the destination file
  
          args->nsample_map = bcf_hdr_nsamples(dst);
@@ -1902,11 +2018,45 @@ static void init_columns(args_t *args)
      int need_sample_map = 0;
      int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
  
+    kstring_t tmp = {0,0,0};
+    if ( args->columns_is_file )
+    {
+        int i,n;
+        char **str = hts_readlist(args->columns, args->columns_is_file, &n);
+        if ( !str ) error("Could not parse %s\n", args->columns);
+        for (i=0; i<n; i++)
+        {
+            char *ptr = str[i];
+            while ( *ptr && !isspace(*ptr) ) ptr++;
+            if ( *ptr )
+            {
+                *ptr = 0;
+                ptr++;
+                while ( *ptr && isspace(*ptr) ) ptr++;
+                if ( *ptr )
+                {
+                    if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+                    kputs(str[i],&args->merge_method_str);
+                    kputc(':',&args->merge_method_str);
+                    kputs(ptr,&args->merge_method_str);
+                }
+            }
+            if ( tmp.l ) kputc(',',&tmp);
+            kputs(str[i],&tmp);
+            free(str[i]);
+        }
+        free(str);
+        free(args->columns);
+        args->columns = tmp.s;
+        tmp.l = tmp.m = 0;
+        tmp.s = NULL;
+    }
+
      void *skip_fmt = NULL, *skip_info = NULL;
      if ( args->tgts_is_vcf )
          args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
  
-    kstring_t str = {0,0,0}, tmp = {0,0,0};
+    kstring_t str = {0,0,0};
      char *ss = args->columns, *se = ss;
      args->ncols = 0;
      int icol = -1, has_fmt_str = 0;
@@ -1931,6 +2081,7 @@ static void init_columns(args_t *args)
              {
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->setter = vcf_setter_ref;
                  col->hdr_key_src = strdup(str.s);
                  col->hdr_key_dst = strdup(str.s);
@@ -1943,28 +2094,54 @@ static void init_columns(args_t *args)
              {
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->setter = vcf_setter_alt;
                  col->hdr_key_src = strdup(str.s);
                  col->hdr_key_dst = strdup(str.s);
              }
              else args->alt_idx = icol;
          }
-        else if ( !strcasecmp("ID",str.s) )
+        else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+            if ( str.s[0]=='~' ) replace = MATCH_VALUE;
+            if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
              col->hdr_key_src = strdup(str.s);
              col->hdr_key_dst = strdup(str.s);
+            if ( replace==MATCH_VALUE ) args->match_id = icol;
+        }
+        else if ( !strncasecmp("ID:=",str.s,4) )    // transfer a tag from INFO to ID column
+        {
+            if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
+            if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+            col->setter = vcf_setter_id;
+            col->getter = vcf_getter_info_str2str;
+            str.s[2] = 0;
+            col->hdr_key_dst = strdup(str.s);
+            col->hdr_key_src = strncasecmp("INFO/",str.s+4,5) ? strdup(str.s+4) : strdup(str.s+4+5);
+            int hdr_id = bcf_hdr_id2int(args->tgts_hdr, BCF_DT_ID,col->hdr_key_src);
+            if ( !bcf_hdr_idinfo_exists(args->tgts_hdr,BCF_HL_INFO,hdr_id) ) 
+                error("The INFO tag \"%s\" is not defined in %s\n", col->hdr_key_src, args->targets_fname);
+            if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR )
+                error("Only Type=String tags can be used to annotate the ID column\n");
          }
          else if ( !strcasecmp("FILTER",str.s) )
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
@@ -1979,7 +2156,7 @@ static void init_columns(args_t *args)
                      bcf_hrec_t *hrec = tgts_hdr->hrec[j];
                      if ( hrec->type!=BCF_HL_FLT ) continue;
                      int k = bcf_hrec_find_key(hrec,"ID");
-                    assert( k>=0 ); // this should always be true for valid VCFs
+                    if ( k<0 ) error("[%s] Failed to parse the header, the ID attribute not found", __func__);
                      tmp.l = 0;
                      bcf_hrec_format(hrec, &tmp);
                      bcf_hdr_append(args->hdr_out, tmp.s);
@@ -1994,6 +2171,7 @@ static void init_columns(args_t *args)
              if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              col->icol = icol;
              col->replace = replace;
              col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
@@ -2003,7 +2181,7 @@ static void init_columns(args_t *args)
          else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
-            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
+            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
              bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
              int j;
              for (j=0; j<tgts_hdr->nhrec; j++)
@@ -2021,6 +2199,7 @@ static void init_columns(args_t *args)
                  int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->icol = -1;
                  col->replace = replace;
                  col->hdr_key_src = strdup(hrec->vals[k]);
@@ -2056,11 +2235,16 @@ static void init_columns(args_t *args)
                  int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]);
                  args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
                  annot_col_t *col = &args->cols[args->ncols-1];
+                memset(col,0,sizeof(*col));
                  col->icol = -1;
                  col->replace = replace;
                  col->hdr_key_src = strdup(hrec->vals[k]);
                  col->hdr_key_dst = strdup(hrec->vals[k]);
-                if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
+                if ( !strcasecmp("GT",col->hdr_key_src) )
+                {
+                    if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+                    col->setter = vcf_setter_format_gt;
+                }
                  else
                      switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
                      {
@@ -2099,9 +2283,10 @@ static void init_columns(args_t *args)
              }
              int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
              if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
-                error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+                error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
              args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
              annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
              if ( !args->tgts_is_vcf )
              {
                  col->icol = icol;
@@ -2112,7 +2297,11 @@ static void init_columns(args_t *args)
              col->replace = replace;
              col->hdr_key_src = strdup(key_src);
              col->hdr_key_dst = strdup(key_dst);
-            if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
+            if ( !strcasecmp("GT",key_src) )
+            {
+                if ( !args->tgts_is_vcf ) error("The FORMAT/GT field can be currently populated only from a VCF\n");
+                col->setter = vcf_setter_format_gt;
+            }
              else
                  switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
                  {
@@ -2131,13 +2320,20 @@ static void init_columns(args_t *args)
          else
          {
              if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
-            if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
-            int explicit_info = 0;
+            if ( replace==SET_OR_APPEND )
+            {
+                if ( args->tgts_is_vcf )
+                    error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
+                          "       (the annotation type is modified to \"Number=.\" and allele ordering is disregarded)\n");
+                fprintf(bcftools_stderr,"Warning: the =INFO/TAG feature modifies the annotation to \"Number=.\" and disregards allele ordering\n");
+            }
+            int explicit_src_info = 0;
+            int explicit_dst_info = 0;
              char *key_dst;
              if ( !strncasecmp("INFO/",str.s,5) )
              {
                  key_dst = str.s + 5;
-                explicit_info = 1;
+                explicit_dst_info = 1;
              }
              else
                  key_dst = str.s;
@@ -2149,7 +2345,7 @@ static void init_columns(args_t *args)
                  if ( !strncasecmp("INFO/",key_src,5) )
                  {
                      key_src += 5;
-                    explicit_info = 1;
+                    explicit_src_info = 1;
                  }
                  else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) )
                  {
@@ -2159,38 +2355,65 @@ static void init_columns(args_t *args)
              }
              else
                  key_src = key_dst;
+
+            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+            annot_col_t *col = &args->cols[args->ncols-1];
+            memset(col,0,sizeof(*col));
+            col->icol = icol;
+            col->replace = replace;
+            col->hdr_key_src = strdup(key_src);
+            col->hdr_key_dst = strdup(key_dst);
+
              int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
              if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
              {
                  if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
                  {
-                    bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
-                    if ( !hrec )
+                    if ( !strcasecmp("ID",key_src) && !explicit_src_info )
                      {
-                        if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
-                            error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
-                    fprintf(bcftools_stderr,"[%s] %d\n",key_src,explicit_info);
-                        error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname);
+                        // transferring ID column into a new INFO tag
+                        tmp.l = 0;
+                        ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred ID column\">",key_dst);
+                    }
+                    else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info )
+                    {
+                        // transferring FILTER column into a new INFO tag
+                        tmp.l = 0;
+                        ksprintf(&tmp,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Transferred FILTER column\">",key_dst);
+                    }
+                    else
+                    {
+                        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
+                        if ( !hrec )
+                        {
+                            if ( explicit_dst_info+explicit_src_info==0 && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) )
+                                error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s);
+                            char *ptr = strchr(key_src,'=');
+                            if ( ptr )
+                            {
+                                *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
+                                error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+                            }
+                            error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+                        }
+                        tmp.l = 0;
+                        bcf_hrec_format_rename(hrec, key_dst, &tmp);
                      }
-                    tmp.l = 0;
-                    bcf_hrec_format_rename(hrec, key_dst, &tmp);
                      bcf_hdr_append(args->hdr_out, tmp.s);
                      if (bcf_hdr_sync(args->hdr_out) < 0)
                          error_errno("[%s] Failed to update header", __func__);
                      hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
                  }
                  else
-                    error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
+                    error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname);
                  assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
              }
-
-            args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
-            annot_col_t *col = &args->cols[args->ncols-1];
-            col->icol = icol;
-            col->replace = replace;
-            col->hdr_key_src = strdup(key_src);
-            col->hdr_key_dst = strdup(key_dst);
-            col->number  = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
+            if  ( args->tgts_is_vcf )
+            {
+                if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str;
+                else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str;
+            }
+            col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
              switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
              {
                  case BCF_HT_FLAG:   col->setter = args->tgts_is_vcf ? vcf_setter_info_flag : setter_info_flag; break;
@@ -2199,6 +2422,18 @@ static void init_columns(args_t *args)
                  case BCF_HT_STR:    col->setter = args->tgts_is_vcf ? vcf_setter_info_str  : setter_info_str; break;
                  default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
              }
+            if ( replace==SET_OR_APPEND )   // change to Number=.
+            {
+                bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
+                if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
+                hrec = bcf_hrec_dup(hrec);
+                int j = bcf_hrec_find_key(hrec, "Number");
+                if ( j<0 ) error("Uh, could not find the entry Number in the header record of %s\n",key_dst);
+                free(hrec->vals[j]);
+                hrec->vals[j] = strdup(".");
+                bcf_hdr_remove(args->hdr_out,BCF_HL_INFO, key_dst);
+                bcf_hdr_add_hrec(args->hdr_out, hrec);
+            }
          }
          if ( !*se ) break;
          ss = ++se;
@@ -2234,10 +2469,10 @@ static void init_merge_method(args_t *args)
          args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0;
          memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr));
      }
-    if ( !args->merge_method_str ) return;
+    if ( !args->merge_method_str.l ) return;
      if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n");
-    if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n");
-    char *sb = args->merge_method_str;
+    if ( !args->tgt_idx && !args->tgts ) error("Error: BEG,END (or FROM,TO) columns or REF,ALT columns are expected with the --merge-logic option.\n");
+    char *sb = args->merge_method_str.s;
      while ( *sb )
      {
          char *se = sb;
@@ -2248,21 +2483,27 @@ static void init_merge_method(args_t *args)
          char *mm_type_str = args->tmpks.s + args->tmpks.l;
          while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--;
          if ( *mm_type_str!=':' )
-            error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str);
+            error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str.s);
          *mm_type_str = 0;
          mm_type_str++;
          int mm_type = MM_FIRST;
          if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE;
+        else if ( !strcasecmp("first",mm_type_str) ) mm_type = MM_FIRST;
          else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND;
+        else if ( !strcasecmp("append-missing",mm_type_str) )
+        {
+            mm_type = MM_APPEND_MISSING;
+            if ( args->ref_idx!=-1 ) args->has_append_mode = 1;
+        }
          else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM;
          else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG;
          else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN;
          else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX;
-        else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str);
+        else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str.s,mm_type_str);
          for (i=0; i<args->ncols; i++)
          {
              if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue;
-            if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR )
+            if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR )
                  error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n");
              args->cols[i].merge_method = mm_type;
              break;
@@ -2270,6 +2511,20 @@ static void init_merge_method(args_t *args)
          if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s);
          sb = *se ? se + 1 : se;
      }
+    if ( args->has_append_mode )
+    {
+        // create a missing line to insert missing values when VCF ALT finds no match in the annotation file
+        args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing));
+        int ncol = 0;
+        for (i=0; i<args->ncols; i++)
+            if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1;
+        if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1;
+        args->aline_missing->mcols = ncol;
+        args->aline_missing->ncols = ncol;
+        args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*));
+        for (i=0; i<ncol; i++)
+            args->aline_missing->cols[i] = strdup(".");
+    }
  }
  
  static void rename_chrs(args_t *args, char *fname)
@@ -2301,6 +2556,42 @@ static void rename_chrs(args_t *args, char *fname)
      free(map);
  }
  
+// Rename INFO, FORMAT and FILTER annotations in the output header (args->hdr_out).
+//
+//  args  .. only args->hdr_out is accessed; matching header records are modified in place
+//  fname .. list file read with hts_readlist(). Each line holds the prefixed old name,
+//           whitespace and the new name, e.g. "INFO/OLD NEW". Recognised prefixes
+//           (case-insensitive): "info/", "format/", "fmt/", "filter/"
+//
+// A line without a whitespace separator or with an unrecognised prefix is fatal (error()).
+// A line naming an annotation that is not present in the header is silently skipped.
+// A line naming an existing annotation but giving no new name is fatal, an empty ID
+// must never be written to the header.
+//
+// NOTE(review): only the header record's ID value and the BCF_DT_ID dictionary key pointer
+// are updated here; presumably the dictionary entry is shared by INFO/FORMAT/FILTER tags
+// of the same name - confirm that renaming just one of them is safe.
+static void rename_annots(args_t *args, char *fname)
+{
+    int n, i;
+    char **map = hts_readlist(fname, 1, &n);
+    if ( !map ) error("Could not read: %s\n", fname);
+    for (i=0; i<n; i++)
+    {
+        // Terminate the "TYPE/old" field at the first whitespace character. The casts to
+        // unsigned char keep isspace() well defined for bytes with the high bit set.
+        char *sb = NULL, *ss = map[i];
+        while ( *ss && !isspace((unsigned char)*ss) ) ss++;
+        if ( !*ss ) error("Could not parse: %s\n", fname);
+        *ss = 0;
+
+        // Determine the header line type from the prefix, sb is set to the bare old name
+        int type;
+        if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5;
+        else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7;
+        else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4;
+        else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7;
+        else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname);
+
+        int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, sb);
+        if ( id<0 ) continue;
+        bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL);
+        if ( !hrec ) continue;  // the annotation is not present in the header
+        int j = bcf_hrec_find_key(hrec, "ID");
+        assert( j>=0 );
+
+        // Isolate the new name before the old value is released, so that a malformed
+        // line cannot leave the record with a freed or empty ID
+        ss++;
+        while ( *ss && isspace((unsigned char)*ss) ) ss++;
+        if ( !*ss ) error("Could not parse \"%s\", the new name is missing: %s\n",map[i],fname);
+        char *se = ss;
+        while ( *se && !isspace((unsigned char)*se) ) se++;
+        *se = 0;
+
+        // Replace the ID value and make the dictionary entry point to the new string
+        free(hrec->vals[j]);
+        hrec->vals[j] = strdup(ss);
+        args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j];
+    }
+    for (i=0; i<n; i++) free(map[i]);
+    free(map);
+}
+
+
  static void init_data(args_t *args)
  {
      args->hdr = args->files->readers[0].header;
@@ -2313,6 +2604,7 @@ static void init_data(args_t *args)
          // reading annots from a VCF
          if ( !bcf_sr_add_reader(args->files, args->targets_fname) )
              error("Failed to open %s: %s\n", args->targets_fname,bcf_sr_strerror(args->files->errnum));
+        args->tgts_hdr = args->files->readers[1].header;
      }
      if ( args->columns ) init_columns(args);
      if ( args->targets_fname && !args->tgts_is_vcf )
@@ -2320,8 +2612,8 @@ static void init_data(args_t *args)
          if ( !args->columns ) error("The -c option not given\n");
          if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n");
          if ( args->beg_idx==-1 ) error("The -c POS option not given\n");
-        if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
-        if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) )
+        if ( args->single_overlaps && args->merge_method_str.l ) error("The options --merge-logic and --single-overlaps cannot be combined\n");
+        if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str.l) )
          {
              args->end_idx = -args->beg_idx - 1;
              args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx);
@@ -2365,8 +2657,9 @@ static void init_data(args_t *args)
      if ( !args->drop_header )
      {
          if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
+        if ( args->rename_annots ) rename_annots(args, args->rename_annots);
  
-        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
          if ( args->n_threads )
              hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
@@ -2388,8 +2681,15 @@ static void destroy_data(args_t *args)
          free(args->cols[i].mm_kstr.s);
          if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash);
          free(args->cols[i].mm_dbl);
+        free(args->cols[i].ptr);
      }
      free(args->cols);
+    if ( args->aline_missing )
+    {
+        for (i=0; i<args->aline_missing->ncols; i++) free(args->aline_missing->cols[i]);
+        free(args->aline_missing->cols);
+        free(args->aline_missing);
+    }
      for (i=0; i<args->malines; i++)
      {
          free(args->alines[i].cols);
@@ -2397,6 +2697,7 @@ static void destroy_data(args_t *args)
          free(args->alines[i].line.s);
      }
      free(args->alines);
+    free(args->srt_alines);
      if ( args->tgt_idx )
      {
          regidx_destroy(args->tgt_idx);
@@ -2422,6 +2723,7 @@ static void destroy_data(args_t *args)
          filter_destroy(args->filter);
      if (args->out_fh) hts_close(args->out_fh);
      free(args->sample_map);
+    free(args->merge_method_str.s);
  }
  
  static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp)
@@ -2485,7 +2787,6 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
          }
          else i++;
      }
-
      if ( args->ref_idx==-1 && args->nalines ) return;
  
      while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
@@ -2506,6 +2807,36 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en
      }
  }
  
+// Search for a string among the fields of a semicolon-separated list (xx vs aa;bb).
+//
+//  needle   .. the string to look for, expected to contain no semicolons
+//  haystack .. the semicolon-separated list to search
+//
+// Returns 1 if a whole field of haystack is identical to needle and 0 otherwise.
+// Occurrences of needle which form only a prefix, a suffix or an inner part of a
+// field do not count. An empty needle never matches.
+static int str_match(char *needle, char *haystack)
+{
+    size_t len = strlen(needle);
+    if ( !len ) return 0;   // nothing to compare; strstr() would "find" it at every position
+    char *ptr = haystack;
+    while ( (ptr=strstr(ptr,needle)) )
+    {
+        int starts_field = ptr==haystack || ptr[-1]==';';
+        int ends_field   = ptr[len]==0 || ptr[len]==';';
+        if ( starts_field && ends_field ) return 1;     // a match
+        // Only part of a field. The hit is at least one character long, stepping by
+        // one therefore never moves past the terminating NUL.
+        ptr++;
+    }
+    return 0;
+}
+// Test whether two semicolon-separated lists have a field in common (xx;yy;zz vs aa;bb).
+// The fields of "a" are looked up in "b" one by one with str_match(); the first non-zero
+// result is returned, or 0 when no field of "a" is found. While a field is being tested,
+// the ';' following it in "a" is temporarily overwritten with NUL and restored afterwards.
+static int strstr_match(char *a, char *b)
+{
+    char *field = a;
+    while ( *field )
+    {
+        char *sep = strchr(field, ';');
+        if ( !sep ) return str_match(field, b);     // last field, its result is final
+        *sep = 0;
+        int found = str_match(field, b);
+        *sep = ';';
+        if ( found ) return found;
+        field = sep + 1;
+    }
+    return 0;
+}
  static void annotate(args_t *args, bcf1_t *line)
  {
      int i, j;
@@ -2513,9 +2844,9 @@ static void annotate(args_t *args, bcf1_t *line)
          args->rm[i].handler(args, line, &args->rm[i]);
  
      int has_overlap = 0;
-
      if ( args->tgt_idx )
      {
+        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
          if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) )
          {
              while ( regitr_overlap(args->tgt_itr) )
@@ -2526,49 +2857,145 @@ static void annotate(args_t *args, bcf1_t *line)
                  tmp->end   = args->tgt_itr->end;
                  parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp);
                  for (j=0; j<args->ncols; j++)
-                    if ( args->cols[j].setter(args,line,&args->cols[j],tmp) )
+                {
+                    if ( args->cols[j].done==1 ) continue;
+                    int ret = args->cols[j].setter(args,line,&args->cols[j],tmp);
+                    if ( ret < 0 )
                          error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                    if ( ret==0 )
+                        args->cols[j].done = 1;
+                }
              }
              has_overlap = 1;
          }
          for (j=0; j<args->ncols; j++)
-            if ( args->cols[j].merge_method != MM_FIRST )
-                args->cols[j].setter(args,line,&args->cols[j],NULL);
+        {
+            if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+            if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
+                error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+        }
      }
      else if ( args->tgts )
      {
-        // Buffer annotation lines. When multiple ALT alleles are present in the
-        // annotation file, at least one must match one of the VCF alleles.
-        int len = 0;
-        bcf_get_variant_types(line);
-        for (i=1; i<line->n_allele; i++)
-            if ( len > line->d.var[i].n ) len = line->d.var[i].n;
-        int end_pos = len<0 ? line->pos - len : line->pos;
+        // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one
+        // must match some of the VCF alleles. If the append-missing mode is set (and REF+ALT is requested), the
+        // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found
+        // for an ALT, missing value is appended instead.
+        int end_pos = line->pos + line->rlen - 1;
          buffer_annot_lines(args, line, line->pos, end_pos);
+
+        args->nsrt_alines = 0;
+        hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines);
+        if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
+            error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+
+        // Find matching lines
          for (i=0; i<args->nalines; i++)
          {
              if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue;
-            if ( args->ref_idx != -1 )
+            if ( args->ref_idx != -1 )  // REF+ALT matching requested
              {
-                if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs not compatible
+                if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue;   // refs are not compatible
                  for (j=1; j<args->alines[i].nals; j++)
                  {
-                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break;   // no ALT allele in VCF and annot file has "."
-                    if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break;
+                    int ialt;
+                    if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 )  // match: no ALT allele in VCF and annot file has "."
+                        ialt = 0;
+                    else
+                    {
+                        ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]);
+                        if ( ialt < 0 ) continue;
+                        ialt++;
+                    }
+                    if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+                    args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
+                    has_overlap = 1;
+                    break;
                  }
-                if ( j==args->alines[i].nals ) continue;    // none of the annot alleles present in VCF's ALT
              }
-            break;
+            else    // overlap, REF+ALT matching not requested
+            {
+                args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i;
+                has_overlap = 1;
+            }
          }
-
-        if ( i<args->nalines )
+        // Sort lines if needed
+        if ( args->has_append_mode )
+        {
+            // insertion sort by VCF ALT index (top bits) and alines index (low bits)
+            uint32_t tmp;
+            for (i=1; i<args->nsrt_alines; i++)
+                for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--)
+                    tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp;
+        }
+        // Annotate
+        for (j=0; j<args->ncols; j++) args->cols[j].done = 0;
+        int ialt_exp = 1;
+        for (i=0; i<args->nsrt_alines; i++)
          {
-            // there is a matching line
+            int ialt = args->srt_alines[i] >> 16;
+            int ilin = args->srt_alines[i] & 0xffff;
+            if ( args->has_append_mode )
+            {
+                if ( ialt_exp > ialt ) continue;    // multiple annotation lines for the same position
+                if ( ialt_exp < ialt )
+                {
+                    // REF+ALT matching requested, append-missing mode: insert "." if no annotation line was found for the ALT
+                    while ( ialt_exp++ < ialt )
+                    {
+                        for (j=0; j<args->ncols; j++)
+                        {
+                            if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                            if ( args->cols[j].done==1 ) continue;
+                            int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                            if ( ret < 0 )
+                                error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                            if ( ret==0 )
+                                args->cols[j].done = 1;
+                        }
+                    }
+                }
+            }
              for (j=0; j<args->ncols; j++)
-                if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
+            {
+                if ( args->cols[j].done==1 ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
+                if ( ret < 0 )
                      error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                if ( ret==0 )
+                    args->cols[j].done = 1;
+            }
+            ialt_exp = ialt + 1;
+        }
+        if ( args->nsrt_alines )
+        {
+            // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one
+            // record was found. Otherwise leave the row will be left without annotation.
+            if ( args->has_append_mode && ialt_exp < line->n_allele )
+            {
+                while ( ialt_exp++ < line->n_allele )
+                {
+                    for (j=0; j<args->ncols; j++)
+                    {
+                        if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
+                        if ( args->cols[j].done==1 ) continue;
+                        int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
+                        if ( ret < 0 )
+                            error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+                        if ( ret==0 )
+                            args->cols[j].done = 1;
+                    }
+                }
+            }
+            // Flush
+            for (j=0; j<args->ncols; j++)
+            {
+                if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+                int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
+                if ( ret < 0 )
+                    error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+            }
          }
-        has_overlap = i<args->nalines ? 1 : 0;
      }
      else if ( args->files->nreaders == 2 )
      {
@@ -2613,30 +3040,32 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "Usage:   bcftools annotate [options] <in.vcf.gz>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "   -a, --annotations <file>       VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
-    fprintf(bcftools_stderr, "       --collapse <string>        matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
-    fprintf(bcftools_stderr, "   -c, --columns <list>           list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
-    fprintf(bcftools_stderr, "   -e, --exclude <expr>           exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "       --force                    continue despite parsing error (at your own risk!)\n");
-    fprintf(bcftools_stderr, "   -h, --header-lines <file>      lines which should be appended to the VCF header\n");
-    fprintf(bcftools_stderr, "   -I, --set-id [+]<format>       set ID column, see man page for details\n");
-    fprintf(bcftools_stderr, "   -i, --include <expr>           select sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "   -k, --keep-sites               leave -i/-e sites unchanged instead of discarding them\n");
-    fprintf(bcftools_stderr, "   -l, --merge-logic <tag:type>   merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
-    fprintf(bcftools_stderr, "   -m, --mark-sites [+-]<tag>     add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
-    fprintf(bcftools_stderr, "       --no-version               do not append version and command line to the header\n");
-    fprintf(bcftools_stderr, "   -o, --output <file>            write output to a file [standard output]\n");
-    fprintf(bcftools_stderr, "   -O, --output-type <b|u|z|v>    b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
-    fprintf(bcftools_stderr, "   -r, --regions <region>         restrict to comma-separated list of regions\n");
-    fprintf(bcftools_stderr, "   -R, --regions-file <file>      restrict to regions listed in a file\n");
-    fprintf(bcftools_stderr, "       --rename-chrs <file>       rename sequences according to map file: from\\tto\n");
-    fprintf(bcftools_stderr, "   -s, --samples [^]<list>        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(bcftools_stderr, "   -S, --samples-file [^]<file>   file of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(bcftools_stderr, "       --single-overlaps          keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
-    fprintf(bcftools_stderr, "   -x, --remove <list>            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
-    fprintf(bcftools_stderr, "       --threads <int>            number of extra output compression threads [0]\n");
+    fprintf(bcftools_stderr, "   -a, --annotations FILE       VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+    fprintf(bcftools_stderr, "       --collapse STR           matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+    fprintf(bcftools_stderr, "   -c, --columns LIST           list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+    fprintf(bcftools_stderr, "   -C, --columns-file FILE      read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+    fprintf(bcftools_stderr, "   -e, --exclude EXPR           exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "       --force                  continue despite parsing error (at your own risk!)\n");
+    fprintf(bcftools_stderr, "   -h, --header-lines FILE      lines which should be appended to the VCF header\n");
+    fprintf(bcftools_stderr, "   -I, --set-id [+]FORMAT       set ID column using a `bcftools query`-like expression, see man page for details\n");
+    fprintf(bcftools_stderr, "   -i, --include EXPR           select sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "   -k, --keep-sites             leave -i/-e sites unchanged instead of discarding them\n");
+    fprintf(bcftools_stderr, "   -l, --merge-logic TAG:TYPE   merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+    fprintf(bcftools_stderr, "   -m, --mark-sites [+-]TAG     add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+    fprintf(bcftools_stderr, "       --no-version             do not append version and command line to the header\n");
+    fprintf(bcftools_stderr, "   -o, --output FILE            write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "   -O, --output-type [b|u|z|v]  b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+    fprintf(bcftools_stderr, "   -r, --regions REGION         restrict to comma-separated list of regions\n");
+    fprintf(bcftools_stderr, "   -R, --regions-file FILE      restrict to regions listed in FILE\n");
+    fprintf(bcftools_stderr, "       --rename-annots FILE     rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+    fprintf(bcftools_stderr, "       --rename-chrs FILE       rename sequences according to the mapping: old\\tnew\n");
+    fprintf(bcftools_stderr, "   -s, --samples [^]LIST        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+    fprintf(bcftools_stderr, "   -S, --samples-file [^]FILE   file of samples to annotate (or exclude with \"^\" prefix)\n");
+    fprintf(bcftools_stderr, "       --single-overlaps        keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+    fprintf(bcftools_stderr, "   -x, --remove LIST            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+    fprintf(bcftools_stderr, "       --threads INT            number of extra output compression threads [0]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfannotate(int argc, char *argv[])
@@ -2651,6 +3080,7 @@ int main_vcfannotate(int argc, char *argv[])
      args->record_cmd_line = 1;
      args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
      args->set_ids_replace = 1;
+    args->match_id = -1;
      int regions_is_file = 0, collapse = 0;
  
      static struct option loptions[] =
@@ -2669,7 +3099,9 @@ int main_vcfannotate(int argc, char *argv[])
          {"regions",required_argument,NULL,'r'},
          {"regions-file",required_argument,NULL,'R'},
          {"remove",required_argument,NULL,'x'},
+        {"columns-file",required_argument,NULL,'C'},
          {"columns",required_argument,NULL,'c'},
+        {"rename-annots",required_argument,NULL,11},
          {"rename-chrs",required_argument,NULL,1},
          {"header-lines",required_argument,NULL,'h'},
          {"samples",required_argument,NULL,'s'},
@@ -2679,7 +3111,7 @@ int main_vcfannotate(int argc, char *argv[])
          {"force",no_argument,NULL,'f'},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
      {
          switch (c) {
              case 'f': args->force = 1; break;
@@ -2690,11 +3122,15 @@ int main_vcfannotate(int argc, char *argv[])
                  else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; }
                  else args->mark_sites = optarg; 
                  break;
-            case 'l': args->merge_method_str = optarg; break;
+            case 'l': 
+                if ( args->merge_method_str.l ) kputc(',',&args->merge_method_str);
+                kputs(optarg,&args->merge_method_str);
+                break;
              case 'I': args->set_ids_fmt = optarg; break;
              case 's': args->sample_names = optarg; break;
              case 'S': args->sample_names = optarg; args->sample_is_file = 1; break;
              case 'c': args->columns = strdup(optarg); break;
+            case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O':
                  switch (optarg[0]) {
@@ -2705,8 +3141,12 @@ int main_vcfannotate(int argc, char *argv[])
                      default: error("The output type \"%s\" not recognised\n", optarg);
                  };
                  break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'x': args->remove_annots = optarg; break;
              case 'a': args->targets_fname = optarg; break;
              case 'r': args->regions_list = optarg; break;
@@ -2726,6 +3166,7 @@ int main_vcfannotate(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case 10 : args->single_overlaps = 1; break;
+            case 11 : args->rename_annots = optarg; break;
              case '?': usage(args); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c

index ffdfd407832b877637ddf7312eeb7eec30bcafc3..71916bb6bce455d38d017a5348832700e85b92a5 100644 (file)
--- a/bcftools/vcfbuf.c
+++ b/bcftools/vcfbuf.c
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2016-2019 Genome Research Ltd.
+   Copyright (c) 2016-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -24,16 +24,19 @@
  
   */
  
+#include <assert.h>
+#include <strings.h>
  #include <htslib/vcf.h>
  #include <htslib/vcfutils.h>
+#include <htslib/hts_os.h>
  #include "bcftools.h"
  #include "vcfbuf.h"
  #include "rbuf.h"
  
  typedef struct
  {
-    double max;
-    int rand_missing, skip_filter;
+    double max[VCFBUF_LD_N];
+    int rand_missing, filter1;
  }
  ld_t;
  
@@ -41,13 +44,16 @@ typedef struct
  {
      bcf1_t *rec;
      double af;
-    int af_set:1, idx:31;
+    int af_set:1, filter:1, idx:30;
  }
  vcfrec_t;
  
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST    2
+#define PRUNE_MODE_RAND   3
  typedef struct
  {
-    int max_sites, mvrec, mac, mfarr;
+    int max_sites, mvrec, mac, mfarr, mode;
      int *ac, *idx;
      float *farr;
      char *af_tag;
@@ -85,6 +91,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
      buf->hdr = hdr;
      buf->win = win;
      buf->overlap.rid = -1;
+    int i;
+    for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
      rbuf_init(&buf->rbuf, 0);
      return buf;
  }
@@ -104,13 +112,30 @@ void vcfbuf_destroy(vcfbuf_t *buf)
  
  void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
  {
-    if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }
-    if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
-    if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }
-    if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }
+    if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; }
+    if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
+    if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; }
+    if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
+    if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+
+    if ( key==VCFBUF_NSITES )
+    {
+        buf->prune.max_sites = *((int*)value);
+        if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+        return;
+    }
      if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
      if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
      if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+
+    if ( key==VCFBUF_NSITES_MODE )
+    {
+        char *mode = *((char**)value);
+        if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+        else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+        else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+        else error("The mode \"%s\" is not recognised\n",mode);
+    }
  }
  
  int vcfbuf_nsites(vcfbuf_t *buf)
@@ -118,10 +143,8 @@ int vcfbuf_nsites(vcfbuf_t *buf)
      return buf->rbuf.n;
  }
  
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
  {
-    if ( !swap ) error("todo: swap=%d\n", swap);
-
      rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
  
      int i = rbuf_append(&buf->rbuf);
@@ -130,6 +153,8 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
      bcf1_t *ret = buf->vcf[i].rec;
      buf->vcf[i].rec = rec;
      buf->vcf[i].af_set = 0;
+    buf->vcf[i].filter = buf->ld.filter1;
+    buf->ld.filter1 = 0;
  
      return ret;
  }
@@ -170,6 +195,26 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
  {
      int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
  
+    int nprune = nbuf - buf->prune.max_sites;
+    int i,k,irec = 0;
+    if ( buf->prune.mode==PRUNE_MODE_1ST )
+    {
+        int eoff = flush_all ? 1 : 2;
+        for (i=0; i<nprune; i++)
+            rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf);
+        return;
+    }
+    if ( buf->prune.mode==PRUNE_MODE_RAND )
+    {
+        int eoff = flush_all ? 0 : 1;
+        for (i=0; i<nprune; i++)
+        {
+            int j = (buf->rbuf.n - eoff) * hts_drand48();
+            rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf);
+        }
+        return;
+    }
+
      if ( nbuf > buf->prune.mvrec )
      {
          buf->prune.idx   = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
@@ -178,7 +223,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
      }
  
      // set allele frequency and prepare buffer for sorting
-    int i,k,irec = 0;
      for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
      {
          bcf1_t *line = buf->vcf[i].rec;
@@ -211,7 +255,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
  
      // sort the rbuf indexes to be pruned descendently so that j-th rbuf index
      // is removed before i-th index if i<j
-    int nprune = nbuf - buf->prune.max_sites;
      for (i=0; i<nprune; i++)
          buf->prune.idx[i] = buf->prune.vrec[i]->idx;
  
@@ -333,10 +376,21 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples)
  }
  
  /*
-    For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
+    The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
          D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+
+    and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
+    disequilibrium from unphased data.  Molecular Biology and Evolution. doi:10.1093/molbev/msz265 
+
+        \hat{D} = 1/[n*(n+1)]*[
+                             (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
+                            -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2)
+                        ]
+    where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes.
+
+    Returns 0 on success, -1 if the values could not be determined (missing genotypes)
  */
-static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld)
  {
      if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
      assert( arec->n_sample );
@@ -365,21 +419,24 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
          baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
      }
  
-    // Calculate correlation 
+    // Calculate r2, ld, hd
+    double nhd[] = {0,0,0,0,0,0,0,0,0};
      double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
-    int nab = 0, na = 0, nb = 0, ndiff = 0;
+    int nab = 0, ndiff = 0;
+    int an_tot = 0, bn_tot = 0; 
      for (i=0; i<arec->n_sample; i++)
      {
          int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
          int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
-        int adsg = 0, bdsg = 0, an = 0, bn = 0;
+        int adsg = 0, bdsg = 0;     // dosages (0,1,2) at sites (a,b)
+        int an = 0, bn = 0;         // number of alleles at sites (a,b)
          for (j=0; j<afmt->n; j++)
          {
              if ( aptr[j]==bcf_int8_vector_end ) break;
              if ( aptr[j]==bcf_gt_missing )
              {
                  if ( !buf->ld.rand_missing ) break;
-                if ( rand()/RAND_MAX >= aaf ) adsg += 1;
+                if ( hts_drand48() >= aaf ) adsg += 1;
              }
              else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
              an++;
@@ -390,89 +447,112 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
              if ( bptr[j]==bcf_gt_missing )
              {
                  if ( !buf->ld.rand_missing ) break;
-                if ( rand()/RAND_MAX >= baf ) bdsg += 1;
+                if ( hts_drand48() >= baf ) bdsg += 1;
              }
              else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
              bn++;
          }
-        if ( an )
+        if ( an && bn )
          {
+            an_tot += an;
              aa += adsg*adsg;
              a  += adsg;
-            na++;
-        }
-        if ( bn )
-        {
+
+            bn_tot += bn;
              bb += bdsg*bdsg;
              b  += bdsg;
-            nb++;
-        }
-        if ( an && bn )
-        {
+
              if ( adsg!=bdsg ) ndiff++;
              ab += adsg*bdsg;
              nab++;
          }
+        if ( an==2 && bn==2 )   // for now only diploid genotypes
+        {
+            assert( adsg<=2 && bdsg<=2 );
+            nhd[ bdsg*3 + adsg ]++;
+        }
      }
-    if ( !nab ) return -1;
+    if ( !nab ) return -1;  // no data in common for the two sites
  
+    double pa = a/an_tot;
+    double pb = b/bn_tot;
      double cor;
      if ( !ndiff ) cor = 1;
      else
      {
-        // Don't know how to deal with zero variance. Since this the purpose is filtering,
-        // it is not enough to say the value is undefined. Therefore an artificial noise is
-        // added to make the denominator non-zero.
-        if ( aa == a*a/na || bb == b*b/nb )
+        if ( aa == a*a/nab || bb == b*b/nab )     // zero variance, add small noise
          {
-            aa += 3*3;
-            bb += 3*3;
-            ab += 3*3;
-            a  += 3;
-            b  += 3;
-            na++;
-            nb++;
+            aa += 1e-4;
+            bb += 1e-4;
+            ab += 1e-4;
+            a  += 1e-2;
+            b  += 1e-2;
              nab++;
          }
-        cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+        cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab);
      }
-    return cor*cor;
+
+    ld->val[VCFBUF_LD_IDX_R2] = cor * cor;
+
+    // Lewontin's normalization of D. Also we cap at 1 as the calculation
+    // can result in values bigger than 1 for high AFs.
+    ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb));
+    double norm;
+    if ( ld->val[VCFBUF_LD_IDX_LD] < 0 )
+        norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb);
+    else
+        norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb;
+    if ( norm )
+        ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1;
+    if ( !ld->val[VCFBUF_LD_IDX_LD] )
+        ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]);    // avoid "-0" on output
+
+    ld->val[VCFBUF_LD_IDX_HD] =
+        (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) 
+        - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
+    ld->val[VCFBUF_LD_IDX_HD] /= nab;
+    ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
+
+    return 0;
  }
  
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld)
  {
-    *ld = -1;
-    if ( !buf->rbuf.n ) return NULL;
+    int ret = -1;
+    if ( !buf->rbuf.n ) return ret;
  
-    int i = buf->rbuf.f;
+    int j, i = buf->rbuf.f;
  
      // Relying on vcfbuf being properly flushed - all sites in the buffer
      // must come from the same chromosome
-    if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+    if ( buf->vcf[i].rec->rid != rec->rid ) return ret;
+
+    vcfbuf_ld_t tmp;
+    for (j=0; j<VCFBUF_LD_N; j++)
+    {
+        ld->val[j] = -HUGE_VAL;
+        ld->rec[j] = NULL;
+    }
  
-    int imax = 0;
-    double max = 0;
      for (i=-1; rbuf_next(&buf->rbuf,&i); )
      {   
-        if ( buf->ld.skip_filter )
-        {
-            if ( buf->vcf[i].rec->d.n_flt > 1 ) continue;   // multiple filters are set
-            if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue;    // not PASS
-        }
-        double val = _calc_ld(buf, buf->vcf[i].rec, rec);
-        if ( buf->ld.max && buf->ld.max < val ) 
-        {
-            *ld = val;
-            return buf->vcf[i].rec;
-        }
-        if ( val > max )
+        if ( buf->vcf[i].filter ) continue;
+        if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue;   // missing genotypes
+
+        int done = 0;
+        for (j=0; j<VCFBUF_LD_N; j++)
          {
-            max  = val;
-            imax = i;
+            if ( ld->val[j] < tmp.val[j] )
+            {
+                ld->val[j] = tmp.val[j];
+                ld->rec[j] = buf->vcf[i].rec;
+            }
+            if ( buf->ld.max[j] < tmp.val[j] ) done = 1;
+            ret = 0;
          }
+        if ( done ) return ret;
      }
-    *ld = max;
-    return buf->vcf[imax].rec;
+    return ret;
  }
  
  
diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c

index d1dcf991c16dd61ebdaaf257f2977f727e278e99..50df73d8cc2bf01e32cf30fe0da02556c0057ba2 100644 (file)
--- a/bcftools/vcfbuf.c.pysam.c
+++ b/bcftools/vcfbuf.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* The MIT License
  
-   Copyright (c) 2016-2019 Genome Research Ltd.
+   Copyright (c) 2016-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -26,16 +26,19 @@
  
   */
  
+#include <assert.h>
+#include <strings.h>
  #include <htslib/vcf.h>
  #include <htslib/vcfutils.h>
+#include <htslib/hts_os.h>
  #include "bcftools.h"
  #include "vcfbuf.h"
  #include "rbuf.h"
  
  typedef struct
  {
-    double max;
-    int rand_missing, skip_filter;
+    double max[VCFBUF_LD_N];
+    int rand_missing, filter1;
  }
  ld_t;
  
@@ -43,13 +46,16 @@ typedef struct
  {
      bcf1_t *rec;
      double af;
-    int af_set:1, idx:31;
+    int af_set:1, filter:1, idx:30;
  }
  vcfrec_t;
  
+#define PRUNE_MODE_MAX_AF 1
+#define PRUNE_MODE_1ST    2
+#define PRUNE_MODE_RAND   3
  typedef struct
  {
-    int max_sites, mvrec, mac, mfarr;
+    int max_sites, mvrec, mac, mfarr, mode;
      int *ac, *idx;
      float *farr;
      char *af_tag;
@@ -87,6 +93,8 @@ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win)
      buf->hdr = hdr;
      buf->win = win;
      buf->overlap.rid = -1;
+    int i;
+    for (i=0; i<VCFBUF_LD_N; i++) buf->ld.max[i] = HUGE_VAL;
      rbuf_init(&buf->rbuf, 0);
      return buf;
  }
@@ -106,13 +114,30 @@ void vcfbuf_destroy(vcfbuf_t *buf)
  
  void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value)
  {
-    if ( key==VCFBUF_LD_MAX ) { buf->ld.max = *((double*)value); return; }
-    if ( key==VCFBUF_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
-    if ( key==VCFBUF_SKIP_FILTER ) { buf->ld.skip_filter = *((int*)value); return; }
-    if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; }
+    if ( key==LD_FILTER1 ) { buf->ld.filter1 = *((int*)value); return; }
+    if ( key==LD_RAND_MISSING ) { buf->ld.rand_missing = *((int*)value); return; }
+    if ( key==LD_MAX_R2 ) { buf->ld.max[VCFBUF_LD_IDX_R2] = *((double*)value); return; }
+    if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
+    if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+
+    if ( key==VCFBUF_NSITES )
+    {
+        buf->prune.max_sites = *((int*)value);
+        if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+        return;
+    }
      if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; }
      if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; }
      if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; }
+
+    if ( key==VCFBUF_NSITES_MODE )
+    {
+        char *mode = *((char**)value);
+        if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF;
+        else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
+        else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
+        else error("The mode \"%s\" is not recognised\n",mode);
+    }
  }
  
  int vcfbuf_nsites(vcfbuf_t *buf)
@@ -120,10 +145,8 @@ int vcfbuf_nsites(vcfbuf_t *buf)
      return buf->rbuf.n;
  }
  
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec)
  {
-    if ( !swap ) error("todo: swap=%d\n", swap);
-
      rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);
  
      int i = rbuf_append(&buf->rbuf);
@@ -132,6 +155,8 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
      bcf1_t *ret = buf->vcf[i].rec;
      buf->vcf[i].rec = rec;
      buf->vcf[i].af_set = 0;
+    buf->vcf[i].filter = buf->ld.filter1;
+    buf->ld.filter1 = 0;
  
      return ret;
  }
@@ -172,6 +197,26 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
  {
      int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;
  
+    int nprune = nbuf - buf->prune.max_sites;
+    int i,k,irec = 0;
+    if ( buf->prune.mode==PRUNE_MODE_1ST )
+    {
+        int eoff = flush_all ? 1 : 2;
+        for (i=0; i<nprune; i++)
+            rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf);
+        return;
+    }
+    if ( buf->prune.mode==PRUNE_MODE_RAND )
+    {
+        int eoff = flush_all ? 0 : 1;
+        for (i=0; i<nprune; i++)
+        {
+            int j = (buf->rbuf.n - eoff) * hts_drand48();
+            rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf);
+        }
+        return;
+    }
+
      if ( nbuf > buf->prune.mvrec )
      {
          buf->prune.idx   = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
@@ -180,7 +225,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
      }
  
      // set allele frequency and prepare buffer for sorting
-    int i,k,irec = 0;
      for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
      {
          bcf1_t *line = buf->vcf[i].rec;
@@ -213,7 +257,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all)
  
      // sort the rbuf indexes to be pruned descendently so that j-th rbuf index
      // is removed before i-th index if i<j
-    int nprune = nbuf - buf->prune.max_sites;
      for (i=0; i<nprune; i++)
          buf->prune.idx[i] = buf->prune.vrec[i]->idx;
  
@@ -335,10 +378,21 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples)
  }
  
  /*
-    For unphased genotypes D is approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
+    The `ld` is set to D approximated as suggested in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2710162/
          D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
+
+    and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
+    disequilibrium from unphased data.  Molecular Biology and Evolution. doi:10.1093/molbev/msz265 
+
+        \hat{D} = 1/[n*(n+1)]*[
+                             (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
+                            -(n2/2 + n3 + n5/4 + n6/2)*(n4/2 + n5/4 + n7 + n8/2)
+                        ]
+    where n1,n2,..n9 are counts of RR/RR,RR/RA,..,AA/AA genotypes.
+
+    Returns 0 on success, -1 if the values could not be determined (missing genotypes)
  */
-static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
+static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *ld)
  {
      if ( arec->n_sample!=brec->n_sample ) error("Different number of samples: %d vs %d\n",arec->n_sample,brec->n_sample);
      assert( arec->n_sample );
@@ -367,21 +421,24 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
          baf = _estimate_af((int8_t*)bfmt->p, bfmt->size, bfmt->n, brec->n_sample);
      }
  
-    // Calculate correlation 
+    // Calculate r2, ld, hd
+    double nhd[] = {0,0,0,0,0,0,0,0,0};
      double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
-    int nab = 0, na = 0, nb = 0, ndiff = 0;
+    int nab = 0, ndiff = 0;
+    int an_tot = 0, bn_tot = 0; 
      for (i=0; i<arec->n_sample; i++)
      {
          int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
          int8_t *bptr = (int8_t*) (bfmt->p + i*bfmt->size);
-        int adsg = 0, bdsg = 0, an = 0, bn = 0;
+        int adsg = 0, bdsg = 0;     // dosages (0,1,2) at sites (a,b)
+        int an = 0, bn = 0;         // number of alleles at sites (a,b)
          for (j=0; j<afmt->n; j++)
          {
              if ( aptr[j]==bcf_int8_vector_end ) break;
              if ( aptr[j]==bcf_gt_missing )
              {
                  if ( !buf->ld.rand_missing ) break;
-                if ( rand()/RAND_MAX >= aaf ) adsg += 1;
+                if ( hts_drand48() >= aaf ) adsg += 1;
              }
              else if ( bcf_gt_allele(aptr[j]) ) adsg += 1;
              an++;
@@ -392,89 +449,112 @@ static double _calc_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec)
              if ( bptr[j]==bcf_gt_missing )
              {
                  if ( !buf->ld.rand_missing ) break;
-                if ( rand()/RAND_MAX >= baf ) bdsg += 1;
+                if ( hts_drand48() >= baf ) bdsg += 1;
              }
              else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1;
              bn++;
          }
-        if ( an )
+        if ( an && bn )
          {
+            an_tot += an;
              aa += adsg*adsg;
              a  += adsg;
-            na++;
-        }
-        if ( bn )
-        {
+
+            bn_tot += bn;
              bb += bdsg*bdsg;
              b  += bdsg;
-            nb++;
-        }
-        if ( an && bn )
-        {
+
              if ( adsg!=bdsg ) ndiff++;
              ab += adsg*bdsg;
              nab++;
          }
+        if ( an==2 && bn==2 )   // for now only diploid genotypes
+        {
+            assert( adsg<=2 && bdsg<=2 );
+            nhd[ bdsg*3 + adsg ]++;
+        }
      }
-    if ( !nab ) return -1;
+    if ( !nab ) return -1;  // no data in common for the two sites
  
+    double pa = a/an_tot;
+    double pb = b/bn_tot;
      double cor;
      if ( !ndiff ) cor = 1;
      else
      {
-        // Don't know how to deal with zero variance. Since this the purpose is filtering,
-        // it is not enough to say the value is undefined. Therefore an artificial noise is
-        // added to make the denominator non-zero.
-        if ( aa == a*a/na || bb == b*b/nb )
+        if ( aa == a*a/nab || bb == b*b/nab )     // zero variance, add small noise
          {
-            aa += 3*3;
-            bb += 3*3;
-            ab += 3*3;
-            a  += 3;
-            b  += 3;
-            na++;
-            nb++;
+            aa += 1e-4;
+            bb += 1e-4;
+            ab += 1e-4;
+            a  += 1e-2;
+            b  += 1e-2;
              nab++;
          }
-        cor = (ab/nab - a/na*b/nb) / sqrt(aa/na - a/na*a/na) / sqrt(bb/nb - b/nb*b/nb);
+        cor = (ab - a*b/nab) / sqrt(aa - a*a/nab) / sqrt(bb - b*b/nab);
      }
-    return cor*cor;
+
+    ld->val[VCFBUF_LD_IDX_R2] = cor * cor;
+
+    // Lewontin's normalization of D. Also we cap at 1 as the calculation
+    // can result in values bigger than 1 for high AFs.
+    ld->val[VCFBUF_LD_IDX_LD] = cor * sqrt(pa*(1-pa)*pb*(1-pb));
+    double norm;
+    if ( ld->val[VCFBUF_LD_IDX_LD] < 0 )
+        norm = -pa*pb > -(1-pa)*(1-pb) ? -pa*pb : -(1-pa)*(1-pb);
+    else
+        norm = pa*(1-pb) > (1-pa)*pb ? pa*(1-pb) : (1-pa)*pb;
+    if ( norm )
+        ld->val[VCFBUF_LD_IDX_LD] = fabs(norm) > fabs(ld->val[VCFBUF_LD_IDX_LD]) ? ld->val[VCFBUF_LD_IDX_LD]/norm : 1;
+    if ( !ld->val[VCFBUF_LD_IDX_LD] )
+        ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]);    // avoid "-0" on output
+
+    ld->val[VCFBUF_LD_IDX_HD] =
+        (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) 
+        - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
+    ld->val[VCFBUF_LD_IDX_HD] /= nab;
+    ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
+
+    return 0;
  }
  
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld)
  {
-    *ld = -1;
-    if ( !buf->rbuf.n ) return NULL;
+    int ret = -1;
+    if ( !buf->rbuf.n ) return ret;
  
-    int i = buf->rbuf.f;
+    int j, i = buf->rbuf.f;
  
      // Relying on vcfbuf being properly flushed - all sites in the buffer
      // must come from the same chromosome
-    if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;
+    if ( buf->vcf[i].rec->rid != rec->rid ) return ret;
+
+    vcfbuf_ld_t tmp;
+    for (j=0; j<VCFBUF_LD_N; j++)
+    {
+        ld->val[j] = -HUGE_VAL;
+        ld->rec[j] = NULL;
+    }
  
-    int imax = 0;
-    double max = 0;
      for (i=-1; rbuf_next(&buf->rbuf,&i); )
      {   
-        if ( buf->ld.skip_filter )
-        {
-            if ( buf->vcf[i].rec->d.n_flt > 1 ) continue;   // multiple filters are set
-            if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue;    // not PASS
-        }
-        double val = _calc_ld(buf, buf->vcf[i].rec, rec);
-        if ( buf->ld.max && buf->ld.max < val ) 
-        {
-            *ld = val;
-            return buf->vcf[i].rec;
-        }
-        if ( val > max )
+        if ( buf->vcf[i].filter ) continue;
+        if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue;   // missing genotypes
+
+        int done = 0;
+        for (j=0; j<VCFBUF_LD_N; j++)
          {
-            max  = val;
-            imax = i;
+            if ( ld->val[j] < tmp.val[j] )
+            {
+                ld->val[j] = tmp.val[j];
+                ld->rec[j] = buf->vcf[i].rec;
+            }
+            if ( buf->ld.max[j] < tmp.val[j] ) done = 1;
+            ret = 0;
          }
+        if ( done ) return ret;
      }
-    *ld = max;
-    return buf->vcf[imax].rec;
+    return ret;
  }
  
  
diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h

index 9ede5b5d7625dc9eb71cda99012c696262bdd3d2..d3be6c53c1ac3b8c56ba96f133ed858030f89c9f 100644 (file)
--- a/bcftools/vcfbuf.h
+++ b/bcftools/vcfbuf.h
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2017-2019 Genome Research Ltd.
+   Copyright (c) 2017-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -38,13 +38,18 @@ typedef struct _vcfbuf_t vcfbuf_t;
  // Modes of operation
  typedef enum
  {
-    VCFBUF_LD_MAX,          // vcfbuf_max_ld() stops at the first record that exceeds the threshold
-    VCFBUF_RAND_MISSING,    // randomize rather than ignore missing genotypes
-    VCFBUF_SKIP_FILTER,     // skip sites with FILTER diferent from "PASS" or "."
-    VCFBUF_NSITES,          // leave at max this many sites in the window
-    VCFBUF_AF_TAG,          // use this INFO tag with LD_NSITES
      VCFBUF_OVERLAP_WIN,     // keep only overlapping variants in the window
      VCFBUF_RMDUP,           // remove duplicate sites (completely)
+    VCFBUF_NSITES,          // leave at max this many sites in the window
+    VCFBUF_NSITES_MODE,     // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly)
+    VCFBUF_AF_TAG,          // use this INFO tag with VCFBUF_NSITES
+
+    // LD related options
+    LD_RAND_MISSING,        // randomize rather than ignore missing genotypes
+    LD_FILTER1,             // exclude the next record inserted by vcfbuf_push() from LD analysis
+    LD_MAX_R2,              // If set, vcfbuf_ld() will stop at the first record that exceeds the R2,
+    LD_MAX_LD,              //      LD, or HD threshold. When multiple are set, the OR logic is applied
+    LD_MAX_HD,              //      
  }
  vcfbuf_opt_t;
  
@@ -61,9 +66,8 @@ void vcfbuf_destroy(vcfbuf_t *buf);
  
  /*
   *  vcfbuf_push() - push a new site for analysis
- *  @swap:  if set, do not create a copy, but return a substitute
   */
-bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap);
+bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec);
  
  /*
   *  vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer
@@ -85,10 +89,28 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all);
  int vcfbuf_nsites(vcfbuf_t *buf);
  
  /*
- *  vcfbuf_max_ld() - return a record that has maximum D or first record exceeding the threshold
- *  @ld:        will be filled with the maximum D found
+ *  vcfbuf_ld() - find records with maximum LD values or the values in first record that exceeds thresholds
+ *                set by vcfbuf_set_opt(..,LD_MAX*,..)
+ *
+ *  Returns 0 on success or -1 if no values were filled.
+ *
+ *  @val:  will be filled with the values
+ *          .. correlation coefficient r-squared
+ *          .. Lewontin's D' (PMID: 19433632)
+ *          .. Ragsdale's \hat{D} (doi:10.1093/molbev/msz265)
+ *  @rec: corresponding positions or NULL if the value(s) has not been set
   */
-bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld);
+#define VCFBUF_LD_N 3
+#define VCFBUF_LD_IDX_R2 0
+#define VCFBUF_LD_IDX_LD 1
+#define VCFBUF_LD_IDX_HD 2
+typedef struct
+{
+    double val[VCFBUF_LD_N];    // r2, ld, hd
+    bcf1_t *rec[VCFBUF_LD_N];   // record with max r2, ld, hd
+}
+vcfbuf_ld_t;
+int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld);
  
  #endif
  
diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c

index f54654237334d4cc1c978bdceb01df1df3043b23..e2aab3f95cc9077851b4515f85408dd02500c4ac 100644 (file)
--- a/bcftools/vcfcall.c
+++ b/bcftools/vcfcall.c
@@ -1,6 +1,6 @@
  /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <unistd.h>
  #include <getopt.h>
@@ -189,6 +190,11 @@ static ploidy_predef_t ploidy_predefs[] =
        .ploidy =
            "*  * *     * 1\n"
      },
+    { .alias  = "2",
+      .about  = "Treat all samples as diploid",
+      .ploidy =
+          "*  * *     * 2\n"
+    },
      {
          .alias  = NULL,
          .about  = NULL,
@@ -536,7 +542,7 @@ bcf1_t *next_line(args_t *args)
              bcf_unpack(rec, BCF_UN_STR);
              if ( !rec0 ) rec0 = rec;
              recN = rec;
-            args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);
+            args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
              if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
          }
      }
@@ -611,7 +617,7 @@ static void init_data(args_t *args)
      // Open files for input and output, initialize structures
      if ( args->targets )
      {
-        args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
+        args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
          args->tgt_itr = regitr_init(args->tgt_idx);
          args->tgt_itr_tmp = regitr_init(args->tgt_idx);
      }
@@ -686,7 +692,7 @@ static void init_data(args_t *args)
      if ( args->aux.flag & CALL_CONSTR_ALLELES )
          args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
  
-    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
  
@@ -768,7 +774,20 @@ void parse_novel_rate(args_t *args, const char *str)
      else error("Could not parse --novel-rate %s\n", str);
  }
  
-static int parse_format_flag(const char *str)
+static void list_annotations(FILE *fp)
+{
+    fprintf(fp,
+        "\n"
+        "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
+        "  INFO/PV4   .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
+        "\n"
+        "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
+        "  FORMAT/GQ  .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
+        "  FORMAT/GP  .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
+        "\n");
+}
+
+static int parse_output_tags(const char *str)
  {
      int flag = 0;
      const char *ss = str;
@@ -776,8 +795,9 @@ static int parse_format_flag(const char *str)
      {
          const char *se = ss;
          while ( *se && *se!=',' ) se++;
-        if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
-        else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+        if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss)  ) flag |= CALL_FMT_GQ;
+        else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
+        else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
          else
          {
              fprintf(stderr,"Could not parse \"%s\"\n", str);
@@ -856,41 +876,46 @@ static void usage(args_t *args)
      fprintf(stderr, "Usage:   bcftools call [options] <in.vcf.gz>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "File format options:\n");
-    fprintf(stderr, "       --no-version                do not append version and command line to the header\n");
-    fprintf(stderr, "   -o, --output <file>             write output to a file [standard output]\n");
-    fprintf(stderr, "   -O, --output-type <b|u|z|v>     output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
-    fprintf(stderr, "       --ploidy <assembly>[?]      predefined ploidy, 'list' to print available settings, append '?' for details\n");
-    fprintf(stderr, "       --ploidy-file <file>        space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
-    fprintf(stderr, "   -r, --regions <region>          restrict to comma-separated list of regions\n");
-    fprintf(stderr, "   -R, --regions-file <file>       restrict to regions listed in a file\n");
-    fprintf(stderr, "   -s, --samples <list>            list of samples to include [all samples]\n");
-    fprintf(stderr, "   -S, --samples-file <file>       PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
-    fprintf(stderr, "   -t, --targets <region>          similar to -r but streams rather than index-jumps\n");
-    fprintf(stderr, "   -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n");
-    fprintf(stderr, "       --threads <int>             use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "       --no-version              Do not append version and command line to the header\n");
+    fprintf(stderr, "   -o, --output FILE             Write output to a file [standard output]\n");
+    fprintf(stderr, "   -O, --output-type b|u|z|v     Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+    fprintf(stderr, "       --ploidy ASSEMBLY[?]      Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+    fprintf(stderr, "       --ploidy-file FILE        Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+    fprintf(stderr, "   -r, --regions REGION          Restrict to comma-separated list of regions\n");
+    fprintf(stderr, "   -R, --regions-file FILE       Restrict to regions listed in a file\n");
+    fprintf(stderr, "   -s, --samples LIST            List of samples to include [all samples]\n");
+    fprintf(stderr, "   -S, --samples-file FILE       PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+    fprintf(stderr, "   -t, --targets REGION          Similar to -r but streams rather than index-jumps\n");
+    fprintf(stderr, "   -T, --targets-file FILE       Similar to -R but streams rather than index-jumps\n");
+    fprintf(stderr, "       --threads INT             Use multithreading with INT worker threads [0]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Input/output options:\n");
-    fprintf(stderr, "   -A, --keep-alts                 keep all possible alternate alleles at variant sites\n");
-    fprintf(stderr, "   -f, --format-fields <list>      output format fields: GQ,GP (lowercase allowed) []\n");
-    fprintf(stderr, "   -F, --prior-freqs <AN,AC>       use prior allele frequencies\n");
-    fprintf(stderr, "   -G, --group-samples <file|->    group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n");
-    fprintf(stderr, "   -g, --gvcf <int>,[...]          group non-variant sites into gVCF blocks by minimum per-sample DP\n");
-    fprintf(stderr, "   -i, --insert-missed             output also sites missed by mpileup but present in -T\n");
-    fprintf(stderr, "   -M, --keep-masked-ref           keep sites with masked reference allele (REF=N)\n");
-    fprintf(stderr, "   -V, --skip-variants <type>      skip indels/snps\n");
-    fprintf(stderr, "   -v, --variants-only             output variant sites only\n");
+    fprintf(stderr, "   -A, --keep-alts               Keep all possible alternate alleles at variant sites\n");
+    fprintf(stderr, "   -a, --annotate LIST           Optional tags to output (lowercase allowed); '?' to list available tags\n");
+//todo?    
+//    fprintf(stderr, "   -a, --annots LIST             Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
+//    fprintf(stderr, "                                 tag removal [^I16,^QS,^FMT/QS]\n");
+    fprintf(stderr, "   -F, --prior-freqs AN,AC       Use prior allele frequencies, determined from these pre-filled tags\n");
+    fprintf(stderr, "   -G, --group-samples FILE|-    Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+    fprintf(stderr, "                                 This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); 
+    fprintf(stderr, "       --group-samples-tag TAG   The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+    fprintf(stderr, "   -g, --gvcf INT,[...]          Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+    fprintf(stderr, "   -i, --insert-missed           Output also sites missed by mpileup but present in -T\n");
+    fprintf(stderr, "   -M, --keep-masked-ref         Keep sites with masked reference allele (REF=N)\n");
+    fprintf(stderr, "   -V, --skip-variants TYPE      Skip indels/snps\n");
+    fprintf(stderr, "   -v, --variants-only           Output variant sites only\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Consensus/variant calling options:\n");
-    fprintf(stderr, "   -c, --consensus-caller          the original calling method (conflicts with -m)\n");
-    fprintf(stderr, "   -C, --constrain <str>           one of: alleles, trio (see manual)\n");
-    fprintf(stderr, "   -m, --multiallelic-caller       alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
-    fprintf(stderr, "   -n, --novel-rate <float>,[...]  likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
-    fprintf(stderr, "   -p, --pval-threshold <float>    variant if P(ref|D)<FLOAT with -c [0.5]\n");
-    fprintf(stderr, "   -P, --prior <float>             mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+    fprintf(stderr, "   -c, --consensus-caller        The original calling method (conflicts with -m)\n");
+    fprintf(stderr, "   -C, --constrain STR           One of: alleles, trio (see manual)\n");
+    fprintf(stderr, "   -m, --multiallelic-caller     Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+    fprintf(stderr, "   -n, --novel-rate FLOAT,[...]  Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+    fprintf(stderr, "   -p, --pval-threshold FLOAT    Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+    fprintf(stderr, "   -P, --prior FLOAT             Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Example:\n");
      fprintf(stderr, "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
-    fprintf(stderr, "   bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
+    fprintf(stderr, "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
  
      // todo (and more)
      // fprintf(stderr, "\nContrast calling and association test options:\n");
@@ -927,9 +952,11 @@ int main_vcfcall(int argc, char *argv[])
      {
          {"help",no_argument,NULL,'h'},
          {"format-fields",required_argument,NULL,'f'},
+        {"annotate",required_argument,NULL,'a'},
          {"prior-freqs",required_argument,NULL,'F'},
          {"gvcf",required_argument,NULL,'g'},
          {"group-samples",required_argument,NULL,'G'},
+        {"group-samples-tag",required_argument,NULL,3},
          {"output",required_argument,NULL,'o'},
          {"output-type",required_argument,NULL,'O'},
          {"regions",required_argument,NULL,'r'},
@@ -960,7 +987,7 @@ int main_vcfcall(int argc, char *argv[])
      };
  
      char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
      {
          switch (c)
          {
@@ -969,7 +996,12 @@ int main_vcfcall(int argc, char *argv[])
              case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
              case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
              case 'G': args.aux.sample_groups = optarg; break;
-            case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+            case  3 : args.aux.sample_groups_tag = optarg; break;
+            case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
+            case 'a':
+                      if (optarg[0]=='?') { list_annotations(stderr); return 1; }
+                      args.aux.output_tags |= parse_output_tags(optarg);
+                      break;
              case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
              case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
              case 'A': args.aux.flag |= CALL_KEEPALT; break;
diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c

index 8caf5100b21aabdd566952952445c77b61b1ec84..b5bedb9c8e7da52c8209d32c1ea844d20036cf81 100644 (file)
--- a/bcftools/vcfcall.c.pysam.c
+++ b/bcftools/vcfcall.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfcall.c -- SNP/indel variant calling from VCF/BCF.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <unistd.h>
  #include <getopt.h>
@@ -191,6 +192,11 @@ static ploidy_predef_t ploidy_predefs[] =
        .ploidy =
            "*  * *     * 1\n"
      },
+    { .alias  = "2",
+      .about  = "Treat all samples as diploid",
+      .ploidy =
+          "*  * *     * 2\n"
+    },
      {
          .alias  = NULL,
          .about  = NULL,
@@ -538,7 +544,7 @@ bcf1_t *next_line(args_t *args)
              bcf_unpack(rec, BCF_UN_STR);
              if ( !rec0 ) rec0 = rec;
              recN = rec;
-            args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);
+            args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec);
              if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break;
          }
      }
@@ -613,7 +619,7 @@ static void init_data(args_t *args)
      // Open files for input and output, initialize structures
      if ( args->targets )
      {
-        args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
+        args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : (regidx_free_f) NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
          args->tgt_itr = regitr_init(args->tgt_idx);
          args->tgt_itr_tmp = regitr_init(args->tgt_idx);
      }
@@ -688,7 +694,7 @@ static void init_data(args_t *args)
      if ( args->aux.flag & CALL_CONSTR_ALLELES )
          args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
  
-    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
  
@@ -770,7 +776,20 @@ void parse_novel_rate(args_t *args, const char *str)
      else error("Could not parse --novel-rate %s\n", str);
  }
  
-static int parse_format_flag(const char *str)
+static void list_annotations(FILE *fp)
+{
+    fprintf(fp,
+        "\n"
+        "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n"
+        "  INFO/PV4   .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n"
+        "\n"
+        "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n"
+        "  FORMAT/GQ  .. Phred-scaled genotype quality (Number=1,Type=Integer)\n"
+        "  FORMAT/GP  .. Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n"
+        "\n");
+}
+
+static int parse_output_tags(const char *str)
  {
      int flag = 0;
      const char *ss = str;
@@ -778,12 +797,13 @@ static int parse_format_flag(const char *str)
      {
          const char *se = ss;
          while ( *se && *se!=',' ) se++;
-        if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ;
-        else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP;
+        if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss)  ) flag |= CALL_FMT_GQ;
+        else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP;
+        else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4;
          else
          {
              fprintf(bcftools_stderr,"Could not parse \"%s\"\n", str);
-            exit(1);
+            bcftools_exit(1);
          }
          if ( !*se ) break;
          ss = se + 1;
@@ -837,12 +857,12 @@ ploidy_t *init_ploidy(char *alias)
          fprintf(bcftools_stderr,"Run as --ploidy <alias> (e.g. --ploidy GRCh37).\n");
          fprintf(bcftools_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n");
          fprintf(bcftools_stderr,"\n");
-        exit(-1);
+        bcftools_exit(-1);
      }
      else if ( detailed )
      {
          fprintf(bcftools_stderr,"%s", pld->ploidy);
-        exit(-1);
+        bcftools_exit(-1);
      }
      return ploidy_init_string(pld->ploidy,2);
  }
@@ -858,41 +878,46 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "Usage:   bcftools call [options] <in.vcf.gz>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "File format options:\n");
-    fprintf(bcftools_stderr, "       --no-version                do not append version and command line to the header\n");
-    fprintf(bcftools_stderr, "   -o, --output <file>             write output to a file [standard output]\n");
-    fprintf(bcftools_stderr, "   -O, --output-type <b|u|z|v>     output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
-    fprintf(bcftools_stderr, "       --ploidy <assembly>[?]      predefined ploidy, 'list' to print available settings, append '?' for details\n");
-    fprintf(bcftools_stderr, "       --ploidy-file <file>        space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
-    fprintf(bcftools_stderr, "   -r, --regions <region>          restrict to comma-separated list of regions\n");
-    fprintf(bcftools_stderr, "   -R, --regions-file <file>       restrict to regions listed in a file\n");
-    fprintf(bcftools_stderr, "   -s, --samples <list>            list of samples to include [all samples]\n");
-    fprintf(bcftools_stderr, "   -S, --samples-file <file>       PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
-    fprintf(bcftools_stderr, "   -t, --targets <region>          similar to -r but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "   -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "       --threads <int>             use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "       --no-version              Do not append version and command line to the header\n");
+    fprintf(bcftools_stderr, "   -o, --output FILE             Write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "   -O, --output-type b|u|z|v     Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+    fprintf(bcftools_stderr, "       --ploidy ASSEMBLY[?]      Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+    fprintf(bcftools_stderr, "       --ploidy-file FILE        Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+    fprintf(bcftools_stderr, "   -r, --regions REGION          Restrict to comma-separated list of regions\n");
+    fprintf(bcftools_stderr, "   -R, --regions-file FILE       Restrict to regions listed in a file\n");
+    fprintf(bcftools_stderr, "   -s, --samples LIST            List of samples to include [all samples]\n");
+    fprintf(bcftools_stderr, "   -S, --samples-file FILE       PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+    fprintf(bcftools_stderr, "   -t, --targets REGION          Similar to -r but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "   -T, --targets-file FILE       Similar to -R but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "       --threads INT             Use multithreading with INT worker threads [0]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Input/output options:\n");
-    fprintf(bcftools_stderr, "   -A, --keep-alts                 keep all possible alternate alleles at variant sites\n");
-    fprintf(bcftools_stderr, "   -f, --format-fields <list>      output format fields: GQ,GP (lowercase allowed) []\n");
-    fprintf(bcftools_stderr, "   -F, --prior-freqs <AN,AC>       use prior allele frequencies\n");
-    fprintf(bcftools_stderr, "   -G, --group-samples <file|->    group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n");
-    fprintf(bcftools_stderr, "   -g, --gvcf <int>,[...]          group non-variant sites into gVCF blocks by minimum per-sample DP\n");
-    fprintf(bcftools_stderr, "   -i, --insert-missed             output also sites missed by mpileup but present in -T\n");
-    fprintf(bcftools_stderr, "   -M, --keep-masked-ref           keep sites with masked reference allele (REF=N)\n");
-    fprintf(bcftools_stderr, "   -V, --skip-variants <type>      skip indels/snps\n");
-    fprintf(bcftools_stderr, "   -v, --variants-only             output variant sites only\n");
+    fprintf(bcftools_stderr, "   -A, --keep-alts               Keep all possible alternate alleles at variant sites\n");
+    fprintf(bcftools_stderr, "   -a, --annotate LIST           Optional tags to output (lowercase allowed); '?' to list available tags\n");
+//todo?    
+//    fprintf(bcftools_stderr, "   -a, --annots LIST             Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
+//    fprintf(bcftools_stderr, "                                 tag removal [^I16,^QS,^FMT/QS]\n");
+    fprintf(bcftools_stderr, "   -F, --prior-freqs AN,AC       Use prior allele frequencies, determined from these pre-filled tags\n");
+    fprintf(bcftools_stderr, "   -G, --group-samples FILE|-    Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+    fprintf(bcftools_stderr, "                                 This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); 
+    fprintf(bcftools_stderr, "       --group-samples-tag TAG   The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+    fprintf(bcftools_stderr, "   -g, --gvcf INT,[...]          Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+    fprintf(bcftools_stderr, "   -i, --insert-missed           Output also sites missed by mpileup but present in -T\n");
+    fprintf(bcftools_stderr, "   -M, --keep-masked-ref         Keep sites with masked reference allele (REF=N)\n");
+    fprintf(bcftools_stderr, "   -V, --skip-variants TYPE      Skip indels/snps\n");
+    fprintf(bcftools_stderr, "   -v, --variants-only           Output variant sites only\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
-    fprintf(bcftools_stderr, "   -c, --consensus-caller          the original calling method (conflicts with -m)\n");
-    fprintf(bcftools_stderr, "   -C, --constrain <str>           one of: alleles, trio (see manual)\n");
-    fprintf(bcftools_stderr, "   -m, --multiallelic-caller       alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
-    fprintf(bcftools_stderr, "   -n, --novel-rate <float>,[...]  likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
-    fprintf(bcftools_stderr, "   -p, --pval-threshold <float>    variant if P(ref|D)<FLOAT with -c [0.5]\n");
-    fprintf(bcftools_stderr, "   -P, --prior <float>             mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+    fprintf(bcftools_stderr, "   -c, --consensus-caller        The original calling method (conflicts with -m)\n");
+    fprintf(bcftools_stderr, "   -C, --constrain STR           One of: alleles, trio (see manual)\n");
+    fprintf(bcftools_stderr, "   -m, --multiallelic-caller     Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+    fprintf(bcftools_stderr, "   -n, --novel-rate FLOAT,[...]  Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+    fprintf(bcftools_stderr, "   -p, --pval-threshold FLOAT    Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+    fprintf(bcftools_stderr, "   -P, --prior FLOAT             Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Example:\n");
      fprintf(bcftools_stderr, "   # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
-    fprintf(bcftools_stderr, "   bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
+    fprintf(bcftools_stderr, "   bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n");
  
      // todo (and more)
      // fprintf(bcftools_stderr, "\nContrast calling and association test options:\n");
@@ -901,7 +926,7 @@ static void usage(args_t *args)
      // fprintf(bcftools_stderr, "       -U INT    number of permutations for association testing (effective with -1) [0]\n");
      // fprintf(bcftools_stderr, "       -X FLOAT  only perform permutations for P(chi^2)<FLOAT [%g]\n", args->aux.min_perm_p);
      fprintf(bcftools_stderr, "\n");
-    exit(-1);
+    bcftools_exit(-1);
  }
  
  int main_vcfcall(int argc, char *argv[])
@@ -929,9 +954,11 @@ int main_vcfcall(int argc, char *argv[])
      {
          {"help",no_argument,NULL,'h'},
          {"format-fields",required_argument,NULL,'f'},
+        {"annotate",required_argument,NULL,'a'},
          {"prior-freqs",required_argument,NULL,'F'},
          {"gvcf",required_argument,NULL,'g'},
          {"group-samples",required_argument,NULL,'G'},
+        {"group-samples-tag",required_argument,NULL,3},
          {"output",required_argument,NULL,'o'},
          {"output-type",required_argument,NULL,'O'},
          {"regions",required_argument,NULL,'r'},
@@ -962,7 +989,7 @@ int main_vcfcall(int argc, char *argv[])
      };
  
      char *tmp = NULL;
-    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
      {
          switch (c)
          {
@@ -971,7 +998,12 @@ int main_vcfcall(int argc, char *argv[])
              case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
              case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
              case 'G': args.aux.sample_groups = optarg; break;
-            case 'f': args.aux.output_tags |= parse_format_flag(optarg); break;
+            case  3 : args.aux.sample_groups_tag = optarg; break;
+            case 'f': fprintf(bcftools_stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
+            case 'a':
+                      if (optarg[0]=='?') { list_annotations(bcftools_stderr); return 1; }
+                      args.aux.output_tags |= parse_output_tags(optarg);
+                      break;
              case 'M': args.flag &= ~CF_ACGT_ONLY; break;     // keep sites where REF is N
              case 'N': args.flag |= CF_ACGT_ONLY; break;      // omit sites where first base in REF is N (the new default)
              case 'A': args.aux.flag |= CALL_KEEPALT; break;
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c

index 2d8a94c8f140940d7c0eeaebd81026aa562cd2ed..02d610d9fb95fd65705e170b8da7f088d3518401 100644 (file)
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -32,6 +32,7 @@
  
  #include <stdio.h>
  #include <unistd.h>
+#include <assert.h>
  #include <getopt.h>
  #include <math.h>
  #include <inttypes.h>
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c

index 21b9e9d7971a0bf01a300b37306f142ce378d508..d74486d6ef481e65a5d88827abe671ebabbe5e47 100644 (file)
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -34,6 +34,7 @@
  
  #include <stdio.h>
  #include <unistd.h>
+#include <assert.h>
  #include <getopt.h>
  #include <math.h>
  #include <inttypes.h>
@@ -1236,7 +1237,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -P, --same-prob <float>            prior probability of -s/-c being the same [0.5]\n");
      fprintf(bcftools_stderr, "    -x, --xy-prob <float>              P(x|y) transition probability [1e-9]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfcnv(int argc, char *argv[])
diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c

index dce17f9859385685c8cba2b3974501dc763f59f8..0781a6067dafd67698b45a28731485ae9e157976 100644 (file)
--- a/bcftools/vcfconcat.c
+++ b/bcftools/vcfconcat.c
@@ -1,6 +1,6 @@
  /*  vcfconcat.c -- Concatenate or combine VCF/BCF files.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -26,6 +26,7 @@ THE SOFTWARE.  */
  #include <unistd.h>
  #include <getopt.h>
  #include <string.h>
+#include <assert.h>
  #include <errno.h>
  #include <math.h>
  #include <inttypes.h>
@@ -115,7 +116,7 @@ static void init_data(args_t *args)
          bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
      }
      if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
-    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->allow_overlaps || args->phased_concat )
      {
@@ -154,6 +155,7 @@ static void init_data(args_t *args)
              else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
              else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
              else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+            else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE;
              else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
          }
          for (i=0; i<args->nfnames; i++)
@@ -233,6 +235,7 @@ static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
          if ( !args->swap_phase[i] ) continue;
          int *gt = &args->GTa[i*2];
          if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+        if ( !bcf_gt_is_phased(gt[1]) ) continue;
          SWAP(int, gt[0], gt[1]);
          gt[1] |= 1;
      }
@@ -845,8 +848,8 @@ static void usage(args_t *args)
      fprintf(stderr, "Options:\n");
      fprintf(stderr, "   -a, --allow-overlaps           First coordinate of the next file can precede last record of the current file.\n");
      fprintf(stderr, "   -c, --compact-PS               Do not output PS tag at each site, only at the start of a new phase set block.\n");
-    fprintf(stderr, "   -d, --rm-dups <string>         Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
-    fprintf(stderr, "   -D, --remove-duplicates        Alias for -d none\n");
+    fprintf(stderr, "   -d, --rm-dups <string>         Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+    fprintf(stderr, "   -D, --remove-duplicates        Alias for -d exact\n");
      fprintf(stderr, "   -f, --file-list <file>         Read the list of files from a file.\n");
      fprintf(stderr, "   -l, --ligate                   Ligate phased VCFs by matching phase at overlapping haplotypes\n");
      fprintf(stderr, "       --no-version               Do not append version and command line to the header\n");
@@ -903,7 +906,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 'd': args->remove_dups = optarg; break;
-            case 'D': args->remove_dups = "none"; break;
+            case 'D': args->remove_dups = "exact"; break;
              case 'q': 
                  args->min_PQ = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c

index 0004a553ddea30326b9249c8ef5c9e32f6379859..0cd061ee2c1d51991b67c02e8d5bf2106f6918cc 100644 (file)
--- a/bcftools/vcfconcat.c.pysam.c
+++ b/bcftools/vcfconcat.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfconcat.c -- Concatenate or combine VCF/BCF files.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -28,6 +28,7 @@ THE SOFTWARE.  */
  #include <unistd.h>
  #include <getopt.h>
  #include <string.h>
+#include <assert.h>
  #include <errno.h>
  #include <math.h>
  #include <inttypes.h>
@@ -117,7 +118,7 @@ static void init_data(args_t *args)
          bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
      }
      if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
-    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->allow_overlaps || args->phased_concat )
      {
@@ -156,6 +157,7 @@ static void init_data(args_t *args)
              else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
              else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
              else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
+            else if ( !strcmp(args->remove_dups,"exact") ) args->files->collapse = COLLAPSE_NONE;
              else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
          }
          for (i=0; i<args->nfnames; i++)
@@ -235,6 +237,7 @@ static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
          if ( !args->swap_phase[i] ) continue;
          int *gt = &args->GTa[i*2];
          if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
+        if ( !bcf_gt_is_phased(gt[1]) ) continue;
          SWAP(int, gt[0], gt[1]);
          gt[1] |= 1;
      }
@@ -847,8 +850,8 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "Options:\n");
      fprintf(bcftools_stderr, "   -a, --allow-overlaps           First coordinate of the next file can precede last record of the current file.\n");
      fprintf(bcftools_stderr, "   -c, --compact-PS               Do not output PS tag at each site, only at the start of a new phase set block.\n");
-    fprintf(bcftools_stderr, "   -d, --rm-dups <string>         Output duplicate records present in multiple files only once: <snps|indels|both|all|none>\n");
-    fprintf(bcftools_stderr, "   -D, --remove-duplicates        Alias for -d none\n");
+    fprintf(bcftools_stderr, "   -d, --rm-dups <string>         Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+    fprintf(bcftools_stderr, "   -D, --remove-duplicates        Alias for -d exact\n");
      fprintf(bcftools_stderr, "   -f, --file-list <file>         Read the list of files from a file.\n");
      fprintf(bcftools_stderr, "   -l, --ligate                   Ligate phased VCFs by matching phase at overlapping haplotypes\n");
      fprintf(bcftools_stderr, "       --no-version               Do not append version and command line to the header\n");
@@ -862,7 +865,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "       --threads <int>            Use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "   -v, --verbose <0|1>            Set verbosity level [1]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfconcat(int argc, char *argv[])
@@ -905,7 +908,7 @@ int main_vcfconcat(int argc, char *argv[])
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 'd': args->remove_dups = optarg; break;
-            case 'D': args->remove_dups = "none"; break;
+            case 'D': args->remove_dups = "exact"; break;
              case 'q': 
                  args->min_PQ = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c

index 445a894672320f45436d9b47efdc5b16842f9c99..a48e85cfd530f5ad727055477d1c54c21db62146 100644 (file)
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -1,6 +1,6 @@
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -68,7 +68,7 @@ struct _args_t
      int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
      char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
      char *outfname, *infname, *ref_fname, *sex_fname;
-    int argc, n_threads, record_cmd_line;
+    int argc, n_threads, record_cmd_line, keep_duplicates;
  };
  
  static void destroy_data(args_t *args)
@@ -153,6 +153,15 @@ static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
      if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
      rec->pos--;
  
+    // ID
+    if ( args->output_vcf_ids )
+    {
+        char tmp = *tsv->se;
+        *tsv->se = 0;
+        bcf_update_id(args->header, rec, tsv->ss);
+        *tsv->se = tmp;
+    }
+
      // REF,ALT
      args->str.l = 0;
      se = ++ss;
@@ -385,7 +394,7 @@ static void gensample_to_vcf(args_t *args)
      for (i=0; i<nsamples; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
@@ -513,7 +522,7 @@ static void haplegendsample_to_vcf(args_t *args)
      for (i=0; i<nrows; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
@@ -627,7 +636,7 @@ static void hapsample_to_vcf(args_t *args)
      for (i=0; i<nsamples; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
@@ -802,7 +811,7 @@ static void vcf_to_gensample(args_t *args)
          }
  
          // skip duplicate lines, or otherwise shapeit complains
-        if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+        if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
          prev_rid = line->rid;
          prev_pos = line->pos;
  
@@ -977,7 +986,7 @@ static void vcf_to_hapsample(args_t *args)
      if ( args->output_vcf_ids )
          kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
      else
-        kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+        kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
      
      if ( args->hap2dip )
          kputs("%_GT_TO_HAP2\n", &str);
@@ -994,7 +1003,7 @@ static void vcf_to_hapsample(args_t *args)
      if ( n_files==1 )
      {
          int l = str.l;
-        kputs(".sample",&str);
+        kputs(".samples",&str);
          sample_fname = strdup(str.s);
          str.l = l;
          kputs(".hap.gz",&str);
@@ -1215,7 +1224,7 @@ static void tsv_to_vcf(args_t *args)
      bcf_hdr_add_sample(args->header, NULL);
      args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
@@ -1267,7 +1276,7 @@ static void tsv_to_vcf(args_t *args)
  static void vcf_to_vcf(args_t *args)
  {
      open_vcf(args,NULL);
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
  
@@ -1296,7 +1305,7 @@ static void gvcf_to_vcf(args_t *args)
      if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
  
      open_vcf(args,NULL);
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
  
@@ -1395,6 +1404,7 @@ static void usage(void)
      fprintf(stderr, "   -g, --gensample <...>       <prefix>|<gen-file>,<sample-file>\n");
      fprintf(stderr, "       --tag <string>          tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
      fprintf(stderr, "       --chrom                 output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+    fprintf(stderr, "       --keep-duplicates       keep duplicate positions\n");
      fprintf(stderr, "       --sex <file>            output sex column in the sample-file, input format is: Sample\\t[MF]\n");
      fprintf(stderr, "       --vcf-ids               output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
      fprintf(stderr, "\n");
@@ -1473,12 +1483,17 @@ int main_vcfconvert(int argc, char *argv[])
          {"columns",required_argument,NULL,'c'},
          {"fasta-ref",required_argument,NULL,'f'},
          {"no-version",no_argument,NULL,10},
+        {"keep-duplicates",no_argument,NULL,12},
          {NULL,0,NULL,0}
      };
      while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
          switch (c) {
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
@@ -1512,6 +1527,7 @@ int main_vcfconvert(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case 10 : args->record_cmd_line = 0; break;
              case 11 : args->sex_fname = optarg; break;
+            case 12 : args->keep_duplicates = 1; break;
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c

index abdfbec6584e560957611088be8ca573e4811223..358e4043db5ca53d0382ada8fb76707499170bc7 100644 (file)
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -70,7 +70,7 @@ struct _args_t
      int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
      char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
      char *outfname, *infname, *ref_fname, *sex_fname;
-    int argc, n_threads, record_cmd_line;
+    int argc, n_threads, record_cmd_line, keep_duplicates;
  };
  
  static void destroy_data(args_t *args)
@@ -155,6 +155,15 @@ static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
      if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss);
      rec->pos--;
  
+    // ID
+    if ( args->output_vcf_ids )
+    {
+        char tmp = *tsv->se;
+        *tsv->se = 0;
+        bcf_update_id(args->header, rec, tsv->ss);
+        *tsv->se = tmp;
+    }
+
      // REF,ALT
      args->str.l = 0;
      se = ++ss;
@@ -387,7 +396,7 @@ static void gensample_to_vcf(args_t *args)
      for (i=0; i<nsamples; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
@@ -515,7 +524,7 @@ static void haplegendsample_to_vcf(args_t *args)
      for (i=0; i<nrows; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
@@ -629,7 +638,7 @@ static void hapsample_to_vcf(args_t *args)
      for (i=0; i<nsamples; i++) free(samples[i]);
      free(samples);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
@@ -804,7 +813,7 @@ static void vcf_to_gensample(args_t *args)
          }
  
          // skip duplicate lines, or otherwise shapeit complains
-        if ( prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
+        if ( !args->keep_duplicates && prev_rid==line->rid && prev_pos==line->pos ) { ndup++; continue; }
          prev_rid = line->rid;
          prev_pos = line->pos;
  
@@ -979,7 +988,7 @@ static void vcf_to_hapsample(args_t *args)
      if ( args->output_vcf_ids )
          kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str);
      else
-        kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
+        kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
      
      if ( args->hap2dip )
          kputs("%_GT_TO_HAP2\n", &str);
@@ -996,7 +1005,7 @@ static void vcf_to_hapsample(args_t *args)
      if ( n_files==1 )
      {
          int l = str.l;
-        kputs(".sample",&str);
+        kputs(".samples",&str);
          sample_fname = strdup(str.s);
          str.l = l;
          kputs(".hap.gz",&str);
@@ -1217,7 +1226,7 @@ static void tsv_to_vcf(args_t *args)
      bcf_hdr_add_sample(args->header, NULL);
      args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
  
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
      if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
@@ -1269,7 +1278,7 @@ static void tsv_to_vcf(args_t *args)
  static void vcf_to_vcf(args_t *args)
  {
      open_vcf(args,NULL);
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
  
@@ -1298,7 +1307,7 @@ static void gvcf_to_vcf(args_t *args)
      if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
  
      open_vcf(args,NULL);
-    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type));
+    htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
      if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
  
@@ -1397,6 +1406,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "   -g, --gensample <...>       <prefix>|<gen-file>,<sample-file>\n");
      fprintf(bcftools_stderr, "       --tag <string>          tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
      fprintf(bcftools_stderr, "       --chrom                 output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+    fprintf(bcftools_stderr, "       --keep-duplicates       keep duplicate positions\n");
      fprintf(bcftools_stderr, "       --sex <file>            output sex column in the sample-file, input format is: Sample\\t[MF]\n");
      fprintf(bcftools_stderr, "       --vcf-ids               output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
      fprintf(bcftools_stderr, "\n");
@@ -1433,7 +1443,7 @@ static void usage(void)
      // fprintf(bcftools_stderr, "PBWT options:\n");
      // fprintf(bcftools_stderr, "   -b, --pbwt          <prefix> or <pbwt>,<sites>,<sample>,<missing>\n");
      // fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfconvert(int argc, char *argv[])
@@ -1475,12 +1485,17 @@ int main_vcfconvert(int argc, char *argv[])
          {"columns",required_argument,NULL,'c'},
          {"fasta-ref",required_argument,NULL,'f'},
          {"no-version",no_argument,NULL,10},
+        {"keep-duplicates",no_argument,NULL,12},
          {NULL,0,NULL,0}
      };
      while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
          switch (c) {
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
@@ -1514,6 +1529,7 @@ int main_vcfconvert(int argc, char *argv[])
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case 10 : args->record_cmd_line = 0; break;
              case 11 : args->sex_fname = optarg; break;
+            case 12 : args->keep_duplicates = 1; break;
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
          }
diff --git a/bcftools/vcffilter.c b/bcftools/vcffilter.c

index 257ee3fb03e01d63943fa8a01d95cddc6185d26e..723bcdf145693699ebad9fd9088780c840e5cb74 100644 (file)
--- a/bcftools/vcffilter.c
+++ b/bcftools/vcffilter.c
@@ -1,6 +1,6 @@
  /*  vcffilter.c -- Apply fixed-threshold filters.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,8 +25,10 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <strings.h>
  #include <errno.h>
  #include <sys/stat.h>
  #include <sys/types.h>
@@ -60,7 +62,8 @@ typedef struct _args_t
      char *soft_filter;  // drop failed sites or annotate FILTER column?
      int annot_mode;     // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
      int flt_fail, flt_pass;     // BCF ids of fail and pass filters
-    int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+    int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id;
+    char *snp_gap_str;
      int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
      rbuf_t rbuf;
      bcf1_t **rbuf_lines;
@@ -77,7 +80,7 @@ args_t;
  
  static void init_data(args_t *args)
  {
-    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
  
@@ -138,7 +141,7 @@ static void init_data(args_t *args)
          args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
          if ( args->snp_gap )
          {
-            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of %s\">", args->snp_gap,args->snp_gap_str);
              args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
              assert( args->SnpGap_id>=0 );
          }
@@ -217,9 +220,9 @@ static void buffered_filters(args_t *args, bcf1_t *line)
       */
  
      // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
-    const int SnpGap_set     = VCF_OTHER<<1;
-    const int IndelGap_set   = VCF_OTHER<<2;
-    const int IndelGap_flush = VCF_OTHER<<3;
+    const int SnpGap_set     = 1 << (8*sizeof(int)/2);
+    const int IndelGap_set   = 1 << (8*sizeof(int)/2-1);
+    const int IndelGap_flush = 1 << (8*sizeof(int)/2-2);
  
      int var_type = 0, i;
      if ( line )
@@ -245,15 +248,8 @@ static void buffered_filters(args_t *args, bcf1_t *line)
          // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
          // used. This filter is therefore more strict and may remove some valid
          // SNPs.
-        int len = 1;
-        if ( var_type & VCF_INDEL )
-        {
-            for (i=1; i<line->n_allele; i++)
-                if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
-        }
-
          // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
-        line->d.var[0].n = len;
+        line->d.var[0].n = line->rlen;
      }
  
      int k_flush = 1;
@@ -328,13 +324,13 @@ static void buffered_filters(args_t *args, bcf1_t *line)
              int rec_to  = rec->pos + rec->d.var[0].n - 1;   // last position affected by the variant
              if ( rec_to + args->snp_gap < last_from )
                  j_flush++;
-            else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+            else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
              {
                  // this SNP has not been SnpGap-filtered yet
                  rec->d.var_type |= SnpGap_set;
                  bcf_add_filter(args->hdr, rec, args->SnpGap_id);
              }
-            else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+            else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) )
              {
                  // the line which we are adding is a SNP and needs to be filtered
                  line->d.var_type |= SnpGap_set;
@@ -413,7 +409,7 @@ static void usage(args_t *args)
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
      fprintf(stderr, "    -e, --exclude <expr>          exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "    -g, --SnpGap <int>            filter SNPs within <int> base pairs of an indel\n");
+    fprintf(stderr, "    -g, --SnpGap <int>[:type]     filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
      fprintf(stderr, "    -G, --IndelGap <int>          filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
      fprintf(stderr, "    -i, --include <expr>          include only sites for which the expression is true (see man page for details\n");
      fprintf(stderr, "    -m, --mode [+x]               \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
@@ -465,9 +461,31 @@ int main_vcffilter(int argc, char *argv[])
      char *tmp;
      while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
          switch (c) {
-            case 'g': 
+            case 'g':
                  args->snp_gap = strtol(optarg,&tmp,10); 
-                if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+                if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
+                if ( *tmp==':' )
+                {
+                    args->snp_gap_str = tmp+1;
+                    int i,n;
+                    char **keys = hts_readlist(tmp+1,0,&n);
+                    for(i=0; i<n; i++)
+                    {
+                        if ( !strcasecmp(keys[i],"indel") ) args->snp_gap_type |= VCF_INDEL;
+                        else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP;
+                        else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND;
+                        else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER;
+                        else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP;
+                        else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg);
+                        free(keys[i]);
+                    }
+                    if ( n ) free(keys);
+                }
+                else
+                {
+                    args->snp_gap_type = VCF_INDEL;
+                    args->snp_gap_str = "indel";
+                }
                  break;
              case 'G':
                  args->indel_gap = strtol(optarg,&tmp,10);
@@ -492,8 +510,12 @@ int main_vcffilter(int argc, char *argv[])
              case 'T': args->targets_list = optarg; targets_is_file = 1; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'S':
                  if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
                  else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c

index 908c3b4032fcd5f3227dcfa44facc9f15ecaf844..57091826b0b7b50df0f74b28b06daf0e70bc51a2 100644 (file)
--- a/bcftools/vcffilter.c.pysam.c
+++ b/bcftools/vcffilter.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcffilter.c -- Apply fixed-threshold filters.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,8 +27,10 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <strings.h>
  #include <errno.h>
  #include <sys/stat.h>
  #include <sys/types.h>
@@ -62,7 +64,8 @@ typedef struct _args_t
      char *soft_filter;  // drop failed sites or annotate FILTER column?
      int annot_mode;     // add to existing FILTER annotation or replace? Otherwise reset FILTER to PASS or leave as it is?
      int flt_fail, flt_pass;     // BCF ids of fail and pass filters
-    int snp_gap, indel_gap, IndelGap_id, SnpGap_id;
+    int snp_gap, snp_gap_type, indel_gap, IndelGap_id, SnpGap_id;
+    char *snp_gap_str;
      int32_t ntmpi, *tmpi, ntmp_ac, *tmp_ac;
      rbuf_t rbuf;
      bcf1_t **rbuf_lines;
@@ -79,7 +82,7 @@ args_t;
  
  static void init_data(args_t *args)
  {
-    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
  
@@ -140,7 +143,7 @@ static void init_data(args_t *args)
          args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
          if ( args->snp_gap )
          {
-            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
+            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of %s\">", args->snp_gap,args->snp_gap_str);
              args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
              assert( args->SnpGap_id>=0 );
          }
@@ -219,9 +222,9 @@ static void buffered_filters(args_t *args, bcf1_t *line)
       */
  
      // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
-    const int SnpGap_set     = VCF_OTHER<<1;
-    const int IndelGap_set   = VCF_OTHER<<2;
-    const int IndelGap_flush = VCF_OTHER<<3;
+    const int SnpGap_set     = 1 << (8*sizeof(int)/2);
+    const int IndelGap_set   = 1 << (8*sizeof(int)/2-1);
+    const int IndelGap_flush = 1 << (8*sizeof(int)/2-2);
  
      int var_type = 0, i;
      if ( line )
@@ -247,15 +250,8 @@ static void buffered_filters(args_t *args, bcf1_t *line)
          // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
          // used. This filter is therefore more strict and may remove some valid
          // SNPs.
-        int len = 1;
-        if ( var_type & VCF_INDEL )
-        {
-            for (i=1; i<line->n_allele; i++)
-                if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
-        }
-
          // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
-        line->d.var[0].n = len;
+        line->d.var[0].n = line->rlen;
      }
  
      int k_flush = 1;
@@ -330,13 +326,13 @@ static void buffered_filters(args_t *args, bcf1_t *line)
              int rec_to  = rec->pos + rec->d.var[0].n - 1;   // last position affected by the variant
              if ( rec_to + args->snp_gap < last_from )
                  j_flush++;
-            else if ( (var_type & VCF_INDEL) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
+            else if ( (var_type & args->snp_gap_type) && (rec->d.var_type & VCF_SNP) && !(rec->d.var_type & SnpGap_set) )
              {
                  // this SNP has not been SnpGap-filtered yet
                  rec->d.var_type |= SnpGap_set;
                  bcf_add_filter(args->hdr, rec, args->SnpGap_id);
              }
-            else if ( (var_type & VCF_SNP) && (rec->d.var_type & VCF_INDEL) )
+            else if ( (var_type & VCF_SNP) && (rec->d.var_type & args->snp_gap_type) )
              {
                  // the line which we are adding is a SNP and needs to be filtered
                  line->d.var_type |= SnpGap_set;
@@ -415,7 +411,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
      fprintf(bcftools_stderr, "    -e, --exclude <expr>          exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "    -g, --SnpGap <int>            filter SNPs within <int> base pairs of an indel\n");
+    fprintf(bcftools_stderr, "    -g, --SnpGap <int>[:type]     filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
      fprintf(bcftools_stderr, "    -G, --IndelGap <int>          filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
      fprintf(bcftools_stderr, "    -i, --include <expr>          include only sites for which the expression is true (see man page for details\n");
      fprintf(bcftools_stderr, "    -m, --mode [+x]               \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
@@ -430,7 +426,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -T, --targets-file <file>     similar to -R but streams rather than index-jumps\n");
      fprintf(bcftools_stderr, "        --threads <int>           use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcffilter(int argc, char *argv[])
@@ -467,9 +463,31 @@ int main_vcffilter(int argc, char *argv[])
      char *tmp;
      while ((c = getopt_long(argc, argv, "e:i:t:T:r:R:h?s:m:o:O:g:G:S:",loptions,NULL)) >= 0) {
          switch (c) {
-            case 'g': 
+            case 'g':
                  args->snp_gap = strtol(optarg,&tmp,10); 
-                if ( *tmp ) error("Could not parse argument: --SnpGap %s\n", optarg);
+                if ( *tmp && *tmp!=':' ) error("Could not parse argument: --SnpGap %s\n", optarg);
+                if ( *tmp==':' )
+                {
+                    args->snp_gap_str = tmp+1;
+                    int i,n;
+                    char **keys = hts_readlist(tmp+1,0,&n);
+                    for(i=0; i<n; i++)
+                    {
+                        if ( !strcasecmp(keys[i],"indel") ) args->snp_gap_type |= VCF_INDEL;
+                        else if ( !strcasecmp(keys[i],"mnp") ) args->snp_gap_type |= VCF_MNP;
+                        else if ( !strcasecmp(keys[i],"bnd") ) args->snp_gap_type |= VCF_BND;
+                        else if ( !strcasecmp(keys[i],"other") ) args->snp_gap_type |= VCF_OTHER;
+                        else if ( !strcasecmp(keys[i],"overlap") ) args->snp_gap_type |= VCF_OVERLAP;
+                        else error("Could not parse \"%s\" in \"--SnpGap %s\"\n", keys[i], optarg);
+                        free(keys[i]);
+                    }
+                    if ( n ) free(keys);
+                }
+                else
+                {
+                    args->snp_gap_type = VCF_INDEL;
+                    args->snp_gap_str = "indel";
+                }
                  break;
              case 'G':
                  args->indel_gap = strtol(optarg,&tmp,10);
@@ -494,8 +512,12 @@ int main_vcffilter(int argc, char *argv[])
              case 'T': args->targets_list = optarg; targets_is_file = 1; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'S':
                  if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING;
                  else if ( !strcmp("0",optarg) ) args->set_gts = SET_GTS_REF;
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c

index 8bf3223a3473e67ad2f419c75276e307c5ac413c..8a96e3ec1976e67fc879c3d3ac55539ddff07ea6 100644 (file)
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -1,6 +1,6 @@
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -26,8 +26,10 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <strings.h>
  #include <errno.h>
  #include <sys/stat.h>
  #include <sys/types.h>
@@ -35,240 +37,46 @@ THE SOFTWARE.  */
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
  #include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_os.h>
  #include <inttypes.h>
+#include <sys/time.h>
  #include "bcftools.h"
-#include "hclust.h"
+#include "extsort.h"
+//#include "hclust.h"
  
  typedef struct
  {
-    bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
-    bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
-    int ntmp_arr, npl_arr;
-    int32_t *tmp_arr, *pl_arr;
-    double *lks, *sites, min_inter_err, max_intra_err;
-    int *cnts, *dps, hom_only, cross_check, all_sites;
-    char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
-    int argc, no_PLs, narr, nsmpl;
-}
-args_t;
-
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-char *msprintf(const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
-
-void py_plot(char *script)
-{
-    mkdir_p(script);
-    int len = strlen(script);
-    char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
-    int ret = system(cmd);
-    if ( ret ) fprintf(stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
-    free(cmd);
-}
-
-static void plot_check(args_t *args, char *target_sample, char *query_sample)
-{
-    char *fname;
-    FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
-    fprintf(fp,
-            "import matplotlib as mpl\n"
-            "mpl.use('Agg')\n"
-            "import matplotlib.pyplot as plt\n"
-            "import matplotlib.gridspec as gridspec\n"
-            "import csv\n"
-            "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
-            "\n"
-            "sample_ids = False\n"
-            "\n"
-            "dat = []\n"
-            "with open('%s.tab', 'r') as f:\n"
-            "    reader = csv.reader(f, 'tab')\n"
-            "    for row in reader:\n"
-            "        if row[0][0]=='#': continue\n"
-            "        if row[0]!='CN': continue\n"
-            "        tgt = 0\n"
-            "        if row[4]=='%s': tgt = 1\n"
-            "        dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
-            "\n"
-            "dat = sorted(dat)\n"
-            "\n"
-            "iq = -1; dp = 0\n"
-            "for i in range(len(dat)):\n"
-            "    if iq==-1 and dat[i][3]==1: iq = i\n"
-            "    dp += dat[i][2]\n"
-            "dp /= len(dat)\n"
-            "\n"
-            "fig,ax1 = plt.subplots(figsize=(8,5))\n"
-            "ax2 = ax1.twinx()\n"
-            "plots  = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
-            "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
-            "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
-            "if iq!=-1:\n"
-            "   ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
-            "   ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
-            "   ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
-            "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
-            "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
-            "min_dp = min([x[2] for x in dat])\n"
-            "max_dp = max([x[2] for x in dat])\n"
-            "ax2.set_ylim(min_dp-1,max_dp+1)\n"
-            "ax1.set_title('Discordance with %s')\n"
-            "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
-            "ax1.set_xlabel('Sample ID')\n"
-            "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
-            "if sample_ids:\n"
-            "   ax1.set_xticks(range(len(dat)))\n"
-            "   ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
-            "   plt.subplots_adjust(bottom=0.2)\n"
-            "ax1.set_ylabel('Discordance',color='g')\n"
-            "ax2.set_ylabel('Number of sites',color='k')\n"
-            "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
-            "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
-            "labels = [l.get_label() for l in plots]\n"
-            "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
-            "plt.savefig('%s.png')\n"
-            "plt.close()\n"
-            "\n", args->plot, target_sample, target_sample, query_sample, args->plot
-           );
-    fclose(fp);
-    py_plot(fname);
-    free(fname);
-}
-
-#if 0
-static void plot_cross_check(args_t *args)
-{
-    char *fname;
-    FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
-    fprintf(fp,
-            "import matplotlib as mpl\n"
-            "mpl.use('Agg')\n"
-            "import matplotlib.pyplot as plt\n"
-            "import matplotlib.gridspec as gridspec\n"
-            "import csv\n"
-            "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
-            "avg   = []\n"
-            "dp    = []\n"
-            "sm2id = {}\n"
-            "dat   = None\n"
-            "min   = None\n"
-            "max   = None\n"
-            "with open('%s.tab', 'r') as f:\n"
-            "   reader = csv.reader(f, 'tab')\n"
-            "   i = 0\n"
-            "   for row in reader:\n"
-            "       if row[0]=='SM':\n"
-            "           sm2id[row[4]] = i\n"
-            "           avg.append([i,float(row[1])])\n"
-            "           dp.append([i,float(row[2])])\n"
-            "           i += 1\n"
-            "       elif row[0]=='CN':\n"
-            "           val = 0\n"
-            "           if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
-            "           if not dat:\n"
-            "               dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
-            "               min = val\n"
-            "               max = val\n"
-            "           id_i = sm2id[row[4]]\n"
-            "           id_j = sm2id[row[5]]\n"
-            "           dat[id_i][id_j] = val\n"
-            "           dat[id_j][id_i] = val\n"
-            "           if min > val: min = val\n"
-            "           if max < val: max = val\n"
-            "\n"
-            "if len(sm2id)<=1: exit(1)\n"
-            "if min==max: exit(1)\n"
-            "\n"
-            "fig = plt.figure(figsize=(6,7))\n"
-            "gs  = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
-            "ax1 = plt.subplot(gs[0])\n"
-            "ax2 = plt.subplot(gs[1])\n"
-            "\n"
-            "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
-            "ax3 = ax1.twinx()\n"
-            "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
-            "for tl in ax3.get_yticklabels():\n"
-            "   tl.set_color('r')\n"
-            "   tl.set_fontsize(9)\n"
-            "\n"
-            "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
-            "cb1  = plt.colorbar(im,ax=ax2)\n"
-            "cb1.set_label('Pairwise discordance')\n"
-            "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
-            "\n"
-            "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
-            "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
-            "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
-            "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
-            "\n"
-            "ax1.set_title('Sample Discordance Score')\n"
-            "ax2.set_ylabel('Sample ID')\n"
-            "ax2.set_xlabel('Sample ID')\n"
-            "ax3.set_ylabel('Average Depth',color='r')\n"
-            "ax1.set_xlabel('Sample ID')\n"
-            "ax1.set_ylabel('Average discordance')\n"
-            "\n"
-            "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
-            "plt.savefig('%s.png')\n"
-            "plt.close()\n"
-            "\n", args->plot,args->plot
-           );
-    fclose(fp);
-    py_plot(fname);
-    free(fname);
-}
-#endif
-
-static void init_data(args_t *args)
-{
-    args->sm_hdr = args->files->readers[0].header;
-    if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
-
-    if ( !args->cross_check )
-    {
-        args->gt_hdr = args->files->readers[1].header;
-        int nsamples = bcf_hdr_nsamples(args->gt_hdr);
-        if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
-        args->lks   = (double*) calloc(nsamples,sizeof(double));
-        args->cnts  = (int*) calloc(nsamples,sizeof(int));
-        args->sites = (double*) calloc(nsamples,sizeof(double));
-        args->dps   = (int*) calloc(nsamples,sizeof(int));
-    }
+    int iqry, igt;
  }
+pair_t;
  
-static void destroy_data(args_t *args)
-{
-    free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
-}
-
-static int allele_to_int(bcf1_t *line, char *allele)
+typedef struct
  {
-    int i;
-    for (i=0; i<line->n_allele; i++)
-        if ( !strcmp(allele,line->d.allele[i]) ) return i;
-    if ( strcmp(line->d.allele[i-1],"X") ) return -1;
-    return i-1;
-}
+    bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
+    bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
+    char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
+    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+    int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+    double *pdiff, *qry_prob, *gt_prob;
+    uint32_t *ndiff,*ncnt,ncmp, npairs;
+    int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
+    uint8_t *qry_dsg, *gt_dsg;
+    pair_t *pairs;
+    double *hwe_prob, dsg2prob[8][3], pl2prob[256];
+    double min_inter_err, max_intra_err;
+    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
+    FILE *fp;
+    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
  
-static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
-{
-    int i, j;
-    for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
-    for (i=0; i<gt_line->n_allele; i++)
-    {
-        // find which of the sm_alleles (k) corresponds to the gt_allele (i)
-        int k = allele_to_int(sm_line, gt_line->d.allele[i]);
-        if ( k<0 ) return 0;
-        for (j=0; j<=i; j++)
-        {
-            int l = allele_to_int(sm_line, gt_line->d.allele[j]);
-            if ( l<0 ) return 0;
-            gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
-        }
-    }
-    //for (i=0; i<n_gt2ipl; i++) printf("%d .. %d\n", i,gt2ipl[i]);
-    return 1;
+    // for --distinctive-sites
+    double distinctive_sites;
+    kbitset_t *kbs_diff;
+    size_t diff_sites_size;
+    extsort_t *es;
+    char *es_tmp_prefix, *es_max_mem;
  }
+args_t;
  
  static void set_cwd(args_t *args)
  {
@@ -284,7 +92,6 @@ static void set_cwd(args_t *args)
      }
      assert(buf);
  }
-
  static void print_header(args_t *args, FILE *fp)
  {
      fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
@@ -296,413 +103,920 @@ static void print_header(args_t *args, FILE *fp)
      fprintf(fp, "# \t %s\n#\n", args->cwd);
  }
  
-static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+static int cmp_int(const void *_a, const void *_b)
  {
-    // PLs not present, use GTs instead.
-    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
-    int nsm_gt, i;
-    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
-        error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
-    nsm_gt /= bcf_hdr_nsamples(hdr);
-    int npl = line->n_allele*(line->n_allele+1)/2;
-    hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
-    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
-    {
-        int *gt_ptr = args->tmp_arr + i*nsm_gt;
-        int j, *pl_ptr = args->pl_arr + i*npl;
-        if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
-        {
-            for (j=0; j<npl; j++) pl_ptr[j] = -1;
-        }
-        else
-        {
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
-            int idx = bcf_alleles2gt(a,b);
-            pl_ptr[idx] = 0;
-        }
-    }
-    return npl;
+    int a = *((int*)_a);
+    int b = *((int*)_b);
+    if ( a < b ) return -1;
+    if ( a > b ) return 1;
+    return 0;
+}
+static int cmp_pair(const void *_a, const void *_b)
+{
+    pair_t *a = (pair_t*)_a;
+    pair_t *b = (pair_t*)_b;
+    if ( a->iqry < b->iqry ) return -1;
+    if ( a->iqry > b->iqry ) return 1;
+    if ( a->igt < b->igt ) return -1;
+    if ( a->igt > b->igt ) return 1;
+    return 0;
  }
  
-static int cmp_doubleptr(const void *_a, const void *_b)
+typedef struct
+{
+    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosoms
+    unsigned long kbs_dat[1];
+}
+diff_sites_t;
+#if DBG
+static void diff_sites_debug_print(args_t *args, diff_sites_t *ds)
+{
+    int i;
+    memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
+    fprintf(stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff);
+    for (i=0; i<args->npairs; i++) fprintf(stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0);
+    fprintf(stderr,"\n");
+}
+#endif
+static int diff_sites_cmp(const void *aptr, const void *bptr)
+{
+    diff_sites_t *a = *((diff_sites_t**)aptr);
+    diff_sites_t *b = *((diff_sites_t**)bptr);
+    if ( a->ndiff < b->ndiff ) return 1;        // descending order
+    if ( a->ndiff > b->ndiff ) return -1;
+    if ( a->rand < b->rand ) return -1;
+    if ( a->rand > b->rand ) return 1;
+    return 0;
+}
+static void diff_sites_init(args_t *args)
+{
+    int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites;
+    if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
+    if ( nsites > args->npairs )
+    {
+        fprintf(stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
+        nsites = args->npairs;
+        args->distinctive_sites = args->npairs + 1;
+    }
+    else
+        args->distinctive_sites = nsites;
+    args->kbs_diff = kbs_init(args->npairs);
+    size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS;
+    assert( n==args->kbs_diff->n );
+    args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long);
+    args->es = extsort_alloc();
+    extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size);
+    extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
+    extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
+    extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp);
+    extsort_init(args->es);
+}
+static void diff_sites_destroy(args_t *args)
  {
-    double *a = *((double**)_a);
-    double *b = *((double**)_b);
-    if ( *a < *b ) return -1;
-    else if ( *a == *b ) return 0;
+    kbs_destroy(args->kbs_diff);
+    extsort_destroy(args->es);
+}
+static inline void diff_sites_reset(args_t *args)
+{
+    kbs_clear(args->kbs_diff);
+}
+static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos)
+{
+    diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size);
+    memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
+    dat->ndiff = ndiff;
+    dat->rid  = rid;
+    dat->pos  = pos;
+    dat->rand = hts_lrand48();
+    memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long));
+    extsort_push(args->es,dat);
+}
+static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos)
+{
+    diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
+    if ( !dat ) return 0;
+    *ndiff = dat->ndiff;
+    *rid   = dat->rid;
+    *pos   = dat->pos;
+    memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
      return 1;
  }
  
-static void check_gt(args_t *args)
+static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
  {
-    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
-    int fake_pls = args->no_PLs;
+    int i;
+    if ( !strcmp(list,"-") )
+    {
+        *nsmpl = bcf_hdr_nsamples(hdr);
+        *smpl  = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+        for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
+        return;
+    }
  
-    // Initialize things: check which tags are defined in the header, sample names etc.
-    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
-    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+    char **tmp = hts_readlist(list, list_is_file, nsmpl);
+    if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
+    *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+    for (i=0; i<*nsmpl; i++)
      {
-        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
-            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
-        if ( !args->no_PLs )
-            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
-        fake_pls = 1;
+        int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
+        if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
+        (*smpl)[i] = idx;
+        free(tmp[i]);
      }
+    free(tmp);
+    qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
+    // check for duplicates
+    for (i=1; i<*nsmpl; i++)
+        if ( (*smpl)[i-1]==(*smpl)[i] )
+            error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
+}
  
-    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
-    print_header(args, fp);
+static void init_data(args_t *args)
+{
+    hts_srand48(0);
  
-    int tgt_isample = -1, query_isample = 0;
-    if ( args->target_sample )
+    args->files = bcf_sr_init();
+    if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+    if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+
+    if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
+    if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
+    if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
+        error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
+
+    args->qry_hdr = bcf_sr_get_header(args->files,0);
+    if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
+    if ( args->gt_fname )
      {
-        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
-        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+        args->gt_hdr = bcf_sr_get_header(args->files,1);
+        if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
      }
-    if ( args->all_sites )
+
+    // Determine whether GT or PL will be used
+    if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
      {
-        if ( tgt_isample==-1 )
-        {
-            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
-            tgt_isample = 0;
-        }
+        if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
+            args->qry_use_GT = 0;
+        else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
+            args->qry_use_GT = 1;
+        else
+            error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
      }
-    if ( args->query_sample )
+    else if ( args->qry_use_GT==1 )
      {
-        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
-        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+        if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
+            error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
      }
-    if ( args->all_sites )
-        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
-                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+    else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
+        error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
  
-    // Main loop
-    float prev_lk = 0;
-    while ( (ret=bcf_sr_next_line(args->files)) )
+    if ( args->gt_hdr )
      {
-        if ( ret!=2 ) continue;
-        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
-        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
-        bcf_unpack(sm_line, BCF_UN_FMT);
-        bcf_unpack(gt_line, BCF_UN_FMT);
-
-        // Init mapping from target genotype index to the sample's PL fields
-        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
-        if ( n_gt2ipl > m_gt2ipl )
+        if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
+        {
+            if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
+                args->gt_use_GT = 1;
+            else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
+                args->gt_use_GT = 0;
+            else
+                error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
+        }
+        else if ( args->gt_use_GT==1 )
          {
-            m_gt2ipl = n_gt2ipl;
-            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+            if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
+                error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
          }
-        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
-
-        // Target genotypes
-        int ngt, npl;
-        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
-            error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
-        ngt /= bcf_hdr_nsamples(args->gt_hdr);
-        if ( ngt!=2 ) continue; // checking only diploid genotypes
+        else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
+            error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
+    }
+    else
+        args->gt_use_GT = args->qry_use_GT;
  
-        // Sample PLs
-        if ( !fake_pls )
+    // Prepare samples
+    int i,j;
+    args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
+    if ( args->qry_samples )
+    {
+        init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
+    }
+    if ( args->gt_samples )
+    {   
+        init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
+            args->gt_hdr ? args->gt_hdr : args->qry_hdr,
+            args->gt_fname ? args->gt_fname : args->qry_fname);
+    }
+    else if ( args->pair_samples )
+    {
+        int npairs;
+        char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
+        if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
+        if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
+        args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
+        args->pairs  = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
+        if ( !args->pair_samples_is_file )
          {
-            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+            for (i=0; i<args->npairs; i++)
              {
-                if ( sm_line->n_allele==1 )
-                {
-                    // PL values may not be present when ALT=. (mpileup/bcftools output), in that case 
-                    // switch automatically to GT at these sites
-                    npl = fake_PLs(args, args->sm_hdr, sm_line);
-                }
-                else
-                    error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1);
+                args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
+                args->pairs[i].igt  = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
+                if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
+                if ( args->pairs[i].igt  < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
+                free(tmp[2*i]);
+                free(tmp[2*i+1]);
              }
-            else
-                npl /= bcf_hdr_nsamples(args->sm_hdr);
          }
          else
-            npl = fake_PLs(args, args->sm_hdr, sm_line);
+        {
+            for (i=0; i<args->npairs; i++)
+            {
+                char *ptr = tmp[i];
+                while ( *ptr && !isspace(*ptr) ) ptr++;
+                if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
+                *ptr = 0;
+                args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
+                if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
+                ptr++;
+                while ( *ptr && isspace(*ptr) ) ptr++;
+                args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
+                if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
+                free(tmp[i]);
+            }
+        }
+        free(tmp);
+        qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
+    }
+    else if ( args->gt_hdr )
+        args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
+    if ( !args->ngt_smpl )
+    {
+        args->ngt_smpl = args->nqry_smpl;
+        args->gt_smpl  = args->qry_smpl;
+        args->cross_check = 1;
+    }
+
+    // The data arrays
+    if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
+    if ( !args->pair_samples )
+    {
+        args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
+        args->gt_dsg  = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
+    }
+    if ( args->use_PLs )
+    {
+        args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff));      // log probability of pair samples being the same
+        args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
+        args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
+
+        // dsg2prob: the first index is a bitmask of 8 possible dsg combinations (only 1<<0,1<<1,1<<2 are set, accessing
+        // anything else indicates an error, this is just to reuse gt_to_dsg()); the second index selects the corresponding
+        // probabilities of 0/0, 0/1, and 1/1 genotypes
+        for (i=0; i<8; i++)
+            for (j=0; j<3; j++)
+                args->dsg2prob[i][j] = HUGE_VAL;
+        args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
  
-        // Calculate likelihoods for all samples, assuming diploid genotypes
+        // lookup table to avoid exponentiation
+        for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
+    }
+    else
+        args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff));    // number of differing genotypes for each pair of samples
+    args->ncnt  = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
+    if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
+    if ( args->calc_hwe_prob )
+    {
+        // prob of the observed sequence of matches given site AFs and HWE
+        args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
+        if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+    }
+
+    if ( args->distinctive_sites ) diff_sites_init(args);
+
+    args->fp = stdout;
+    print_header(args, args->fp);
+}
+
+static void destroy_data(args_t *args)      // Free the buffers referenced from args, close the output stream and destroy the reader
+{
+    if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);      // gt_dsg aliases qry_dsg in cross-check mode; free it only when distinct
+    free(args->qry_dsg);
+    if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);   // same aliasing guard as for gt_dsg
+    free(args->qry_prob);
+    free(args->es_max_mem);
+    fclose(args->fp);       // NOTE(review): fp is assigned stdout during setup, so this closes stdout
+    if ( args->distinctive_sites ) diff_sites_destroy(args);
+    free(args->hwe_prob);
+    free(args->cwd);
+    free(args->qry_arr);
+    if ( args->gt_hdr ) free(args->gt_arr);     // without gt_hdr, process_line() points gt_arr at qry_arr, which was freed above
+    free(args->pdiff);
+    free(args->ndiff);
+    free(args->ncnt);
+    free(args->qry_smpl);
+    if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);   // gt_smpl is set to qry_smpl when no -g samples are available
+    free(args->pairs);
+    bcf_sr_destroy(args->files);
+}
  
-        // For faster access to genotype likelihoods (PLs) of the query sample
-        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
-        double sum_pl = 0; // for converting PLs to probs
-        for (max_ipl=0; max_ipl<npl; max_ipl++)
+static inline uint8_t gt_to_dsg(int32_t *ptr)   // Encode the two GT values at ptr as a one-bit dosage mask (1, 2 or 4); returns 0 when unusable
+{
+    if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0;     // missing allele, or vector-end marker in the second slot
+    uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0);    // count of alleles with a non-zero index: 0, 1 or 2
+    return 1<<dsg;      // same bit layout as pl_to_dsg(): bit 0, 1 or 2
+}
+static inline uint8_t pl_to_dsg(int32_t *ptr)   // Build a bitmask of the genotypes sharing the smallest of the three PL values at ptr; returns 0 when unusable
+{
+    if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0;       // any missing PL invalidates the sample
+    if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0;      // fewer than three PL values
+    int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]);        // minimum of the three values
+    uint8_t dsg = 0;
+    if ( ptr[0]==min_pl ) dsg |= 1;     // ties set several bits, e.g. PL=0,0,0 gives 7
+    if ( ptr[1]==min_pl ) dsg |= 2;
+    if ( ptr[2]==min_pl ) dsg |= 4;
+    return dsg;
+}
+static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob)     // Fill prob[0..2] from args->dsg2prob for the GT at ptr; returns the gt_to_dsg() mask
+{
+    uint8_t dsg = gt_to_dsg(ptr);
+    if ( dsg )      // prob is left untouched when the genotype is unusable (dsg==0)
+    {
+        prob[0] = args->dsg2prob[dsg][0];
+        prob[1] = args->dsg2prob[dsg][1];
+        prob[2] = args->dsg2prob[dsg][2];
+    }
+    return dsg;
+}
+static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob)     // Convert the three PLs at ptr to normalized -log probabilities in prob[0..2]; returns the pl_to_dsg() mask
+{
+    uint8_t dsg = pl_to_dsg(ptr);
+    if ( dsg )      // prob is left untouched when the PLs are unusable (dsg==0)
+    {
+        prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255];      // table lookup; values outside [0,255) use the last entry
+        prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
+        prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
+        double sum = prob[0] + prob[1] + prob[2];
+        prob[0] /= sum;     // normalize so that the three probabilities sum to one
+        prob[1] /= sum;
+        prob[2] /= sum;
+        prob[0] = -log(prob[0]);    // stored as negative natural logarithms
+        prob[1] = -log(prob[1]);
+        prob[2] = -log(prob[2]);
+    }
+    return dsg;
+}
+static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)     // Load FORMAT/GT or FORMAT/PL of rec into *arr; returns 0 on success, -1 when the site must be skipped
+{
+    static int warn_dip_GT = 1;     // cleared after the first non-diploid GT notice
+    static int warn_dip_PL = 1;     // cleared after the first non-diploid PL notice
+    int i;
+    for (i=0; i<2; i++)     // at most two attempts: the tag selected by *use_GT first, then the other one
+    {
+        if ( *use_GT )
          {
-            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
-            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
-            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+            int ret = bcf_get_genotypes(hdr,rec,arr,narr);
+            if ( ret < 0 )
+            {
+                if ( !i ) { *use_GT = 0; continue; }    // first attempt failed: switch the caller's flag to PL and retry
+                args->nskip_no_data++;
+                return -1;
+            }
+            if ( ret != 2*bcf_hdr_nsamples(hdr) )       // require exactly two values per sample
+            {
+                if ( warn_dip_GT )
+                {
+                    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+                    warn_dip_GT = 0;
+                }
+                args->nskip_dip_GT++;
+                return -1;
+            }
+            *narr1 = 2;     // number of values per sample in *arr
+            return 0;
          }
-        if ( sum_pl==0 ) continue; // no PLs present
-        if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
  
-        // The main stats: concordance of the query sample with the target -g samples
-        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+        int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
+        if ( ret < 0 )
          {
-            int *gt_ptr = gt_arr + i*ngt;
-            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
-            if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
-            int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
-            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
-            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
-            args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
-            args->sites[i]++;
+            if ( !i ) { *use_GT = 1; continue; }    // first attempt failed: switch the caller's flag to GT and retry
+            args->nskip_no_data++;
+            return -1;
          }
-        if ( args->all_sites )
+        if ( ret != 3*bcf_hdr_nsamples(hdr) )       // require exactly three values per sample
          {
-            // Print LKs at all sites for debugging
-            int *gt_ptr = gt_arr + tgt_isample*ngt;
-            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
-            fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
-            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
-            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
-            fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
-            prev_lk = args->lks[query_isample];
-
-            int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
-            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
-            for (igt=0; igt<npl; igt++)
-                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
-                else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
-                else fprintf(fp, "\t%d", pl_ptr[igt]);
-            fprintf(fp, "\n");
+            if ( warn_dip_PL )
+            {
+                fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+                warn_dip_PL = 0;
+            }
+            args->nskip_dip_PL++;
+            return -1;
          }
+        *narr1 = 3;     // number of values per sample in *arr
+        return 0;
      }
-    free(gt2ipl);
-    free(gt_arr);
-    free(args->pl_arr);
-    free(args->tmp_arr);
+    return -1;  // should never reach
+}
+static void process_line(args_t *args)      // Update the per-pair statistics (ndiff or pdiff, hwe_prob, ncnt) with the current site
+{
+    int i,j,k, nqry1, ngt1, ret;
+
+    bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0);   // the query file
+    int qry_use_GT = args->qry_use_GT;      // local copies: set_data() may flip the flag for this site only
+    int gt_use_GT  = args->gt_use_GT;
+
+    ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
+    if ( ret<0 ) return;    // site skipped, set_data() updated the skip counters
  
-    // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
-    // plot as discordance per site, the latter must be scaled to the same range
-    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
-    double extreme_lk = 0, extreme_lk_per_site = 0;
-    for (i=0; i<nsamples; i++)
+    if ( args->gt_hdr )
      {
-        if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
-        if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+        gt_rec = bcf_sr_get_line(args->files,1);
+        ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
+        if ( ret<0 ) return;
+    }
+    else    // single-file mode: the genotype data are the query data
+    {
+        ngt1 = nqry1;
+        args->gt_arr = args->qry_arr;
      }
  
-    // Sorted output
-    double **p = (double**) malloc(sizeof(double*)*nsamples);
-    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
-    qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+    args->ncmp++;
  
-    fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
-    for (i=0; i<nsamples; i++)
+    double af,hwe_dsg[8];   // hwe_dsg is filled and read only when calc_hwe_prob is set
+    if ( args->calc_hwe_prob )
      {
-        int idx = p[i] - args->lks;
-        double per_site = 0;
-        if ( args->sites[idx] )
+        int ac[2];
+        if ( args->gt_hdr )
          {
-            if ( args->sites[idx] && extreme_lk_per_site )
+            if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+        }
+        else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+
+        // hwe_dsg is indexed by the bitmask of the eight dsg combinations to account for PL uncertainty,
+        // as in the extreme case we can have uninformative PL=0,0,0. The values are the minima of e.g.
+        //      hwe_dsg[1,2,4] ..  dsg=0,1,2
+        //      hwe_dsg[3]     ..  dsg=0 or 1
+        //      hwe_dsg[6]     ..  dsg=1 or 2
+
+        double hwe[3];
+        const double min_af = 1e-5;             // cap the AF in case we get unrealistic values
+        af = (double)ac[1]/(ac[0]+ac[1]);
+        hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);     // NOTE(review): if ac[1] is the ALT count, af*af is the hom-ALT frequency although index 0 pairs with dsg=0 - confirm hwe[0]/hwe[2] are not swapped
+        hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
+        hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+        hwe_dsg[0] = 0;     // empty mask (no shared genotype) contributes nothing
+        for (i=1; i<8; i++)
+        {
+            hwe_dsg[i] = HUGE_VAL;
+            for (k=0; k<3; k++)
              {
-                per_site = args->lks[idx]/args->sites[idx];
-                per_site *= extreme_lk / extreme_lk_per_site;
+                if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];  // minimum of hwe[k] over the bits set in i
              }
-            else
-                per_site = 0;
          }
-        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
      }
  
-    if ( args->plot )
+    // The sample pairs were given explicitly via -p/-P options
+    if ( args->pairs )
      {
-        if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
-        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
-    }
-}
+        if ( !args->use_PLs )
+        {
+            int ndiff = 0;      // number of pairs that differ at this site, counted only when kbs_diff is set
+            if ( args->kbs_diff ) diff_sites_reset(args);
  
-// static inline int is_hom_most_likely(int nals, int *pls)
-// {
-//     int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
-//     for (ia=1; ia<nals; ia++)
-//     {
-//         for (ib=0; ib<ia; ib++)
-//         {
-//             if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
-//             idx++;
-//         }
-//         if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
-//         idx++;
-//     }
-//     return min_is_hom;
-// }
-
-int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
-    int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+            for (i=0; i<args->npairs; i++)
+            {
+                int32_t *ptr;
+                uint8_t qry_dsg, gt_dsg;
  
-    if ( ngt<=0 ) return 1;                 // GT not present
-    if ( ngt!=args->nsmpl*2 ) return 2;     // not diploid
-    ngt /= args->nsmpl;
-    
-    int i,j, idx = 0;
-    for (i=1; i<args->nsmpl; i++)
-    {
-        int32_t *a = args->tmp_arr + i*ngt;
-        if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
-        int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+                ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+                gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( !gt_dsg ) continue;                        // missing value
+                if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
+
+                ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+                qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( !qry_dsg ) continue;                       // missing value
+
+                int match = qry_dsg & gt_dsg;   // non-zero when the two masks share at least one dosage
+                if ( !match )
+                {
+                    args->ndiff[i]++;
+                    if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
+                }
+                else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+                args->ncnt[i]++;
+            }
  
-        for (j=0; j<i; j++)
+            if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
+        }
+        else    // use_PLs set
          {
-            int32_t *b = args->tmp_arr + j*ngt;
-            if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
-            int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+            for (i=0; i<args->npairs; i++)
+            {
+                int32_t *ptr;
+                double qry_prob[3], gt_prob[3];
+                uint8_t qry_dsg, gt_dsg;
+
+                ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+                gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
+                if ( !gt_dsg ) continue;                        // missing value
+                if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
+               
+                ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+                qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
+                if ( !qry_dsg ) continue;                       // missing value
  
-            ntot[idx]++;
-            if ( agt!=bgt ) ndif[idx]++;
-            idx++;
+                double min = qry_prob[0] + gt_prob[0];      // smallest summed -log probability over the three genotypes
+                qry_prob[1] += gt_prob[1];
+                if ( min > qry_prob[1] ) min = qry_prob[1];
+                qry_prob[2] += gt_prob[2];
+                if ( min > qry_prob[2] ) min = qry_prob[2];
+                args->pdiff[i] += min;
+
+                if ( args->calc_hwe_prob )
+                {
+                    int match = qry_dsg & gt_dsg;
+                    args->hwe_prob[i] += hwe_dsg[match];    // hwe_dsg[0] is 0, so a mismatch adds nothing
+                }
+                args->ncnt[i]++;
+            }
          }
+        return;     // explicit pairs handled, the all-vs-all code below is not used
      }
-    return 0;
-}
-int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
-    int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
  
-    if ( npl<=0 ) return 1;                 // PL not present
-    npl /= args->nsmpl;
-    
-    int i,j,k, idx = 0;
-    for (i=1; i<args->nsmpl; i++)
+    int idx=0;      // running index into ndiff/pdiff/hwe_prob/ncnt, advanced once per (i,j) pair
+    if ( !args->use_PLs )
      {
-        int32_t *a = args->tmp_arr + i*npl;
-        int imin = -1;
-        for (k=0; k<npl; k++)
+        for (i=0; i<args->nqry_smpl; i++)
          {
-            if ( a[k]==bcf_int32_vector_end ) break;
-            if ( a[k]==bcf_int32_missing ) continue;
-            if ( imin==-1 || a[imin] > a[k] ) imin = k;
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;     // map through the sample subset when one was given
+            int32_t *ptr = args->qry_arr + nqry1*iqry;
+            args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
          }
-        if ( imin<0 ) { idx+=i; continue; }
-
-        for (j=0; j<i; j++)
+        if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
          {
-            int32_t *b = args->tmp_arr + j*npl;
-            int jmin = -1;
-            for (k=0; k<npl; k++)
+            for (i=0; i<args->ngt_smpl; i++)
              {
-                if ( b[k]==bcf_int32_vector_end ) break;
-                if ( b[k]==bcf_int32_missing ) continue;
-                if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+                int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+                int32_t *ptr = args->gt_arr + ngt1*igt;
+                args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
+            }
+        }
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
+            if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
+            for (j=0; j<ngt; j++)
+            {
+                if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
+                int match = args->qry_dsg[i] & args->gt_dsg[j];
+                if ( !match ) args->ndiff[idx]++;
+                else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+                args->ncnt[idx]++;
+                idx++;
              }
-            if ( jmin<0 ) { idx++; continue; }
-
-            ntot[idx]++;
-            if ( imin!=jmin ) ndif[idx]++;
-            idx++;
          }
      }
-    return 0;
-}
+    else    // use_PLs set
+    {
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            int32_t *ptr = args->qry_arr + nqry1*iqry;
+            args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);
+        }
+        if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
+        {
+            for (i=0; i<args->ngt_smpl; i++)
+            {
+                int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+                int32_t *ptr = args->gt_arr + ngt1*igt;
+                args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
+                if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
+            }
+        }
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
+            if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
+            for (j=0; j<ngt; j++)
+            {
+                if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
  
-static void cross_check_gts(args_t *args)
-{
-    // Initialize things: check which tags are defined in the header, sample names etc.
-    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
-    {
-        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
-            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
-        if ( !args->no_PLs ) {
-            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
-            args->no_PLs = 99;
+                double min = args->qry_prob[i*3] + args->gt_prob[j*3];     // smallest summed -log probability over the three genotypes
+                if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
+                if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
+                args->pdiff[idx] += min;
+
+                if ( args->calc_hwe_prob )
+                {
+                    int match = args->qry_dsg[i] & args->gt_dsg[j];
+                    args->hwe_prob[idx] += hwe_dsg[match];
+                }
+                args->ncnt[idx]++;
+                idx++;
+            }
          }
      }
+}
  
-    args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
-    args->narr  = (args->nsmpl-1)*args->nsmpl/2;
  
-    uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
-    uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
+typedef struct      // Record with two integer ids and a double value; cmp_idbl() orders such records by val
+{
+    int ism, idx;   // presumably a sample index and an array index - confirm against the users of idbl_t
+    double val;     // the key compared by cmp_idbl()
+}
+idbl_t;
+static int cmp_idbl(const void *_a, const void *_b)     // Comparator with the qsort() callback signature: orders idbl_t records by ascending val
+{
+    idbl_t *a = (idbl_t*)_a;
+    idbl_t *b = (idbl_t*)_b;
+    if ( a->val < b->val ) return -1;
+    if ( a->val > b->val ) return 1;
+    return 0;       // equal values (also reached when either value is NaN, as both comparisons are then false)
+}
+static void report_distinctive_sites(args_t *args)      // Print the DS section: sites grouped into blocks, each block accumulating newly distinguished sample pairs
+{
+    extsort_sort(args->es);     // presumably orders the sites stored by diff_sites_push() - confirm against extsort
+
+    fprintf(args->fp,"# DS, distinctive sites:\n");
+    fprintf(args->fp,"#     - chromosome\n");
+    fprintf(args->fp,"#     - position\n");
+    fprintf(args->fp,"#     - cumulative number of pairs distinguished by this block\n");
+    fprintf(args->fp,"#     - block id\n");
+    fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
  
-    while ( bcf_sr_next_line(args->files) )
+    kbitset_t *kbs_blk = kbs_init(args->npairs);        // pairs already distinguished within the current block
+    kbitset_iter_t itr;
+    int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
+    int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;  // requested number of pairs, capped at npairs
+    while ( diff_sites_shift(args,&ndiff,&rid,&pos) )   // presumably loads the next site and its pair bitset into args->kbs_diff - confirm
      {
-        bcf1_t *line = bcf_sr_get_line(args->files,0);
-
-        // use PLs unless no_PLs is set and GT exists
-        if ( args->no_PLs )
+        int ndiff_new = 0, ndiff_dbg = 0;
+        kbs_start(&itr);
+        while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
          {
-            if ( process_GT(args,line,ntot,ndif)==0 ) continue;
+            ndiff_dbg++;    // counts every pair set at this site, for the consistency check below
+            if ( kbs_exists(kbs_blk,i) ) continue;   // already set
+            kbs_insert(kbs_blk,i);
+            ndiff_new++;
          }
-        process_PL(args,line,ntot,ndif);
+        if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
+        if ( !ndiff_new ) continue;     // no new pair distinguished by this site
+        ndiff_tot += ndiff_new;
+        fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        if ( ndiff_tot < ndiff_min ) continue;   // fewer than the requested number of pairs can be distinguished at this point
+        iblock++;           // block complete: start a new one with an empty set of distinguished pairs
+        ndiff_tot = 0;
+        kbs_clear(kbs_blk);
      }
-    
-    FILE *fp = stdout;
-    print_header(args, fp);
+    kbs_destroy(kbs_blk);
+}
+static void report(args_t *args)
+{
+    fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
+    fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+    fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+    fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+    fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+    fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+    fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    fprintf(args->fp,"# DC, discordance:\n");
+    fprintf(args->fp,"#     - query sample\n");
+    fprintf(args->fp,"#     - genotyped sample\n");
+    fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
+    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+    fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
+    fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
  
-    float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
+    int trim = args->ntop;
+    if ( !args->pairs )
+    {
+        if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;
+        if ( args->ngt_smpl && args->ngt_smpl <= args->ntop  ) trim = 0;
+    }
  
-    // Output pairwise distances
-    fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
-    int i,j, idx = 0;
-    for (i=0; i<args->nsmpl; i++)
+    if ( args->pairs )
      {
-        for (j=0; j<i; j++)
+        int i;
+        for (i=0; i<args->npairs; i++)
          {
-            float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
-            fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
-            PDIST(tmp,i,j) = err;
-            idx++;
+            int iqry = args->pairs[i].iqry;
+            int igt  = args->pairs[i].igt;
+            if ( args->ndiff )
+            {
+                fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                        args->qry_hdr->samples[iqry],
+                        args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                        args->ndiff[i],
+                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+                        args->ncnt[i]);
+            }
+            else
+            {
+                fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                        args->qry_hdr->samples[iqry],
+                        args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                        args->pdiff[i],
+                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+                        args->ncnt[i]);
+            }
          }
      }
-
-    // Cluster samples
-    int nlist;
-    float clust_max_err = args->max_intra_err;
-    hclust_t *clust = hclust_init(args->nsmpl,tmp);
-    cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
-    fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
-    for (i=0; i<nlist; i++)
-    {
-        fprintf(fp,"CLUSTER\t%f", list[i].dist);
-        for (j=0; j<list[i].nmemb; j++)
-            fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
-        fprintf(fp,"\n");
-    }
-    hclust_destroy_list(list,nlist);
-    // Debugging output: the cluster graph and data used for deciding
-    char **dbg = hclust_explain(clust,&nlist);
-    for (i=0; i<nlist; i++)
-        fprintf(fp,"DBG\t%s\n", dbg[i]);
-    fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
-    fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
-    fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
-    hclust_destroy(clust);
-    free(tmp);
-
-
-    // Deprecated output for temporary backward compatibility
-    fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
-    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
-    idx = 0;
-    for (i=0; i<args->nsmpl; i++)
+    else if ( !trim )
      {
-        for (j=0; j<i; j++)
+        int i,j,idx=0;
+        for (i=0; i<args->nqry_smpl; i++)
          {
-            fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
-            idx++;
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            int ngt  = args->cross_check ? i : args->ngt_smpl;
+            for (j=0; j<ngt; j++)
+            {
+                int igt = args->gt_smpl ? args->gt_smpl[j] : j;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                idx++;
+            }
          }
      }
-
-    free(ndif);
-    free(ntot);
-    free(args->tmp_arr);
+    else if ( !args->cross_check )
+    {
+        idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
+        int i,j;
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int idx  = i*args->ngt_smpl;
+            for (j=0; j<args->ngt_smpl; j++)
+            {
+                if ( args->sort_by_hwe )
+                    arr[j].val = -args->hwe_prob[idx];
+                else if ( args->ndiff )
+                    arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+                else
+                    arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[j].ism = j;
+                arr[j].idx = idx;
+                idx++;
+            }
+            qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            for (j=0; j<args->ntop; j++)
+            {
+                int idx = arr[j].idx;
+                int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+            }
+        }
+        free(arr);
+    }
+    else
+    {
+        int narr = args->nqry_smpl-1;
+        idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
+        int i,j,k,idx;
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            k = 0, idx = i*(i-1)/2;
+            for (j=0; j<i; j++)
+            {
+                if ( args->sort_by_hwe )
+                    arr[k].val = -args->hwe_prob[idx];
+                else if ( args->ndiff )
+                    arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+                else
+                    arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[k].ism = j;
+                arr[k].idx = idx;
+                idx++;
+                k++;
+            }
+            for (; j<narr; j++)
+            {
+                idx = j*(j+1)/2 + i;
+                if ( args->sort_by_hwe )
+                    arr[k].val = -args->hwe_prob[idx];
+                else if ( args->ndiff )
+                    arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+                else
+                    arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[k].ism = j + 1;
+                arr[k].idx = idx;
+                k++;
+            }
+            qsort(arr, narr, sizeof(*arr), cmp_idbl);
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            for (j=0; j<args->ntop; j++)
+            {
+                if ( i <= arr[j].ism ) continue;
+                int idx = arr[j].idx;
+                int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+            }
+        }
+        free(arr);
+    }
  }
  
-static char *init_prefix(char *prefix)
+static int is_input_okay(args_t *args, int nmatch)
  {
-    int len = strlen(prefix);
-    if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
-        return msprintf("%sgtcheck", prefix);
-    return strdup(prefix);
+    int i;
+    const char *msg;
+    bcf_hdr_t *hdr;
+    bcf1_t *rec;
+    if ( args->gt_hdr && nmatch!=2 )
+    {
+        if ( args->nskip_no_match++ ) return 0;
+        for (i=0; i<2; i++)
+        {
+            rec = bcf_sr_get_line(args->files,i);
+            if ( rec ) break;
+        }
+        hdr = bcf_sr_get_header(args->files,i);
+        fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
+                bcf_seqname(hdr,rec),rec->pos+1);
+        return 0;
+    }
+    for (i=0; i<2; i++)
+    {
+        hdr = bcf_sr_get_header(args->files,i);
+        rec = bcf_sr_get_line(args->files,i);
+        if ( rec->n_allele>2 )
+        {
+            if ( args->nskip_not_ba++ ) return 0;
+            msg = "not a biallelic site, run `bcftools norm -m -` first";
+            goto not_okay;
+        }
+        if ( bcf_get_variant_types(rec)==VCF_REF )
+        {
+            if ( args->nskip_mono++ ) return 0;
+            msg = "monoallelic site";
+            goto not_okay;
+        }
+        if ( !args->gt_hdr ) break;
+    }
+    return 1;
+
+not_okay:
+    fprintf(stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", 
+        bcf_seqname(hdr,rec),rec->pos+1,msg);
+    return 0;
  }
  
  static void usage(void)
@@ -712,18 +1026,41 @@ static void usage(void)
      fprintf(stderr, "Usage:   bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -a, --all-sites                 output comparison for all sites\n");
-    fprintf(stderr, "    -c, --cluster <min,max>         min inter- and max intra-sample error [0.23,-0.3]\n");
-    fprintf(stderr, "    -g, --genotypes <file>          genotypes to compare against\n");
-    fprintf(stderr, "    -G, --GTs-only <int>            use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
-    fprintf(stderr, "    -H, --homs-only                 homozygous genotypes only (useful for low coverage data)\n");
-    fprintf(stderr, "    -p, --plot <prefix>             plot\n");
-    fprintf(stderr, "    -r, --regions <region>          restrict to comma-separated list of regions\n");
-    fprintf(stderr, "    -R, --regions-file <file>       restrict to regions listed in a file\n");
-    fprintf(stderr, "    -s, --query-sample <string>     query sample (by default the first sample is checked)\n");
-    fprintf(stderr, "    -S, --target-sample <string>    target sample in the -g file (used only for plotting)\n");
-    fprintf(stderr, "    -t, --targets <region>          similar to -r but streams rather than index-jumps\n");
-    fprintf(stderr, "    -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n");
+    //fprintf(stderr, "    -a, --all-sites                  Output comparison for all sites\n");
+    //fprintf(stderr, "    -c, --cluster MIN,MAX            Min inter- and max intra-sample error [0.23,-0.3]\n");
+    fprintf(stderr, "        --distinctive-sites            Find sites that can distinguish between at least NUM sample pairs.\n");
+    fprintf(stderr, "                  NUM[,MEM[,TMP]]          If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
+    fprintf(stderr, "                                           The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
+#ifdef _WIN32
+    fprintf(stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
+#else
+    fprintf(stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
+#endif
+    fprintf(stderr, "        --dry-run                      Stop after first record to estimate required time\n");
+    fprintf(stderr, "    -e, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(stderr, "    -g, --genotypes FILE               Genotypes to compare against\n");
+    fprintf(stderr, "    -H, --homs-only                    Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+    fprintf(stderr, "        --n-matches INT                Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
+    fprintf(stderr, "                                           Use negative value to sort by HWE probability rather than by discordance [0]\n");
+    fprintf(stderr, "        --no-HWE-prob                  Disable calculation of HWE probability\n");
+    fprintf(stderr, "    -p, --pairs LIST                   Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
+    fprintf(stderr, "    -P, --pairs-file FILE              File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
+    fprintf(stderr, "    -r, --regions REGION               Restrict to comma-separated list of regions\n");
+    fprintf(stderr, "    -R, --regions-file FILE            Restrict to regions listed in a file\n");
+    fprintf(stderr, "    -s, --samples [qry|gt]:LIST        List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
+    fprintf(stderr, "    -S, --samples-file [qry|gt]:FILE   File with the query or -g samples to compare\n");
+    fprintf(stderr, "    -t, --targets REGION               Similar to -r but streams rather than index-jumps\n");
+    fprintf(stderr, "    -T, --targets-file FILE            Similar to -R but streams rather than index-jumps\n");
+    fprintf(stderr, "    -u, --use TAG1[,TAG2]              Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+    fprintf(stderr, "Examples:\n");
+    fprintf(stderr, "   # Check discordance of all samples from B against all sample in A\n");
+    fprintf(stderr, "   bcftools gtcheck -g A.bcf B.bcf\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "   # Limit comparisons to the fiven list of samples\n");
+    fprintf(stderr, "   bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "   # Compare only two pairs a1,b1 and a1,b2\n");
+    fprintf(stderr, "   bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -732,10 +1069,19 @@ int main_vcfgtcheck(int argc, char *argv[])
  {
      int c;
      args_t *args = (args_t*) calloc(1,sizeof(args_t));
-    args->files  = bcf_sr_init();
      args->argc   = argc; args->argv = argv; set_cwd(args);
-    char *regions = NULL, *targets = NULL;
-    int regions_is_file = 0, targets_is_file = 0;
+    args->qry_use_GT = -1;
+    args->gt_use_GT  = -1;
+    args->calc_hwe_prob = 1;
+    args->use_PLs = 40;
+
+    // external sort for --distinctive-sites
+#ifdef _WIN32
+    args->es_tmp_prefix = NULL;
+#else
+    args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
+#endif
+    args->es_max_mem = strdup("500M");
  
      // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
      //    - min_inter: pairs with smaller err value will be considered identical 
@@ -746,6 +1092,8 @@ int main_vcfgtcheck(int argc, char *argv[])
  
      static struct option loptions[] =
      {
+        {"error-probability",1,0,'e'},
+        {"use",1,0,'u'},
          {"cluster",1,0,'c'},
          {"GTs-only",1,0,'G'},
          {"all-sites",0,0,'a'},
@@ -753,18 +1101,74 @@ int main_vcfgtcheck(int argc, char *argv[])
          {"help",0,0,'h'},
          {"genotypes",1,0,'g'},
          {"plot",1,0,'p'},
-        {"target-sample",1,0,'S'},
-        {"query-sample",1,0,'s'},
+        {"samples",1,0,'s'},
+        {"samples-file",1,0,'S'},
+        {"n-matches",1,0,2},
+        {"no-HWE-prob",0,0,3},
+        {"target-sample",1,0,4},
+        {"dry-run",0,0,5},
+        {"distinctive-sites",1,0,6},
          {"regions",1,0,'r'},
          {"regions-file",1,0,'R'},
          {"targets",1,0,'t'},
          {"targets-file",1,0,'T'},
+        {"pairs",1,0,'p'},
+        {"pairs-file",1,0,'P'},
          {0,0,0,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
          switch (c) {
+            case 'e':
+                args->use_PLs = strtol(optarg,&tmp,10);
+                if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
+                break;
+            case 'u':
+                {
+                    int i,nlist;
+                    char **list = hts_readlist(optarg, 0, &nlist);
+                    if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
+                    if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
+                    else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
+                    else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+                    if ( nlist==2 )
+                    {
+                        if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
+                        else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
+                        else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+                    }
+                    else args->gt_use_GT = args->qry_use_GT;
+                    for (i=0; i<nlist; i++) free(list[i]);
+                    free(list);
+                }
+                break;
+            case 2 :
+                args->ntop = strtol(optarg,&tmp,10);
+                if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
+                if ( args->ntop < 0 )
+                {
+                    args->sort_by_hwe = 1;
+                    args->ntop *= -1;
+                }
+                break;
+            case 3 : args->calc_hwe_prob = 0; break;
+            case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
+            case 5 : args->dry_run = 1; break;
+            case 6 : 
+                args->distinctive_sites = strtod(optarg,&tmp);
+                if ( *tmp )
+                {
+                    if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
+                    tmp++;
+                    free(args->es_max_mem);
+                    args->es_max_mem = strdup(tmp);
+                    while ( *tmp && *tmp!=',' ) tmp++;
+                    if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
+                }
+                args->use_PLs = 0;
+                break;
              case 'c':
+                error("The -c option is to be implemented, please open an issue on github\n");
                  args->min_inter_err = strtod(optarg,&tmp);
                  if ( *tmp )
                  {
@@ -773,50 +1177,77 @@ int main_vcfgtcheck(int argc, char *argv[])
                      if ( *tmp ) error("Could not parse: -c %s\n", optarg);
                  }
                  break;
-            case 'G':
-                args->no_PLs = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
-                break;
-            case 'a': args->all_sites = 1; break;
+            case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
+            case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
              case 'H': args->hom_only = 1; break;
              case 'g': args->gt_fname = optarg; break;
-            case 'p': args->plot = optarg; break;
-            case 'S': args->target_sample = optarg; break;
-            case 's': args->query_sample = optarg; break;
-            case 'r': regions = optarg; break;
-            case 'R': regions = optarg; regions_is_file = 1; break;
-            case 't': targets = optarg; break;
-            case 'T': targets = optarg; targets_is_file = 1; break;
+//            case 'p': args->plot = optarg; break;
+            case 's':
+                if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
+                else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
+                else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+                break;
+            case 'S': 
+                if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
+                else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
+                else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+                break;
+            case 'p': args->pair_samples = optarg; break;
+            case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
+            case 'r': args->regions = optarg; break;
+            case 'R': args->regions = optarg; args->regions_is_file = 1; break;
+            case 't': args->targets = optarg; break;
+            case 'T': args->targets = optarg; args->targets_is_file = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
          }
      }
-    char *fname = NULL;
      if ( optind==argc )
      {
-        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
+        if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-";  // reading from stdin
          else usage();   // no files given
      }
-    else fname = argv[optind];
-    if ( argc>optind+1 )  usage();  // too many files given
-    if ( !args->gt_fname ) args->cross_check = 1;   // no genotype file, run in cross-check mode
-    else args->files->require_index = 1;
-    if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
-    if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
-    if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
-    if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
-        error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
-    args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
-    if ( args->plot ) args->plot = init_prefix(args->plot);
+    else args->qry_fname = argv[optind];
+    if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n");  // too many files given
+    if ( args->pair_samples )
+    {
+        if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
+        if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
+    }
+    if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
+    if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
+    if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+
      init_data(args);
-    if ( args->cross_check )
-        cross_check_gts(args);
-    else
-        check_gt(args);
+
+    int ret;
+    while ( (ret=bcf_sr_next_line(args->files)) )
+    {
+        if ( !is_input_okay(args,ret) ) continue;
+
+        // time one record to give the user an estimate with very big files
+        struct timeval t0, t1;
+        if ( !args->ncmp )  gettimeofday(&t0, NULL);
+
+        process_line(args);
+
+        if ( args->ncmp==1 )
+        {
+            gettimeofday(&t1, NULL);
+            double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
+            fprintf(stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
+            fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            if ( args->dry_run ) break;
+        }
+    }
+    if ( !args->dry_run )
+    {
+        report(args);
+        if ( args->distinctive_sites ) report_distinctive_sites(args);
+    }
+
      destroy_data(args);
-    bcf_sr_destroy(args->files);
-    if (args->plot) free(args->plot);
      free(args);
      return 0;
  }
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c

index ae8ba7437540a3ea95335bdde8952f7fa1e32986..6ab27ede477d25908d02b8838b66c8929518c8a6 100644 (file)
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -28,8 +28,10 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
+#include <strings.h>
  #include <errno.h>
  #include <sys/stat.h>
  #include <sys/types.h>
@@ -37,240 +39,46 @@ THE SOFTWARE.  */
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
  #include <htslib/vcfutils.h>
+#include <htslib/kbitset.h>
+#include <htslib/hts_os.h>
  #include <inttypes.h>
+#include <sys/time.h>
  #include "bcftools.h"
-#include "hclust.h"
+#include "extsort.h"
+//#include "hclust.h"
  
  typedef struct
  {
-    bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
-    bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
-    int ntmp_arr, npl_arr;
-    int32_t *tmp_arr, *pl_arr;
-    double *lks, *sites, min_inter_err, max_intra_err;
-    int *cnts, *dps, hom_only, cross_check, all_sites;
-    char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
-    int argc, no_PLs, narr, nsmpl;
-}
-args_t;
-
-FILE *open_file(char **fname, const char *mode, const char *fmt, ...);
-char *msprintf(const char *fmt, ...);
-void mkdir_p(const char *fmt, ...);
-
-void py_plot(char *script)
-{
-    mkdir_p(script);
-    int len = strlen(script);
-    char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script);
-    int ret = system(cmd);
-    if ( ret ) fprintf(bcftools_stderr, "The command returned non-zero status %d: %s\n", ret, cmd);
-    free(cmd);
-}
-
-static void plot_check(args_t *args, char *target_sample, char *query_sample)
-{
-    char *fname;
-    FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
-    fprintf(fp,
-            "import matplotlib as mpl\n"
-            "mpl.use('Agg')\n"
-            "import matplotlib.pyplot as plt\n"
-            "import matplotlib.gridspec as gridspec\n"
-            "import csv\n"
-            "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
-            "\n"
-            "sample_ids = False\n"
-            "\n"
-            "dat = []\n"
-            "with open('%s.tab', 'r') as f:\n"
-            "    reader = csv.reader(f, 'tab')\n"
-            "    for row in reader:\n"
-            "        if row[0][0]=='#': continue\n"
-            "        if row[0]!='CN': continue\n"
-            "        tgt = 0\n"
-            "        if row[4]=='%s': tgt = 1\n"
-            "        dat.append([float(row[1]), float(row[2]), float(row[3]), tgt, row[4]])\n"
-            "\n"
-            "dat = sorted(dat)\n"
-            "\n"
-            "iq = -1; dp = 0\n"
-            "for i in range(len(dat)):\n"
-            "    if iq==-1 and dat[i][3]==1: iq = i\n"
-            "    dp += dat[i][2]\n"
-            "dp /= len(dat)\n"
-            "\n"
-            "fig,ax1 = plt.subplots(figsize=(8,5))\n"
-            "ax2 = ax1.twinx()\n"
-            "plots  = ax1.plot([x[0] for x in dat],'o-', ms=3, color='g', mec='g', label='Discordance (total)')\n"
-            "plots += ax1.plot([x[1] for x in dat], '^', ms=3, color='r', mec='r', label='Discordance (avg per site)')\n"
-            "plots += ax2.plot([x[2] for x in dat],'v', ms=3, color='k', label='Number of sites')\n"
-            "if iq!=-1:\n"
-            "   ax1.plot([iq],[dat[iq][0]],'o',color='orange', ms=9)\n"
-            "   ax1.annotate('%s',xy=(iq,dat[iq][0]), xytext=(5,5), textcoords='offset points',fontsize='xx-small',rotation=45,va='bottom',ha='left')\n"
-            "   ax1.plot([iq],[dat[iq][1]],'^',color='red', ms=5)\n"
-            "for tl in ax1.get_yticklabels(): tl.set_color('g')\n"
-            "for tl in ax2.get_yticklabels(): tl.set_color('k'); tl.set_fontsize(9)\n"
-            "min_dp = min([x[2] for x in dat])\n"
-            "max_dp = max([x[2] for x in dat])\n"
-            "ax2.set_ylim(min_dp-1,max_dp+1)\n"
-            "ax1.set_title('Discordance with %s')\n"
-            "ax1.set_xlim(-0.05*len(dat),1.05*(len(dat)-1))\n"
-            "ax1.set_xlabel('Sample ID')\n"
-            "plt.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)\n"
-            "if sample_ids:\n"
-            "   ax1.set_xticks(range(len(dat)))\n"
-            "   ax1.set_xticklabels([x[4] for x in dat],**{'rotation':45, 'ha':'right', 'fontsize':8})\n"
-            "   plt.subplots_adjust(bottom=0.2)\n"
-            "ax1.set_ylabel('Discordance',color='g')\n"
-            "ax2.set_ylabel('Number of sites',color='k')\n"
-            "ax2.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
-            "ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='y')\n"
-            "labels = [l.get_label() for l in plots]\n"
-            "plt.legend(plots,labels,numpoints=1,markerscale=1,loc='best',prop={'size':10},frameon=False)\n"
-            "plt.savefig('%s.png')\n"
-            "plt.close()\n"
-            "\n", args->plot, target_sample, target_sample, query_sample, args->plot
-           );
-    fclose(fp);
-    py_plot(fname);
-    free(fname);
-}
-
-#if 0
-static void plot_cross_check(args_t *args)
-{
-    char *fname;
-    FILE *fp = open_file(&fname, "w", "%s.py", args->plot);
-    fprintf(fp,
-            "import matplotlib as mpl\n"
-            "mpl.use('Agg')\n"
-            "import matplotlib.pyplot as plt\n"
-            "import matplotlib.gridspec as gridspec\n"
-            "import csv\n"
-            "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
-            "avg   = []\n"
-            "dp    = []\n"
-            "sm2id = {}\n"
-            "dat   = None\n"
-            "min   = None\n"
-            "max   = None\n"
-            "with open('%s.tab', 'r') as f:\n"
-            "   reader = csv.reader(f, 'tab')\n"
-            "   i = 0\n"
-            "   for row in reader:\n"
-            "       if row[0]=='SM':\n"
-            "           sm2id[row[4]] = i\n"
-            "           avg.append([i,float(row[1])])\n"
-            "           dp.append([i,float(row[2])])\n"
-            "           i += 1\n"
-            "       elif row[0]=='CN':\n"
-            "           val = 0\n"
-            "           if int(row[2])!=0: val = float(row[1])/int(row[2])\n"
-            "           if not dat:\n"
-            "               dat = [[0]*len(sm2id) for x in xrange(len(sm2id))]\n"
-            "               min = val\n"
-            "               max = val\n"
-            "           id_i = sm2id[row[4]]\n"
-            "           id_j = sm2id[row[5]]\n"
-            "           dat[id_i][id_j] = val\n"
-            "           dat[id_j][id_i] = val\n"
-            "           if min > val: min = val\n"
-            "           if max < val: max = val\n"
-            "\n"
-            "if len(sm2id)<=1: exit(1)\n"
-            "if min==max: exit(1)\n"
-            "\n"
-            "fig = plt.figure(figsize=(6,7))\n"
-            "gs  = gridspec.GridSpec(2, 1, height_ratios=[1, 1.5])\n"
-            "ax1 = plt.subplot(gs[0])\n"
-            "ax2 = plt.subplot(gs[1])\n"
-            "\n"
-            "ax1.plot([x[0] for x in avg],[x[1] for x in avg],'^-', ms=3, color='k')\n"
-            "ax3 = ax1.twinx()\n"
-            "ax3.plot([x[0] for x in dp],[x[1] for x in dp],'^-', ms=3, color='r',mec='r')\n"
-            "for tl in ax3.get_yticklabels():\n"
-            "   tl.set_color('r')\n"
-            "   tl.set_fontsize(9)\n"
-            "\n"
-            "im = ax2.imshow(dat,clim=(min),interpolation='nearest',origin='lower')\n"
-            "cb1  = plt.colorbar(im,ax=ax2)\n"
-            "cb1.set_label('Pairwise discordance')\n"
-            "for t in cb1.ax.get_yticklabels(): t.set_fontsize(9)\n"
-            "\n"
-            "ax1.tick_params(axis='both', which='major', labelsize=9)\n"
-            "ax1.tick_params(axis='both', which='minor', labelsize=9)\n"
-            "ax2.tick_params(axis='both', which='major', labelsize=9)\n"
-            "ax2.tick_params(axis='both', which='minor', labelsize=9)\n"
-            "\n"
-            "ax1.set_title('Sample Discordance Score')\n"
-            "ax2.set_ylabel('Sample ID')\n"
-            "ax2.set_xlabel('Sample ID')\n"
-            "ax3.set_ylabel('Average Depth',color='r')\n"
-            "ax1.set_xlabel('Sample ID')\n"
-            "ax1.set_ylabel('Average discordance')\n"
-            "\n"
-            "plt.subplots_adjust(left=0.15,right=0.87,bottom=0.08,top=0.93,hspace=0.25)\n"
-            "plt.savefig('%s.png')\n"
-            "plt.close()\n"
-            "\n", args->plot,args->plot
-           );
-    fclose(fp);
-    py_plot(fname);
-    free(fname);
-}
-#endif
-
-static void init_data(args_t *args)
-{
-    args->sm_hdr = args->files->readers[0].header;
-    if ( !bcf_hdr_nsamples(args->sm_hdr) ) error("No samples in %s?\n", args->files->readers[0].fname);
-
-    if ( !args->cross_check )
-    {
-        args->gt_hdr = args->files->readers[1].header;
-        int nsamples = bcf_hdr_nsamples(args->gt_hdr);
-        if ( !nsamples ) error("No samples in %s?\n", args->files->readers[1].fname);
-        args->lks   = (double*) calloc(nsamples,sizeof(double));
-        args->cnts  = (int*) calloc(nsamples,sizeof(int));
-        args->sites = (double*) calloc(nsamples,sizeof(double));
-        args->dps   = (int*) calloc(nsamples,sizeof(int));
-    }
+    int iqry, igt;
  }
+pair_t;
  
-static void destroy_data(args_t *args)
-{
-    free(args->lks); free(args->cnts); free(args->dps); free(args->cwd); free(args->sites);
-}
-
-static int allele_to_int(bcf1_t *line, char *allele)
+typedef struct
  {
-    int i;
-    for (i=0; i<line->n_allele; i++)
-        if ( !strcmp(allele,line->d.allele[i]) ) return i;
-    if ( strcmp(line->d.allele[i-1],"X") ) return -1;
-    return i-1;
-}
+    bcf_srs_t *files;           // first reader is the query VCF - single sample normally or multi-sample for cross-check
+    bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
+    char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
+    int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+    int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
+    double *pdiff, *qry_prob, *gt_prob;
+    uint32_t *ndiff,*ncnt,ncmp, npairs;
+    int32_t *qry_arr,*gt_arr, nqry_arr,ngt_arr;
+    uint8_t *qry_dsg, *gt_dsg;
+    pair_t *pairs;
+    double *hwe_prob, dsg2prob[8][3], pl2prob[256];
+    double min_inter_err, max_intra_err;
+    int all_sites, hom_only, ntop, cross_check, calc_hwe_prob, sort_by_hwe, dry_run, use_PLs;
+    FILE *fp;
+    unsigned int nskip_no_match, nskip_not_ba, nskip_mono, nskip_no_data, nskip_dip_GT, nskip_dip_PL;
  
-static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2ipl, int n_gt2ipl)
-{
-    int i, j;
-    for (i=0; i<n_gt2ipl; i++) gt2ipl[i] = -1;
-    for (i=0; i<gt_line->n_allele; i++)
-    {
-        // find which of the sm_alleles (k) corresponds to the gt_allele (i)
-        int k = allele_to_int(sm_line, gt_line->d.allele[i]);
-        if ( k<0 ) return 0;
-        for (j=0; j<=i; j++)
-        {
-            int l = allele_to_int(sm_line, gt_line->d.allele[j]);
-            if ( l<0 ) return 0;
-            gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k);
-        }
-    }
-    //for (i=0; i<n_gt2ipl; i++) fprintf(bcftools_stdout, "%d .. %d\n", i,gt2ipl[i]);
-    return 1;
+    // for --distinctive-sites
+    double distinctive_sites;
+    kbitset_t *kbs_diff;
+    size_t diff_sites_size;
+    extsort_t *es;
+    char *es_tmp_prefix, *es_max_mem;
  }
+args_t;
  
  static void set_cwd(args_t *args)
  {
@@ -286,7 +94,6 @@ static void set_cwd(args_t *args)
      }
      assert(buf);
  }
-
  static void print_header(args_t *args, FILE *fp)
  {
      fprintf(fp, "# This file was produced by bcftools (%s+htslib-%s), the command line was:\n", bcftools_version(), hts_version());
@@ -298,413 +105,920 @@ static void print_header(args_t *args, FILE *fp)
      fprintf(fp, "# \t %s\n#\n", args->cwd);
  }
  
-static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
+static int cmp_int(const void *_a, const void *_b)
  {
-    // PLs not present, use GTs instead.
-    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
-    int nsm_gt, i;
-    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
-        error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
-    nsm_gt /= bcf_hdr_nsamples(hdr);
-    int npl = line->n_allele*(line->n_allele+1)/2;
-    hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr);
-    for (i=0; i<bcf_hdr_nsamples(hdr); i++)
-    {
-        int *gt_ptr = args->tmp_arr + i*nsm_gt;
-        int j, *pl_ptr = args->pl_arr + i*npl;
-        if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) // missing genotype
-        {
-            for (j=0; j<npl; j++) pl_ptr[j] = -1;
-        }
-        else
-        {
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
-            int idx = bcf_alleles2gt(a,b);
-            pl_ptr[idx] = 0;
-        }
-    }
-    return npl;
+    int a = *((int*)_a);
+    int b = *((int*)_b);
+    if ( a < b ) return -1;
+    if ( a > b ) return 1;
+    return 0;
+}
+// qsort() comparator for pair_t records: ascending by iqry, ties broken by ascending igt.
+// Returns -1, 0 or 1.
+static int cmp_pair(const void *_a, const void *_b)
+{
+    pair_t *a = (pair_t*)_a;
+    pair_t *b = (pair_t*)_b;
+    if ( a->iqry < b->iqry ) return -1;
+    if ( a->iqry > b->iqry ) return 1;
+    if ( a->igt < b->igt ) return -1;
+    if ( a->igt > b->igt ) return 1;
+    return 0;
  }
  
-static int cmp_doubleptr(const void *_a, const void *_b)
+// One record handed to the external sorter for --distinctive-sites: the site (rid,pos), the
+// number of differing pairs (ndiff) and a copy of the per-pair bitset words.
+typedef struct
+{
+    uint32_t ndiff,rid,pos,rand; // rand is to shuffle sites with the same ndiff from across all chromosomes
+    unsigned long kbs_dat[1];   // first word of a variable-length tail; records are allocated with diff_sites_size bytes, see diff_sites_init()
+}
+diff_sites_t;
+#if DBG
+// Debugging aid, compiled only when DBG is non-zero. Prints one line per record to bcftools_stderr:
+// "chr:pos<TAB>ndiff<TAB>" followed by one 0/1 character for each of the args->npairs pairs.
+// Note that the record's bitset words are copied into args->kbs_diff, overwriting its current content.
+static void diff_sites_debug_print(args_t *args, diff_sites_t *ds)
+{
+    int i;
+    memcpy(args->kbs_diff->b,ds->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
+    fprintf(bcftools_stderr,"%s:%d\t%d\t",bcf_hdr_id2name(args->qry_hdr,ds->rid),ds->pos+1,ds->ndiff);
+    for (i=0; i<args->npairs; i++) fprintf(bcftools_stderr,"%d",kbs_exists(args->kbs_diff,i)?1:0);
+    fprintf(bcftools_stderr,"\n");
+}
+#endif
+// Comparison function registered with the external sorter in diff_sites_init(). Both arguments
+// are pointers to diff_sites_t pointers. Records with larger ndiff sort first; records with equal
+// ndiff are ordered by ascending rand.
+static int diff_sites_cmp(const void *aptr, const void *bptr)
+{
+    diff_sites_t *a = *((diff_sites_t**)aptr);
+    diff_sites_t *b = *((diff_sites_t**)bptr);
+    if ( a->ndiff < b->ndiff ) return 1;        // descending order
+    if ( a->ndiff > b->ndiff ) return -1;
+    if ( a->rand < b->rand ) return -1;         // ascending order among equal ndiff
+    if ( a->rand > b->rand ) return 1;
+    return 0;
+}
+// Prepare the --distinctive-sites machinery: normalize args->distinctive_sites, allocate the
+// per-pair bitset (args->kbs_diff), compute the record size and configure the external sorter.
+// Calls error() when the requested number of sites evaluates to zero or less.
+static void diff_sites_init(args_t *args)
+{
+    // values <=1 are multiplied by the number of pairs, larger values are used as given
+    int nsites = args->distinctive_sites<=1 ? args->npairs*args->distinctive_sites : args->distinctive_sites;
+    if ( nsites<=0 ) error("The value for --distinctive-sites was set too low: %d\n",nsites);
+    if ( nsites > args->npairs )
+    {
+        fprintf(bcftools_stderr,"Warning: The value for --distinctive-sites is bigger than is the number of pairs, all discordant sites be printed.\n");
+        nsites = args->npairs;
+        // NOTE(review): npairs+1 presumably serves as a threshold the consumer can never reach - confirm at the point of use
+        args->distinctive_sites = args->npairs + 1;
+    }
+    else
+        args->distinctive_sites = nsites;
+    args->kbs_diff = kbs_init(args->npairs);
+    // number of unsigned long words needed for npairs bits; must agree with what kbs_init() reports
+    size_t n = (args->npairs + KBS_ELTBITS-1) / KBS_ELTBITS;
+    assert( n==args->kbs_diff->n );
+    // diff_sites_t already holds one word in kbs_dat[1], hence only n-1 extra words per record
+    args->diff_sites_size = sizeof(diff_sites_t) + (n-1)*sizeof(unsigned long);
+    // the sorter is given the record size, the temporary file prefix, the memory limit and the comparison function
+    args->es = extsort_alloc();
+    extsort_set_opt(args->es,size_t,DAT_SIZE,args->diff_sites_size);
+    extsort_set_opt(args->es,const char*,TMP_PREFIX,args->es_tmp_prefix);
+    extsort_set_opt(args->es,const char*,MAX_MEM,args->es_max_mem);
+    extsort_set_opt(args->es,extsort_cmp_f,FUNC_CMP,diff_sites_cmp);
+    extsort_init(args->es);
+}
+static void diff_sites_destroy(args_t *args)
  {
-    double *a = *((double**)_a);
-    double *b = *((double**)_b);
-    if ( *a < *b ) return -1;
-    else if ( *a == *b ) return 0;
+    kbs_destroy(args->kbs_diff);
+    extsort_destroy(args->es);
+}
+// Clear all bits of the per-pair bitset args->kbs_diff.
+static inline void diff_sites_reset(args_t *args)
+{
+    kbs_clear(args->kbs_diff);
+}
+// Allocate a new diff_sites_t record of args->diff_sites_size bytes, fill it with the site
+// (rid,pos), its ndiff, a random tie-breaker and a copy of the current args->kbs_diff words,
+// then hand it to the external sorter.
+// NOTE(review): the malloc() result is not checked, and the record is not freed here - presumably
+// extsort_push() takes ownership of it; confirm against extsort.
+static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos)
+{
+    diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size);
+    memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added)
+    dat->ndiff = ndiff;
+    dat->rid  = rid;
+    dat->pos  = pos;
+    dat->rand = hts_lrand48();
+    // copies all kbs_diff->n words, i.e. the kbs_dat[1] member plus the tail allocated past the struct
+    memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long));
+    extsort_push(args->es,dat);
+}
+// Fetch the next record from the external sorter. Returns 0 when extsort_shift() yields no
+// record; otherwise stores the record's ndiff, rid and pos in the output arguments, copies
+// its bitset words into args->kbs_diff and returns 1.
+static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos)
+{
+    diff_sites_t *dat = (diff_sites_t*) extsort_shift(args->es);
+    if ( !dat ) return 0;
+    *ndiff = dat->ndiff;
+    *rid   = dat->rid;
+    *pos   = dat->pos;
+    memcpy(args->kbs_diff->b,dat->kbs_dat,args->kbs_diff->n*sizeof(unsigned long));
      return 1;
  }
  
-static void check_gt(args_t *args)
+static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname)
  {
-    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
-    int fake_pls = args->no_PLs;
+    int i;
+    if ( !strcmp(list,"-") )
+    {
+        *nsmpl = bcf_hdr_nsamples(hdr);
+        *smpl  = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+        for (i=0; i<*nsmpl; i++) (*smpl)[i] = i;
+        return;
+    }
  
-    // Initialize things: check which tags are defined in the header, sample names etc.
-    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
-    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
+    char **tmp = hts_readlist(list, list_is_file, nsmpl);
+    if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list);
+    *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl));
+    for (i=0; i<*nsmpl; i++)
      {
-        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
-            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
-        if ( !args->no_PLs )
-            fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
-        fake_pls = 1;
+        int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]);
+        if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]);
+        (*smpl)[i] = idx;
+        free(tmp[i]);
      }
+    free(tmp);
+    qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int);
+    // check for duplicates
+    for (i=1; i<*nsmpl; i++)
+        if ( (*smpl)[i-1]==(*smpl)[i] )
+            error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list);
+}
  
-    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : bcftools_stdout;
-    print_header(args, fp);
+static void init_data(args_t *args)
+{
+    hts_srand48(0);
  
-    int tgt_isample = -1, query_isample = 0;
-    if ( args->target_sample )
+    args->files = bcf_sr_init();
+    if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+    if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+
+    if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
+    if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
+    if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
+        error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
+
+    args->qry_hdr = bcf_sr_get_header(args->files,0);
+    if ( !bcf_hdr_nsamples(args->qry_hdr) ) error("No samples in %s?\n", args->qry_fname);
+    if ( args->gt_fname )
      {
-        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
-        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
+        args->gt_hdr = bcf_sr_get_header(args->files,1);
+        if ( !bcf_hdr_nsamples(args->gt_hdr) ) error("No samples in %s?\n", args->gt_fname);
      }
-    if ( args->all_sites )
+
+    // Determine whether GT or PL will be used
+    if ( args->qry_use_GT==-1 ) // not set by -u, qry uses PL by default
      {
-        if ( tgt_isample==-1 )
-        {
-            fprintf(bcftools_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
-            tgt_isample = 0;
-        }
+        if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")>=0 )
+            args->qry_use_GT = 0;
+        else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")>=0 )
+            args->qry_use_GT = 1;
+        else
+            error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->qry_fname);
      }
-    if ( args->query_sample )
+    else if ( args->qry_use_GT==1 )
      {
-        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
-        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
+        if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"GT")<0 )
+            error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->qry_fname);
      }
-    if ( args->all_sites )
-        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
-                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);
+    else if ( bcf_hdr_id2int(args->qry_hdr,BCF_DT_ID,"PL")<0 )
+        error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->qry_fname);
  
-    // Main loop
-    float prev_lk = 0;
-    while ( (ret=bcf_sr_next_line(args->files)) )
+    if ( args->gt_hdr )
      {
-        if ( ret!=2 ) continue;
-        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
-        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
-        bcf_unpack(sm_line, BCF_UN_FMT);
-        bcf_unpack(gt_line, BCF_UN_FMT);
-
-        // Init mapping from target genotype index to the sample's PL fields
-        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
-        if ( n_gt2ipl > m_gt2ipl )
+        if ( args->gt_use_GT==-1 ) // not set by -u, gt uses GT by default
+        {
+            if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")>=0 )
+                args->gt_use_GT = 1;
+            else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")>=0 )
+                args->gt_use_GT = 0;
+            else
+                error("[E::%s] Neither PL nor GT tag is present in the header of %s\n", __func__, args->gt_fname);
+        }
+        else if ( args->gt_use_GT==1 )
          {
-            m_gt2ipl = n_gt2ipl;
-            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
+            if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"GT")<0 )
+                error("[E::%s] The GT tag is not present in the header of %s\n", __func__, args->gt_fname);
          }
-        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;
-
-        // Target genotypes
-        int ngt, npl;
-        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
-            error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
-        ngt /= bcf_hdr_nsamples(args->gt_hdr);
-        if ( ngt!=2 ) continue; // checking only diploid genotypes
+        else if ( bcf_hdr_id2int(args->gt_hdr,BCF_DT_ID,"PL")<0 )
+            error("[E::%s] The PL tag is not present in the header of %s\n", __func__, args->gt_fname);
+    }
+    else
+        args->gt_use_GT = args->qry_use_GT;
  
-        // Sample PLs
-        if ( !fake_pls )
+    // Prepare samples
+    int i,j;
+    args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr);
+    if ( args->qry_samples )
+    {
+        init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname);
+    }
+    if ( args->gt_samples )
+    {   
+        init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl,
+            args->gt_hdr ? args->gt_hdr : args->qry_hdr,
+            args->gt_fname ? args->gt_fname : args->qry_fname);
+    }
+    else if ( args->pair_samples )
+    {
+        int npairs;
+        char **tmp = hts_readlist(args->pair_samples, args->pair_samples_is_file, &npairs);
+        if ( !tmp || !npairs ) error("Failed to parse %s\n", args->pair_samples);
+        if ( !args->pair_samples_is_file && npairs%2 ) error("Expected even number of comma-delimited samples with -p\n");
+        args->npairs = args->pair_samples_is_file ? npairs : npairs/2;
+        args->pairs  = (pair_t*) calloc(args->npairs,sizeof(*args->pairs));
+        if ( !args->pair_samples_is_file )
          {
-            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
+            for (i=0; i<args->npairs; i++)
              {
-                if ( sm_line->n_allele==1 )
-                {
-                    // PL values may not be present when ALT=. (mpileup/bcftools output), in that case 
-                    // switch automatically to GT at these sites
-                    npl = fake_PLs(args, args->sm_hdr, sm_line);
-                }
-                else
-                    error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1);
+                args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i]);
+                args->pairs[i].igt  = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, tmp[2*i+1]);
+                if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[2*i]);
+                if ( args->pairs[i].igt  < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,tmp[2*i+1]);
+                free(tmp[2*i]);
+                free(tmp[2*i+1]);
              }
-            else
-                npl /= bcf_hdr_nsamples(args->sm_hdr);
          }
          else
-            npl = fake_PLs(args, args->sm_hdr, sm_line);
+        {
+            for (i=0; i<args->npairs; i++)
+            {
+                char *ptr = tmp[i];
+                while ( *ptr && !isspace(*ptr) ) ptr++;
+                if ( !*ptr ) error("Could not parse %s: %s\n",args->pair_samples,tmp[i]);
+                *ptr = 0;
+                args->pairs[i].iqry = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]);
+                if ( args->pairs[i].iqry < 0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]);
+                ptr++;
+                while ( *ptr && isspace(*ptr) ) ptr++;
+                args->pairs[i].igt = bcf_hdr_id2int(args->gt_hdr?args->gt_hdr:args->qry_hdr, BCF_DT_SAMPLE, ptr);
+                if ( args->pairs[i].igt < 0 ) error("No such sample in %s: [%s]\n",args->gt_fname?args->gt_fname:args->qry_fname,ptr);
+                free(tmp[i]);
+            }
+        }
+        free(tmp);
+        qsort(args->pairs,args->npairs,sizeof(*args->pairs),cmp_pair);
+    }
+    else if ( args->gt_hdr )
+        args->ngt_smpl = bcf_hdr_nsamples(args->gt_hdr);
+    if ( !args->ngt_smpl )
+    {
+        args->ngt_smpl = args->nqry_smpl;
+        args->gt_smpl  = args->qry_smpl;
+        args->cross_check = 1;
+    }
+
+    // The data arrays
+    if ( !args->npairs ) args->npairs = args->cross_check ? args->nqry_smpl*(args->nqry_smpl+1)/2 : args->ngt_smpl*args->nqry_smpl;
+    if ( !args->pair_samples )
+    {
+        args->qry_dsg = (uint8_t*) malloc(args->nqry_smpl);
+        args->gt_dsg  = args->cross_check ? args->qry_dsg : (uint8_t*) malloc(args->ngt_smpl);
+    }
+    if ( args->use_PLs )
+    {
+        args->pdiff = (double*) calloc(args->npairs,sizeof(*args->pdiff));      // log probability of pair samples being the same
+        args->qry_prob = (double*) malloc(3*args->nqry_smpl*sizeof(*args->qry_prob));
+        args->gt_prob  = args->cross_check ? args->qry_prob : (double*) malloc(3*args->ngt_smpl*sizeof(*args->gt_prob));
+
+        // dsg2prob: the first index is the bitmask of 8 possible dsg combinations (only 1<<0, 1<<1 and 1<<2 are set,
+        // accessing anything else indicates an error, this is just to reuse gt_to_dsg()); the second index selects
+        // the corresponding probabilities of 0/0, 0/1, and 1/1 genotypes
+        for (i=0; i<8; i++)
+            for (j=0; j<3; j++)
+                args->dsg2prob[i][j] = HUGE_VAL;
+        args->dsg2prob[1][0] = -log(1-pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[1][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[1][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][1] = -log(1-pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[2][2] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][0] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][1] = -log(0.5*pow(10,-0.1*args->use_PLs));
+        args->dsg2prob[4][2] = -log(1-pow(10,-0.1*args->use_PLs));
  
-        // Calculate likelihoods for all samples, assuming diploid genotypes
+        // lookup table to avoid exponentiation
+        for (i=0; i<256; i++) args->pl2prob[i] = pow(10,-0.1*i);
+    }
+    else
+        args->ndiff = (uint32_t*) calloc(args->npairs,sizeof(*args->ndiff));    // number of differing genotypes for each pair of samples
+    args->ncnt  = (uint32_t*) calloc(args->npairs,sizeof(*args->ncnt));         // number of comparisons performed (non-missing data)
+    if ( !args->ncnt ) error("Error: failed to allocate %.1f Mb\n", args->npairs*sizeof(*args->ncnt)/1e6);
+    if ( args->calc_hwe_prob )
+    {
+        // prob of the observed sequence of matches given site AFs and HWE
+        args->hwe_prob = (double*) calloc(args->npairs,sizeof(*args->hwe_prob));
+        if ( !args->hwe_prob ) error("Error: failed to allocate %.1f Mb. Run with --no-HWE-prob to save some memory.\n", args->npairs*sizeof(*args->hwe_prob)/1e6);
+    }
+
+    if ( args->distinctive_sites ) diff_sites_init(args);
+
+    args->fp = bcftools_stdout;
+    print_header(args, args->fp);
+}
+
+// Release the buffers and handles held in args. Several gt_* pointers can alias their qry_*
+// counterparts, so those are freed only when they are distinct allocations.
+static void destroy_data(args_t *args)
+{
+    // in cross-check mode init_data() sets gt_dsg and gt_prob to the qry arrays
+    if ( args->gt_dsg!=args->qry_dsg ) free(args->gt_dsg);
+    free(args->qry_dsg);
+    if ( args->gt_prob!=args->qry_prob ) free(args->gt_prob);
+    free(args->qry_prob);
+    free(args->es_max_mem);
+    fclose(args->fp);       // init_data() sets args->fp to bcftools_stdout
+    if ( args->distinctive_sites ) diff_sites_destroy(args);
+    free(args->hwe_prob);
+    free(args->cwd);
+    free(args->qry_arr);
+    // without a separate genotypes file process_line() points gt_arr at qry_arr, which was freed above
+    if ( args->gt_hdr ) free(args->gt_arr);
+    free(args->pdiff);
+    free(args->ndiff);
+    free(args->ncnt);
+    free(args->qry_smpl);
+    // init_data() may set gt_smpl to the same pointer as qry_smpl
+    if ( args->gt_smpl!=args->qry_smpl ) free(args->gt_smpl);
+    free(args->pairs);
+    bcf_sr_destroy(args->files);
+}
  
-        // For faster access to genotype likelihoods (PLs) of the query sample
-        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
-        double sum_pl = 0; // for converting PLs to probs
-        for (max_ipl=0; max_ipl<npl; max_ipl++)
+// Convert the two GT values at ptr[0],ptr[1] to a one-bit dosage mask. Returns 0 when either
+// value is missing or the second one is the vector-end marker. Otherwise counts how many of the
+// two allele indexes are non-zero (0, 1 or 2) and returns 1<<count, i.e. 1, 2 or 4.
+static inline uint8_t gt_to_dsg(int32_t *ptr)
+{
+    if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) || ptr[1]==bcf_int32_vector_end ) return 0;
+    uint8_t dsg = (bcf_gt_allele(ptr[0])?1:0) + (bcf_gt_allele(ptr[1])?1:0);
+    return 1<<dsg;
+}
+// Convert the three PL values at ptr[0..2] to a dosage bitmask. Returns 0 when any of the three
+// is missing or when ptr[1] or ptr[2] is the vector-end marker. Otherwise bit 0, 1 or 2 is set for
+// each value equal to the smallest of the three, so more than one bit is set when the minimum is tied.
+static inline uint8_t pl_to_dsg(int32_t *ptr)
+{
+    if ( ptr[0]==bcf_int32_missing || ptr[1]==bcf_int32_missing || ptr[2]==bcf_int32_missing ) return 0;
+    if ( ptr[1]==bcf_int32_vector_end || ptr[2]==bcf_int32_vector_end ) return 0;
+    int min_pl = ptr[0]<ptr[1] ? (ptr[0]<ptr[2]?ptr[0]:ptr[2]) : (ptr[1]<ptr[2]?ptr[1]:ptr[2]);
+    uint8_t dsg = 0;
+    if ( ptr[0]==min_pl ) dsg |= 1;
+    if ( ptr[1]==min_pl ) dsg |= 2;
+    if ( ptr[2]==min_pl ) dsg |= 4;
+    return dsg;
+}
+// Look up the three values for the GT at ptr in args->dsg2prob, using the mask returned by
+// gt_to_dsg() as the row index, and store them in prob[0..2]. Returns that mask; when it is 0
+// (missing or incomplete genotype) prob is left untouched.
+static inline uint8_t gt_to_prob(args_t *args, int32_t *ptr, double *prob)
+{
+    uint8_t dsg = gt_to_dsg(ptr);
+    if ( dsg )
+    {
+        prob[0] = args->dsg2prob[dsg][0];
+        prob[1] = args->dsg2prob[dsg][1];
+        prob[2] = args->dsg2prob[dsg][2];
+    }
+    return dsg;
+}
+// Convert the three PL values at ptr[0..2] to -log of normalized probabilities in prob[0..2].
+// Returns the mask from pl_to_dsg(); when it is 0 prob is left untouched.
+static inline uint8_t pl_to_prob(args_t *args, int32_t *ptr, double *prob)
+{
+    uint8_t dsg = pl_to_dsg(ptr);
+    if ( dsg )
+    {
+        // table lookup instead of pow(); values outside 0..254 use the last table entry
+        prob[0] = (ptr[0]>=0 && ptr[0]<255) ? args->pl2prob[ptr[0]] : args->pl2prob[255];
+        prob[1] = (ptr[1]>=0 && ptr[1]<255) ? args->pl2prob[ptr[1]] : args->pl2prob[255];
+        prob[2] = (ptr[2]>=0 && ptr[2]<255) ? args->pl2prob[ptr[2]] : args->pl2prob[255];
+        // normalize so that the three values sum to one
+        double sum = prob[0] + prob[1] + prob[2];
+        prob[0] /= sum;
+        prob[1] /= sum;
+        prob[2] /= sum;
+        prob[0] = -log(prob[0]);
+        prob[1] = -log(prob[1]);
+        prob[2] = -log(prob[2]);
+    }
+    return dsg;
+}
+static int set_data(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec, int32_t **arr, int32_t *narr, int *narr1, int *use_GT)
+{
+    static int warn_dip_GT = 1;
+    static int warn_dip_PL = 1;
+    int i;
+    for (i=0; i<2; i++)
+    {
+        if ( *use_GT )
          {
-            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
-            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
-            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
+            int ret = bcf_get_genotypes(hdr,rec,arr,narr);
+            if ( ret < 0 )
+            {
+                if ( !i ) { *use_GT = 0; continue; }
+                args->nskip_no_data++;
+                return -1;
+            }
+            if ( ret != 2*bcf_hdr_nsamples(hdr) )
+            {
+                if ( warn_dip_GT )
+                {
+                    fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/GT fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+                    warn_dip_GT = 0;
+                }
+                args->nskip_dip_GT++;
+                return -1;
+            }
+            *narr1 = 2;
+            return 0;
          }
-        if ( sum_pl==0 ) continue; // no PLs present
-        if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;
  
-        // The main stats: concordance of the query sample with the target -g samples
-        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
+        int ret = bcf_get_format_int32(hdr,rec,"PL",arr,narr);
+        if ( ret < 0 )
          {
-            int *gt_ptr = gt_arr + i*ngt;
-            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
-            if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
-            int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
-            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
-            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
-            args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
-            args->sites[i]++;
+            if ( !i ) { *use_GT = 1; continue; }
+            args->nskip_no_data++;
+            return -1;
          }
-        if ( args->all_sites )
+        if ( ret != 3*bcf_hdr_nsamples(hdr) )
          {
-            // Print LKs at all sites for debugging
-            int *gt_ptr = gt_arr + tgt_isample*ngt;
-            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
-            int a = bcf_gt_allele(gt_ptr[0]);
-            int b = bcf_gt_allele(gt_ptr[1]);
-            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
-            fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1);
-            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
-            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
-            fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
-            prev_lk = args->lks[query_isample];
-
-            int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
-            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
-            for (igt=0; igt<npl; igt++)
-                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
-                else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
-                else fprintf(fp, "\t%d", pl_ptr[igt]);
-            fprintf(fp, "\n");
+            if ( warn_dip_PL )
+            {
+                fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", only diploid FORMAT/PL fields supported. (This is printed only once.)\n", bcf_seqname(hdr,rec),rec->pos+1);
+                warn_dip_PL = 0;
+            }
+            args->nskip_dip_PL++;
+            return -1;
          }
+        *narr1 = 3;
+        return 0;
      }
-    free(gt2ipl);
-    free(gt_arr);
-    free(args->pl_arr);
-    free(args->tmp_arr);
+    return -1;  // should never reach
+}
+static void process_line(args_t *args)   // Compare query and genotype data at the current site and update the per-pair tallies (ndiff or pdiff, hwe_prob, ncnt)
+{
+    int i,j,k, nqry1, ngt1, ret;         // nqry1,ngt1: passed by address to set_data(), then used as the per-sample stride into qry_arr/gt_arr
+
+    bcf1_t *gt_rec = NULL, *qry_rec = bcf_sr_get_line(args->files,0);   // the query file
+    int qry_use_GT = args->qry_use_GT;   // local copies; their addresses are handed to set_data()
+    int gt_use_GT  = args->gt_use_GT;
+
+    ret = set_data(args, args->qry_hdr, qry_rec, &args->qry_arr, &args->nqry_arr, &nqry1, &qry_use_GT);
+    if ( ret<0 ) return;                 // negative return from set_data(): nothing is compared at this site
  
-    // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
-    // plot as discordance per site, the latter must be scaled to the same range
-    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
-    double extreme_lk = 0, extreme_lk_per_site = 0;
-    for (i=0; i<nsamples; i++)
+    if ( args->gt_hdr )
      {
-        if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
-        if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
+        gt_rec = bcf_sr_get_line(args->files,1);
+        ret = set_data(args, args->gt_hdr, gt_rec, &args->gt_arr, &args->ngt_arr, &ngt1, &gt_use_GT);
+        if ( ret<0 ) return;
+    }
+    else
+    {
+        ngt1 = nqry1;                    // no gt_hdr: the genotype array aliases the query array
+        args->gt_arr = args->qry_arr;
      }
  
-    // Sorted output
-    double **p = (double**) malloc(sizeof(double*)*nsamples);
-    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
-    qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
+    args->ncmp++;                        // printed by report() as "sites-compared"
  
-    fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (avg score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
-    for (i=0; i<nsamples; i++)
+    double af,hwe_dsg[8];                // hwe_dsg[] is filled, and read, only when calc_hwe_prob is set
+    if ( args->calc_hwe_prob )
      {
-        int idx = p[i] - args->lks;
-        double per_site = 0;
-        if ( args->sites[idx] )
+        int ac[2];
+        if ( args->gt_hdr )
          {
-            if ( args->sites[idx] && extreme_lk_per_site )
+            if ( bcf_calc_ac(args->gt_hdr, gt_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+        }
+        else if ( bcf_calc_ac(args->qry_hdr, qry_rec, ac, BCF_UN_INFO|BCF_UN_FMT)!=1 ) error("todo: bcf_calc_ac() failed\n");
+
+        // hwe_dsg indexes correspond to the bitmask of eight dsg combinations to account for PL uncertainty,
+        // because in the extreme case we can have uninformative PL=0,0,0. So the values are the minima of hwe[], e.g.
+        //      hwe_dsg[1,2,4] ..  dsg=0,1,2 (i.e. hwe[0], hwe[1], hwe[2])
+        //      hwe_dsg[3]     ..  dsg=0 or 1
+        //      hwe_dsg[6]     ..  dsg=1 or 2
+
+        double hwe[3];                          // NOTE(review): hwe[0] uses af*af and hwe[2] uses (1-af)^2 - confirm this matches the bit order returned by gt_to_dsg()/pl_to_dsg()
+        const double min_af = 1e-5;             // cap the AF in case we get unrealistic values
+        af = (double)ac[1]/(ac[0]+ac[1]);       // NOTE(review): if ac[0]+ac[1] is 0 this is NaN (IEEE-754); all three comparisons below are then false and the min_af fallbacks are used
+        hwe[0] = af>min_af ? -log(af*af) : -log(min_af*min_af);
+        hwe[1] = af>min_af && af<1-min_af ? -log(2*af*(1-af)) : -log(2*min_af*(1-min_af));
+        hwe[2] = af<(1-min_af) ? -log((1-af)*(1-af)) : -log(min_af*min_af);
+        hwe_dsg[0] = 0;                         // empty bitmask: adds nothing when used as hwe_dsg[match] with match==0
+        for (i=1; i<8; i++)
+        {
+            hwe_dsg[i] = HUGE_VAL;
+            for (k=0; k<3; k++)
              {
-                per_site = args->lks[idx]/args->sites[idx];
-                per_site *= extreme_lk / extreme_lk_per_site;
+                if ( ((1<<k)&i) && hwe_dsg[i] > hwe[k] ) hwe_dsg[i] = hwe[k];   // keep the smallest hwe[k] among the bits set in i
              }
-            else
-                per_site = 0;
          }
-        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
      }
  
-    if ( args->plot )
+    // The sample pairs were given explicitly via -p/-P options
+    if ( args->pairs )
      {
-        if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
-        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
-    }
-}
+        if ( !args->use_PLs )
+        {
+            int ndiff = 0;                              // pairs recorded in kbs_diff at this site; stays 0 when kbs_diff is not set
+            if ( args->kbs_diff ) diff_sites_reset(args);
  
-// static inline int is_hom_most_likely(int nals, int *pls)
-// {
-//     int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
-//     for (ia=1; ia<nals; ia++)
-//     {
-//         for (ib=0; ib<ia; ib++)
-//         {
-//             if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
-//             idx++;
-//         }
-//         if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
-//         idx++;
-//     }
-//     return min_is_hom;
-// }
-
-int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
-    int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+            for (i=0; i<args->npairs; i++)
+            {
+                int32_t *ptr;
+                uint8_t qry_dsg, gt_dsg;
  
-    if ( ngt<=0 ) return 1;                 // GT not present
-    if ( ngt!=args->nsmpl*2 ) return 2;     // not diploid
-    ngt /= args->nsmpl;
-    
-    int i,j, idx = 0;
-    for (i=1; i<args->nsmpl; i++)
-    {
-        int32_t *a = args->tmp_arr + i*ngt;
-        if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
-        int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+                ptr = args->gt_arr + args->pairs[i].igt*ngt1;   // data of the genotyped sample of pair i
+                gt_dsg = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( !gt_dsg ) continue;                        // missing value
+                if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
+
+                ptr = args->qry_arr + args->pairs[i].iqry*nqry1;   // data of the query sample of pair i
+                qry_dsg = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( !qry_dsg ) continue;                       // missing value
+
+                int match = qry_dsg & gt_dsg;                   // bits set in both masks
+                if ( !match )
+                {
+                    args->ndiff[i]++;
+                    if ( args->kbs_diff ) { ndiff++; kbs_insert(args->kbs_diff, i); }
+                }
+                else if ( args->calc_hwe_prob ) args->hwe_prob[i] += hwe_dsg[match];
+                args->ncnt[i]++;                                // counted for matches and mismatches alike
+            }
  
-        for (j=0; j<i; j++)
+            if ( ndiff ) diff_sites_push(args, ndiff, qry_rec->rid, qry_rec->pos);
+        }
+        else    // use_PLs set
          {
-            int32_t *b = args->tmp_arr + j*ngt;
-            if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
-            int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+            for (i=0; i<args->npairs; i++)
+            {
+                int32_t *ptr;
+                double qry_prob[3], gt_prob[3];
+                uint8_t qry_dsg, gt_dsg;
+
+                ptr = args->gt_arr + args->pairs[i].igt*ngt1;
+                gt_dsg = gt_use_GT ? gt_to_prob(args,ptr,gt_prob) : pl_to_prob(args,ptr,gt_prob);
+                if ( !gt_dsg ) continue;                        // missing value
+                if ( args->hom_only && !(gt_dsg&5) ) continue;  // not a hom
+               
+                ptr = args->qry_arr + args->pairs[i].iqry*nqry1;
+                qry_dsg = qry_use_GT ? gt_to_prob(args,ptr,qry_prob) : pl_to_prob(args,ptr,qry_prob);
+                if ( !qry_dsg ) continue;                       // missing value
  
-            ntot[idx]++;
-            if ( agt!=bgt ) ndif[idx]++;
-            idx++;
+                double min = qry_prob[0] + gt_prob[0];          // smallest of the three element-wise sums qry_prob[k]+gt_prob[k]
+                qry_prob[1] += gt_prob[1];                      // qry_prob[1] and [2] are reused in place to hold the sums
+                if ( min > qry_prob[1] ) min = qry_prob[1];
+                qry_prob[2] += gt_prob[2];
+                if ( min > qry_prob[2] ) min = qry_prob[2];
+                args->pdiff[i] += min;
+
+                if ( args->calc_hwe_prob )
+                {
+                    int match = qry_dsg & gt_dsg;               // may be 0 here; hwe_dsg[0] is 0
+                    args->hwe_prob[i] += hwe_dsg[match];
+                }
+                args->ncnt[i]++;
+            }
          }
+        return;
      }
-    return 0;
-}
-int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
-{
-    int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
  
-    if ( npl<=0 ) return 1;                 // PL not present
-    npl /= args->nsmpl;
-    
-    int i,j,k, idx = 0;
-    for (i=1; i<args->nsmpl; i++)
+    int idx=0;                           // no explicit pairs: running index of the (i,j) pair in ndiff/pdiff/hwe_prob/ncnt
+    if ( !args->use_PLs )
      {
-        int32_t *a = args->tmp_arr + i*npl;
-        int imin = -1;
-        for (k=0; k<npl; k++)
+        for (i=0; i<args->nqry_smpl; i++)
          {
-            if ( a[k]==bcf_int32_vector_end ) break;
-            if ( a[k]==bcf_int32_missing ) continue;
-            if ( imin==-1 || a[imin] > a[k] ) imin = k;
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;   // qry_smpl, when set, maps the i-th selected sample to its index in the record
+            int32_t *ptr = args->qry_arr + nqry1*iqry;
+            args->qry_dsg[i] = qry_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
          }
-        if ( imin<0 ) { idx+=i; continue; }
-
-        for (j=0; j<i; j++)
+        if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
          {
-            int32_t *b = args->tmp_arr + j*npl;
-            int jmin = -1;
-            for (k=0; k<npl; k++)
+            for (i=0; i<args->ngt_smpl; i++)
              {
-                if ( b[k]==bcf_int32_vector_end ) break;
-                if ( b[k]==bcf_int32_missing ) continue;
-                if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+                int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+                int32_t *ptr = args->gt_arr + ngt1*igt;
+                args->gt_dsg[i] = gt_use_GT ? gt_to_dsg(ptr) : pl_to_dsg(ptr);
+                if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
+            }
+        }
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
+            if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
+            for (j=0; j<ngt; j++)
+            {
+                if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
+                int match = args->qry_dsg[i] & args->gt_dsg[j];
+                if ( !match ) args->ndiff[idx]++;
+                else if ( args->calc_hwe_prob ) args->hwe_prob[idx] += hwe_dsg[match];
+                args->ncnt[idx]++;
+                idx++;
              }
-            if ( jmin<0 ) { idx++; continue; }
-
-            ntot[idx]++;
-            if ( imin!=jmin ) ndif[idx]++;
-            idx++;
          }
      }
-    return 0;
-}
+    else    // use_PLs set
+    {
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            int32_t *ptr = args->qry_arr + nqry1*iqry;
+            args->qry_dsg[i] = qry_use_GT ? gt_to_prob(args,ptr,args->qry_prob+i*3) : pl_to_prob(args,ptr,args->qry_prob+i*3);   // three values per sample are written to qry_prob
+        }
+        if ( !args->cross_check )   // in this case gt_dsg points to qry_dsg
+        {
+            for (i=0; i<args->ngt_smpl; i++)
+            {
+                int igt = args->gt_smpl ? args->gt_smpl[i] : i;
+                int32_t *ptr = args->gt_arr + ngt1*igt;
+                args->gt_dsg[i] = gt_use_GT ? gt_to_prob(args,ptr,args->gt_prob+i*3) : pl_to_prob(args,ptr,args->gt_prob+i*3);
+                if ( args->hom_only && !(args->gt_dsg[i]&5) ) args->gt_dsg[i] = 0;      // not a hom, set to a missing value
+            }
+        }
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int ngt = args->cross_check ? i : args->ngt_smpl;       // two files or a sub-diagonal cross-check mode?
+            if ( !args->qry_dsg[i] ) { idx += ngt; continue; }      // missing value
+            for (j=0; j<ngt; j++)
+            {
+                if ( !args->gt_dsg[j] ) { idx++; continue; }        // missing value
  
-static void cross_check_gts(args_t *args)
-{
-    // Initialize things: check which tags are defined in the header, sample names etc.
-    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
-    {
-        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
-            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
-        if ( !args->no_PLs ) {
-            fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
-            args->no_PLs = 99;
+                double min = args->qry_prob[i*3] + args->gt_prob[j*3];   // smallest of the three element-wise sums, as in the explicit-pairs branch
+                if ( min > args->qry_prob[i*3+1] + args->gt_prob[j*3+1] ) min = args->qry_prob[i*3+1] + args->gt_prob[j*3+1];
+                if ( min > args->qry_prob[i*3+2] + args->gt_prob[j*3+2] ) min = args->qry_prob[i*3+2] + args->gt_prob[j*3+2];
+                args->pdiff[idx] += min;
+
+                if ( args->calc_hwe_prob )
+                {
+                    int match = args->qry_dsg[i] & args->gt_dsg[j];
+                    args->hwe_prob[idx] += hwe_dsg[match];
+                }
+                args->ncnt[idx]++;
+                idx++;
+            }
          }
      }
+}
  
-    args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
-    args->narr  = (args->nsmpl-1)*args->nsmpl/2;
  
-    uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
-    uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
+typedef struct       // one sortable candidate match, used by report()
+{
+    int ism, idx;    // ism: index of the matched sample in the compared list; idx: index of the pair in the ndiff/pdiff/hwe_prob/ncnt arrays
+    double val;      // sort key
+}
+idbl_t;
+static int cmp_idbl(const void *_a, const void *_b)   // qsort() comparator for idbl_t: ascending by val
+{
+    idbl_t *a = (idbl_t*)_a;
+    idbl_t *b = (idbl_t*)_b;
+    if ( a->val < b->val ) return -1;
+    if ( a->val > b->val ) return 1;
+    return 0;                                         // equal (or unordered) keys compare as equal
+}
+static void report_distinctive_sites(args_t *args)   // Print the DS section to args->fp: sites with the cumulative number of sample pairs they distinguish, grouped into blocks
+{
+    extsort_sort(args->es);
+
+    fprintf(args->fp,"# DS, distinctive sites:\n");
+    fprintf(args->fp,"#     - chromosome\n");
+    fprintf(args->fp,"#     - position\n");
+    fprintf(args->fp,"#     - cumulative number of pairs distinguished by this block\n");
+    fprintf(args->fp,"#     - block id\n");
+    fprintf(args->fp,"#DS\t[2]Chromosome\t[3]Position\t[4]Cumulative number of distinct pairs\t[5]Block id\n");
  
-    while ( bcf_sr_next_line(args->files) )
+    kbitset_t *kbs_blk = kbs_init(args->npairs);     // pairs already distinguished within the current block
+    kbitset_iter_t itr;
+    int i,ndiff,rid,pos,ndiff_tot = 0, iblock = 0;
+    int ndiff_min = args->distinctive_sites <= args->npairs ? args->distinctive_sites : args->npairs;   // requested number of pairs, capped at npairs
+    while ( diff_sites_shift(args,&ndiff,&rid,&pos) )   // NOTE(review): presumably this also loads the site's pairs into args->kbs_diff, which is iterated below - confirm
      {
-        bcf1_t *line = bcf_sr_get_line(args->files,0);
-
-        // use PLs unless no_PLs is set and GT exists
-        if ( args->no_PLs )
+        int ndiff_new = 0, ndiff_dbg = 0;
+        kbs_start(&itr);
+        while ( (i=kbs_next(args->kbs_diff, &itr))>=0 )
          {
-            if ( process_GT(args,line,ntot,ndif)==0 ) continue;
+            ndiff_dbg++;                             // sanity count of the set bits, checked against ndiff below
+            if ( kbs_exists(kbs_blk,i) ) continue;   // already set
+            kbs_insert(kbs_blk,i);
+            ndiff_new++;
          }
-        process_PL(args,line,ntot,ndif);
+        if ( ndiff_dbg!=ndiff ) error("Corrupted data, fixme: %d vs %d\n",ndiff_dbg,ndiff);
+        if ( !ndiff_new ) continue;     // no new pair distinguished by this site
+        ndiff_tot += ndiff_new;
+        fprintf(args->fp,"DS\t%s\t%d\t%d\t%d\n",bcf_hdr_id2name(args->qry_hdr,rid),pos+1,ndiff_tot,iblock);
+        if ( ndiff_tot < ndiff_min ) continue;   // fewer than the requested number of pairs can be distinguished at this point
+        iblock++;                                // threshold reached: start a new block with an empty set
+        ndiff_tot = 0;
+        kbs_clear(kbs_blk);
      }
-    
-    FILE *fp = bcftools_stdout;
-    print_header(args, fp);
+    kbs_destroy(kbs_blk);
+}
+static void report(args_t *args)   // Print the INFO counters and the DC (discordance) table to args->fp
+{
+    fprintf(args->fp,"INFO\tsites-compared\t%u\n",args->ncmp);
+    fprintf(args->fp,"INFO\tsites-skipped-no-match\t%u\n",args->nskip_no_match);
+    fprintf(args->fp,"INFO\tsites-skipped-multiallelic\t%u\n",args->nskip_not_ba);
+    fprintf(args->fp,"INFO\tsites-skipped-monoallelic\t%u\n",args->nskip_mono);
+    fprintf(args->fp,"INFO\tsites-skipped-no-data\t%u\n",args->nskip_no_data);
+    fprintf(args->fp,"INFO\tsites-skipped-GT-not-diploid\t%u\n",args->nskip_dip_GT);
+    fprintf(args->fp,"INFO\tsites-skipped-PL-not-diploid\t%u\n",args->nskip_dip_PL);
+    fprintf(args->fp,"# DC, discordance:\n");
+    fprintf(args->fp,"#     - query sample\n");
+    fprintf(args->fp,"#     - genotyped sample\n");
+    fprintf(args->fp,"#     - discordance (number of mismatches; smaller is better)\n");
+    fprintf(args->fp,"#     - negative log of HWE probability at matching sites (rare genotypes mataches are more informative, bigger is better)\n");
+    fprintf(args->fp,"#     - number of sites compared (bigger is better)\n");
+    fprintf(args->fp,"#DC\t[2]Query Sample\t[3]Genotyped Sample\t[4]Discordance\t[5]-log P(HWE)\t[6]Number of sites compared\n");
  
-    float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
+    int trim = args->ntop;         // non-zero: only the first ntop sorted matches per query sample are printed (used only when no explicit pairs were given)
+    if ( !args->pairs )
+    {
+        if ( !args->ngt_smpl && args->nqry_smpl <= args->ntop ) trim = 0;   // no more candidates than ntop: print everything
+        if ( args->ngt_smpl && args->ngt_smpl <= args->ntop  ) trim = 0;
+    }
  
-    // Output pairwise distances
-    fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
-    int i,j, idx = 0;
-    for (i=0; i<args->nsmpl; i++)
+    if ( args->pairs )             // explicit pairs: one DC line per pair, in the given order
      {
-        for (j=0; j<i; j++)
+        int i;
+        for (i=0; i<args->npairs; i++)
          {
-            float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
-            fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
-            PDIST(tmp,i,j) = err;
-            idx++;
+            int iqry = args->pairs[i].iqry;
+            int igt  = args->pairs[i].igt;
+            if ( args->ndiff )     // ndiff is set: print the integer mismatch counts; otherwise print the pdiff scores
+            {
+                fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                        args->qry_hdr->samples[iqry],
+                        args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                        args->ndiff[i],
+                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+                        args->ncnt[i]);
+            }
+            else
+            {
+                fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                        args->qry_hdr->samples[iqry],
+                        args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                        args->pdiff[i],
+                        args->calc_hwe_prob ? args->hwe_prob[i] : 0,
+                        args->ncnt[i]);
+            }
          }
      }
-
-    // Cluster samples
-    int nlist;
-    float clust_max_err = args->max_intra_err;
-    hclust_t *clust = hclust_init(args->nsmpl,tmp);
-    cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
-    fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
-    for (i=0; i<nlist; i++)
-    {
-        fprintf(fp,"CLUSTER\t%f", list[i].dist);
-        for (j=0; j<list[i].nmemb; j++)
-            fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
-        fprintf(fp,"\n");
-    }
-    hclust_destroy_list(list,nlist);
-    // Debugging output: the cluster graph and data used for deciding
-    char **dbg = hclust_explain(clust,&nlist);
-    for (i=0; i<nlist; i++)
-        fprintf(fp,"DBG\t%s\n", dbg[i]);
-    fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
-    fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
-    fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
-    hclust_destroy(clust);
-    free(tmp);
-
-
-    // Deprecated output for temporary backward compatibility
-    fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
-    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
-    idx = 0;
-    for (i=0; i<args->nsmpl; i++)
+    else if ( !trim )              // no trimming: every pair, walking idx in the same (i,j) order process_line() uses to fill the arrays
      {
-        for (j=0; j<i; j++)
+        int i,j,idx=0;
+        for (i=0; i<args->nqry_smpl; i++)
          {
-            fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
-            idx++;
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            int ngt  = args->cross_check ? i : args->ngt_smpl;
+            for (j=0; j<ngt; j++)
+            {
+                int igt = args->gt_smpl ? args->gt_smpl[j] : j;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                idx++;
+            }
          }
      }
-
-    free(ndif);
-    free(ntot);
-    free(args->tmp_arr);
+    else if ( !args->cross_check ) // trimmed, not cross-check: row i of the pair arrays holds ngt_smpl entries; sort each row and print its first ntop
+    {
+        idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*args->ngt_smpl);
+        int i,j;
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            int idx  = i*args->ngt_smpl;
+            for (j=0; j<args->ngt_smpl; j++)
+            {
+                if ( args->sort_by_hwe )
+                    arr[j].val = -args->hwe_prob[idx];   // negated so that the ascending sort puts the largest hwe_prob first
+                else if ( args->ndiff )
+                    arr[j].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;   // mismatches per compared site. NOTE(review): pairs with ncnt==0 get 0 and so sort first - confirm intended
+                else
+                    arr[j].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[j].ism = j;
+                arr[j].idx = idx;
+                idx++;
+            }
+            qsort(arr, args->ngt_smpl, sizeof(*arr), cmp_idbl);
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            for (j=0; j<args->ntop; j++)
+            {
+                int idx = arr[j].idx;
+                int igt = args->gt_smpl ? args->gt_smpl[arr[j].ism] : arr[j].ism;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->gt_hdr?args->gt_hdr->samples[igt]:args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+            }
+        }
+        free(arr);
+    }
+    else                           // trimmed cross-check: pair (a,b) with b<a is stored at index a*(a-1)/2+b
+    {
+        int narr = args->nqry_smpl-1;   // every sample has nqry_smpl-1 partners
+        idbl_t *arr = (idbl_t*)malloc(sizeof(*arr)*narr);
+        int i,j,k,idx;
+        for (i=0; i<args->nqry_smpl; i++)
+        {
+            k = 0, idx = i*(i-1)/2;
+            for (j=0; j<i; j++)         // partners with a smaller index than i: consecutive entries of row i
+            {
+                if ( args->sort_by_hwe )
+                    arr[k].val = -args->hwe_prob[idx];
+                else if ( args->ndiff )
+                    arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+                else
+                    arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[k].ism = j;
+                arr[k].idx = idx;
+                idx++;
+                k++;
+            }
+            for (; j<narr; j++)         // partners with a larger index: partner is j+1, found in row j+1 at column i
+            {
+                idx = j*(j+1)/2 + i;
+                if ( args->sort_by_hwe )
+                    arr[k].val = -args->hwe_prob[idx];
+                else if ( args->ndiff )
+                    arr[k].val = args->ncnt[idx] ? (double)args->ndiff[idx]/args->ncnt[idx] : 0;
+                else
+                    arr[k].val = args->ncnt[idx] ? args->pdiff[idx]/args->ncnt[idx] : 0;
+                arr[k].ism = j + 1;
+                arr[k].idx = idx;
+                k++;
+            }
+            qsort(arr, narr, sizeof(*arr), cmp_idbl);
+            int iqry = args->qry_smpl ? args->qry_smpl[i] : i;
+            for (j=0; j<args->ntop; j++)
+            {
+                if ( i <= arr[j].ism ) continue;   // only partners with a smaller index are printed. NOTE(review): skipped entries still use up one of the ntop slots
+                int idx = arr[j].idx;
+                int igt = args->qry_smpl ? args->qry_smpl[arr[j].ism] : arr[j].ism;
+                if ( args->ndiff )
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%u\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->qry_hdr->samples[igt],
+                            args->ndiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+                else
+                {
+                    fprintf(args->fp,"DC\t%s\t%s\t%e\t%e\t%u\n",
+                            args->qry_hdr->samples[iqry],
+                            args->qry_hdr->samples[igt],
+                            args->pdiff[idx],
+                            args->calc_hwe_prob ? args->hwe_prob[idx] : 0,
+                            args->ncnt[idx]);
+                }
+            }
+        }
+        free(arr);
+    }
  }
  
-static char *init_prefix(char *prefix)
+static int is_input_okay(args_t *args, int nmatch)   // Return 1 if the current site passes the checks below, 0 if it must be skipped; every skip is counted, but each kind is logged only the first time
  {
-    int len = strlen(prefix);
-    if ( prefix[len-1] == '/' || prefix[len-1] == '\\' )
-        return msprintf("%sgtcheck", prefix);
-    return strdup(prefix);
+    int i;
+    const char *msg;
+    bcf_hdr_t *hdr;
+    bcf1_t *rec;
+    if ( args->gt_hdr && nmatch!=2 )                 // with gt_hdr set, anything other than nmatch==2 is treated as "no matching record"
+    {
+        if ( args->nskip_no_match++ ) return 0;      // count every skip, log only the first one
+        for (i=0; i<2; i++)
+        {
+            rec = bcf_sr_get_line(args->files,i);
+            if ( rec ) break;                        // use whichever reader has a record, for the message only
+        }
+        hdr = bcf_sr_get_header(args->files,i);
+        fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", no record with matching POS+ALT. (This is printed only once.)\n",
+                bcf_seqname(hdr,rec),rec->pos+1);
+        return 0;
+    }
+    for (i=0; i<2; i++)                              // checks reader 0, and reader 1 only when gt_hdr is set (see the break at the end of the loop)
+    {
+        hdr = bcf_sr_get_header(args->files,i);
+        rec = bcf_sr_get_line(args->files,i);
+        if ( rec->n_allele>2 )
+        {
+            if ( args->nskip_not_ba++ ) return 0;
+            msg = "not a biallelic site, run `bcftools norm -m -` first";
+            goto not_okay;
+        }
+        if ( bcf_get_variant_types(rec)==VCF_REF )
+        {
+            if ( args->nskip_mono++ ) return 0;
+            msg = "monoallelic site";
+            goto not_okay;
+        }
+        if ( !args->gt_hdr ) break;
+    }
+    return 1;
+
+not_okay:                                            // reached only for the first skip of each kind; hdr, rec and msg were set by the loop above
+    fprintf(bcftools_stderr,"INFO: skipping %s:%"PRIhts_pos", %s. (This is printed only once.)\n", 
+        bcf_seqname(hdr,rec),rec->pos+1,msg);
+    return 0;
  }
  
  static void usage(void)
@@ -714,30 +1028,62 @@ static void usage(void)
      fprintf(bcftools_stderr, "Usage:   bcftools gtcheck [options] [-g <genotypes.vcf.gz>] <query.vcf.gz>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -a, --all-sites                 output comparison for all sites\n");
-    fprintf(bcftools_stderr, "    -c, --cluster <min,max>         min inter- and max intra-sample error [0.23,-0.3]\n");
-    fprintf(bcftools_stderr, "    -g, --genotypes <file>          genotypes to compare against\n");
-    fprintf(bcftools_stderr, "    -G, --GTs-only <int>            use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
-    fprintf(bcftools_stderr, "    -H, --homs-only                 homozygous genotypes only (useful for low coverage data)\n");
-    fprintf(bcftools_stderr, "    -p, --plot <prefix>             plot\n");
-    fprintf(bcftools_stderr, "    -r, --regions <region>          restrict to comma-separated list of regions\n");
-    fprintf(bcftools_stderr, "    -R, --regions-file <file>       restrict to regions listed in a file\n");
-    fprintf(bcftools_stderr, "    -s, --query-sample <string>     query sample (by default the first sample is checked)\n");
-    fprintf(bcftools_stderr, "    -S, --target-sample <string>    target sample in the -g file (used only for plotting)\n");
-    fprintf(bcftools_stderr, "    -t, --targets <region>          similar to -r but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "    -T, --targets-file <file>       similar to -R but streams rather than index-jumps\n");
+    //fprintf(bcftools_stderr, "    -a, --all-sites                  Output comparison for all sites\n");
+    //fprintf(bcftools_stderr, "    -c, --cluster MIN,MAX            Min inter- and max intra-sample error [0.23,-0.3]\n");
+    fprintf(bcftools_stderr, "        --distinctive-sites            Find sites that can distinguish between at least NUM sample pairs.\n");
+    fprintf(bcftools_stderr, "                  NUM[,MEM[,TMP]]          If the number is smaller or equal to 1, it is interpreted as the fraction of pairs.\n");
+    fprintf(bcftools_stderr, "                                           The optional MEM string sets the maximum memory used for in-memory sorting [500M]\n");
+#ifdef _WIN32
+    fprintf(bcftools_stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/bcftools.XXXXXX]\n");
+#else
+    fprintf(bcftools_stderr, "                                           and TMP is a prefix of temporary files used by external sorting [/tmp/bcftools.XXXXXX]\n");
+#endif
+    fprintf(bcftools_stderr, "        --dry-run                      Stop after first record to estimate required time\n");
+    fprintf(bcftools_stderr, "    -e, --error-probability INT        Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n");
+    fprintf(bcftools_stderr, "    -g, --genotypes FILE               Genotypes to compare against\n");
+    fprintf(bcftools_stderr, "    -H, --homs-only                    Homozygous genotypes only, useful with low coverage data (requires -g)\n");
+    fprintf(bcftools_stderr, "        --n-matches INT                Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n");
+    fprintf(bcftools_stderr, "                                           Use negative value to sort by HWE probability rather than by discordance [0]\n");
+    fprintf(bcftools_stderr, "        --no-HWE-prob                  Disable calculation of HWE probability\n");
+    fprintf(bcftools_stderr, "    -p, --pairs LIST                   Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n");
+    fprintf(bcftools_stderr, "    -P, --pairs-file FILE              File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
+    fprintf(bcftools_stderr, "    -r, --regions REGION               Restrict to comma-separated list of regions\n");
+    fprintf(bcftools_stderr, "    -R, --regions-file FILE            Restrict to regions listed in a file\n");
+    fprintf(bcftools_stderr, "    -s, --samples [qry|gt]:LIST        List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
+    fprintf(bcftools_stderr, "    -S, --samples-file [qry|gt]:FILE   File with the query or -g samples to compare\n");
+    fprintf(bcftools_stderr, "    -t, --targets REGION               Similar to -r but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "    -T, --targets-file FILE            Similar to -R but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "    -u, --use TAG1[,TAG2]              Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
+    fprintf(bcftools_stderr, "Examples:\n");
+    fprintf(bcftools_stderr, "   # Check discordance of all samples from B against all sample in A\n");
+    fprintf(bcftools_stderr, "   bcftools gtcheck -g A.bcf B.bcf\n");
+    fprintf(bcftools_stderr, "\n");
+    fprintf(bcftools_stderr, "   # Limit comparisons to the fiven list of samples\n");
+    fprintf(bcftools_stderr, "   bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    fprintf(bcftools_stderr, "   # Compare only two pairs a1,b1 and a1,b2\n");
+    fprintf(bcftools_stderr, "   bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf\n");
+    fprintf(bcftools_stderr, "\n");
+    bcftools_exit(1);
  }
  
  int main_vcfgtcheck(int argc, char *argv[])
  {
      int c;
      args_t *args = (args_t*) calloc(1,sizeof(args_t));
-    args->files  = bcf_sr_init();
      args->argc   = argc; args->argv = argv; set_cwd(args);
-    char *regions = NULL, *targets = NULL;
-    int regions_is_file = 0, targets_is_file = 0;
+    args->qry_use_GT = -1;
+    args->gt_use_GT  = -1;
+    args->calc_hwe_prob = 1;
+    args->use_PLs = 40;
+
+    // external sort for --distinctive-sites
+#ifdef _WIN32
+    args->es_tmp_prefix = NULL;
+#else
+    args->es_tmp_prefix = "/tmp/bcftools-gtcheck";
+#endif
+    args->es_max_mem = strdup("500M");
  
      // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
      //    - min_inter: pairs with smaller err value will be considered identical 
@@ -748,6 +1094,8 @@ int main_vcfgtcheck(int argc, char *argv[])
  
      static struct option loptions[] =
      {
+        {"error-probability",1,0,'e'},
+        {"use",1,0,'u'},
          {"cluster",1,0,'c'},
          {"GTs-only",1,0,'G'},
          {"all-sites",0,0,'a'},
@@ -755,18 +1103,74 @@ int main_vcfgtcheck(int argc, char *argv[])
          {"help",0,0,'h'},
          {"genotypes",1,0,'g'},
          {"plot",1,0,'p'},
-        {"target-sample",1,0,'S'},
-        {"query-sample",1,0,'s'},
+        {"samples",1,0,'s'},
+        {"samples-file",1,0,'S'},
+        {"n-matches",1,0,2},
+        {"no-HWE-prob",0,0,3},
+        {"target-sample",1,0,4},
+        {"dry-run",0,0,5},
+        {"distinctive-sites",1,0,6},
          {"regions",1,0,'r'},
          {"regions-file",1,0,'R'},
          {"targets",1,0,'t'},
          {"targets-file",1,0,'T'},
+        {"pairs",1,0,'p'},
+        {"pairs-file",1,0,'P'},
          {0,0,0,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hg:p:s:S:p:P:Hr:R:at:T:G:c:u:e:",loptions,NULL)) >= 0) {
          switch (c) {
+            case 'e':
+                args->use_PLs = strtol(optarg,&tmp,10);
+                if ( !tmp || *tmp ) error("Could not parse: --error-probability %s\n", optarg);
+                break;
+            case 'u':
+                {
+                    int i,nlist;
+                    char **list = hts_readlist(optarg, 0, &nlist);
+                    if ( !list || nlist<=0 || nlist>2 ) error("Failed to parse --use %s\n", optarg);
+                    if ( !strcasecmp("GT",list[0]) ) args->qry_use_GT = 1;
+                    else if ( !strcasecmp("PL",list[0]) ) args->qry_use_GT = 0;
+                    else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+                    if ( nlist==2 )
+                    {
+                        if ( !strcasecmp("GT",list[1]) ) args->gt_use_GT = 1;
+                        else if ( !strcasecmp("PL",list[1]) ) args->gt_use_GT = 0;
+                        else error("Failed to parse --use %s; only GT and PL are supported\n", optarg);
+                    }
+                    else args->gt_use_GT = args->qry_use_GT;
+                    for (i=0; i<nlist; i++) free(list[i]);
+                    free(list);
+                }
+                break;
+            case 2 :
+                args->ntop = strtol(optarg,&tmp,10);
+                if ( !tmp || *tmp ) error("Could not parse: --n-matches %s\n", optarg);
+                if ( args->ntop < 0 )
+                {
+                    args->sort_by_hwe = 1;
+                    args->ntop *= -1;
+                }
+                break;
+            case 3 : args->calc_hwe_prob = 0; break;
+            case 4 : error("The option -S, --target-sample has been deprecated\n"); break;
+            case 5 : args->dry_run = 1; break;
+            case 6 :
+                // --distinctive-sites <number>[,<max-mem>[,<tmp-prefix>]]
+                args->distinctive_sites = strtod(optarg,&tmp);
+                if ( *tmp )
+                {
+                    if ( *tmp!=',' ) error("Could not parse: --distinctive-sites %s\n", optarg);
+                    char *max_mem = ++tmp;  // start of the memory field
+                    while ( *tmp && *tmp!=',' ) tmp++;
+                    if ( *tmp ) { *tmp = 0; args->es_tmp_prefix = tmp+1; }
+                    free(args->es_max_mem); args->es_max_mem = strdup(max_mem); // copy only after the prefix tail was cut off
+                }
+                args->use_PLs = 0;
+                break;
              case 'c':
+                error("The -c option is to be implemented, please open an issue on github\n");
                  args->min_inter_err = strtod(optarg,&tmp);
                  if ( *tmp )
                  {
@@ -775,50 +1179,77 @@ int main_vcfgtcheck(int argc, char *argv[])
                      if ( *tmp ) error("Could not parse: -c %s\n", optarg);
                  }
                  break;
-            case 'G':
-                args->no_PLs = strtol(optarg,&tmp,10);
-                if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
-                break;
-            case 'a': args->all_sites = 1; break;
+            case 'G': error("The option -G, --GTs-only has been deprecated\n"); break;
+            case 'a': args->all_sites = 1; error("The -a option is to be implemented, please open an issue on github\n"); break;
              case 'H': args->hom_only = 1; break;
              case 'g': args->gt_fname = optarg; break;
-            case 'p': args->plot = optarg; break;
-            case 'S': args->target_sample = optarg; break;
-            case 's': args->query_sample = optarg; break;
-            case 'r': regions = optarg; break;
-            case 'R': regions = optarg; regions_is_file = 1; break;
-            case 't': targets = optarg; break;
-            case 'T': targets = optarg; targets_is_file = 1; break;
+//            case 'p': args->plot = optarg; break;
+            case 's':
+                if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3;
+                else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4;
+                else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+                break;
+            case 'S': 
+                if ( !strncasecmp("gt:",optarg,3) ) args->gt_samples = optarg+3, args->gt_samples_is_file = 1;
+                else if ( !strncasecmp("qry:",optarg,4) ) args->qry_samples = optarg+4, args->qry_samples_is_file = 1;
+                else error("Which one? Query samples (qry:%s) or genotype samples (gt:%s)?\n",optarg,optarg);
+                break;
+            case 'p': args->pair_samples = optarg; break;
+            case 'P': args->pair_samples = optarg; args->pair_samples_is_file = 1; break;
+            case 'r': args->regions = optarg; break;
+            case 'R': args->regions = optarg; args->regions_is_file = 1; break;
+            case 't': args->targets = optarg; break;
+            case 'T': args->targets = optarg; args->targets_is_file = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
          }
      }
-    char *fname = NULL;
      if ( optind==argc )
      {
-        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
+        if ( !isatty(fileno((FILE *)stdin)) ) args->qry_fname = "-";  // reading from stdin
          else usage();   // no files given
      }
-    else fname = argv[optind];
-    if ( argc>optind+1 )  usage();  // too many files given
-    if ( !args->gt_fname ) args->cross_check = 1;   // no genotype file, run in cross-check mode
-    else args->files->require_index = 1;
-    if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions);
-    if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets);
-    if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
-    if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) )
-        error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum));
-    args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS;
-    if ( args->plot ) args->plot = init_prefix(args->plot);
+    else args->qry_fname = argv[optind];
+    if ( argc>optind+1 ) error("Error: too many files given, run with -h for help\n");  // too many files given
+    if ( args->pair_samples )
+    {
+        if ( args->gt_samples || args->qry_samples ) error("The -p/-P option cannot be combined with -s/-S\n");
+        if ( args->ntop ) error("The --n-matches option cannot be combined with -p/-P\n");
+    }
+    if ( args->distinctive_sites && !args->pair_samples ) error("The experimental option --distinctive-sites requires -p/-P\n");
+    if ( args->hom_only && !args->gt_fname ) error("The option --homs-only requires --genotypes\n");
+    if ( args->distinctive_sites && args->use_PLs ) error("The option --distinctive-sites cannot be combined with --error-probability\n");
+
      init_data(args);
-    if ( args->cross_check )
-        cross_check_gts(args);
-    else
-        check_gt(args);
+
+    int ret;
+    while ( (ret=bcf_sr_next_line(args->files)) )
+    {
+        if ( !is_input_okay(args,ret) ) continue;
+
+        // time one record to give the user an estimate with very big files
+        struct timeval t0, t1;
+        if ( !args->ncmp )  gettimeofday(&t0, NULL);
+
+        process_line(args);
+
+        if ( args->ncmp==1 )
+        {
+            gettimeofday(&t1, NULL);
+            double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
+            fprintf(bcftools_stderr,"INFO:\tTime required to process one record .. %f seconds\n",delta/1e6);
+            fprintf(args->fp,"INFO\tTime required to process one record .. %f seconds\n",delta/1e6);
+            if ( args->dry_run ) break;
+        }
+    }
+    if ( !args->dry_run )
+    {
+        report(args);
+        if ( args->distinctive_sites ) report_distinctive_sites(args);
+    }
+
      destroy_data(args);
-    bcf_sr_destroy(args->files);
-    if (args->plot) free(args->plot);
      free(args);
      return 0;
  }
diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c

index 9f7de23f73002325bda91cd01f57c9c3d1c5f3dc..4a16d8a4b8ae63920390055cf6af29a152f79c7e 100644 (file)
--- a/bcftools/vcfindex.c
+++ b/bcftools/vcfindex.c
@@ -1,6 +1,6 @@
  /*  vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
  
-    Copyright (C) 2014-2016 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -24,6 +24,7 @@ DEALINGS IN THE SOFTWARE.  */
  
  #include <stdio.h>
  #include <stdlib.h>
+#include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
  #include <htslib/vcf.h>
@@ -37,6 +38,11 @@ DEALINGS IN THE SOFTWARE.  */
  
  #define BCF_LIDX_SHIFT    14
  
+enum {  // bit flags OR-ed into `stats` by main_vcfindex and tested in vcf_index_stats
+    per_contig = 1,  // set by -s: print one line per contig
+    total = 2        // set by -n: print only the summed record count
+};
+
  static void usage(void)
  {
      fprintf(stderr, "\n");
@@ -47,7 +53,7 @@ static void usage(void)
      fprintf(stderr, "    -c, --csi                generate CSI-format index for VCF/BCF files [default]\n");
      fprintf(stderr, "    -f, --force              overwrite index if it already exists\n");
      fprintf(stderr, "    -m, --min-shift INT      set minimal interval size for CSI indices to 2^INT [14]\n");
-    fprintf(stderr, "    -o, --output-file FILE   optional output index file name\n");
+    fprintf(stderr, "    -o, --output FILE        optional output index file name\n");
      fprintf(stderr, "    -t, --tbi                generate TBI-format index for VCF files\n");
      fprintf(stderr, "        --threads INT        use multithreading with INT worker threads [0]\n");
      fprintf(stderr, "\n");
@@ -60,65 +66,137 @@ static void usage(void)
  
  int vcf_index_stats(char *fname, int stats)
  {
-    const char **seq;
-    int i, nseq;
+    const char **seq = NULL;
+    int tid, nseq = 0, ret = 0;
      tbx_t *tbx = NULL;
+    bcf_hdr_t *hdr = NULL;
      hts_idx_t *idx = NULL;
+    htsFile *fp = NULL;
+    uint64_t sum = 0;
+    char *fntemp = NULL, *fnidx = NULL;
  
-    htsFile *fp = hts_open(fname,"r");
-    if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; }
-    bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; }
-
-    if ( hts_get_format(fp)->format==vcf )
+    /*
+     * First, has the user provided an index file? If per contig stats
+     * are requested, open the variant file (together with the index file,
+     * if provided), since the contig names can only be retrieved from its
+     * header. Otherwise, use just the corresponding index file to count
+     * the total number of records.
+     */
+    int len = strlen(fname);
+    if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
+        fntemp = strdup(fname);
+        if ( !fntemp ) return 1;
+        fntemp[fnidx-fname] = 0;
+        fname = fntemp;
+        fnidx += strlen(HTS_IDX_DELIM);
+    }
+    else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
      {
-        tbx = tbx_index_load(fname);
-        if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+        fnidx  = fname;
+        fntemp = strdup(fname);
+        fname  = fntemp;
+        fname[len-4] = 0;
      }
-    else if ( hts_get_format(fp)->format==bcf )
+
+    if ( stats&per_contig )
      {
-        idx = bcf_index_load(fname);
-        if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+        fp = hts_open(fname,"r");
+        if ( !fp ) {
+            fprintf(stderr,"Could not read %s\n", fname);
+            ret = 1; goto cleanup;
+        }
+        hdr = bcf_hdr_read(fp);
+        if ( !hdr ) {
+            fprintf(stderr,"Could not read the header: %s\n", fname);
+            ret = 1; goto cleanup;
+        }
+
+        if ( hts_get_format(fp)->format==vcf )
+        {
+            tbx = tbx_index_load2(fname, fnidx);
+            if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+        }
+        else if ( hts_get_format(fp)->format==bcf )
+        {
+            idx = bcf_index_load2(fname, fnidx);
+            if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+        }
+        else
+        {
+            fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+            return 1;
+        }
      }
-    else
+    else if ( fnidx )
      {
-        fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
-        return 1;
+        char *ext = strrchr(fnidx, '.');
+        if ( ext && strcmp(ext, ".tbi") == 0 ) {
+            tbx = tbx_index_load2(fname, fnidx);
+        } else if ( ext && strcmp(ext, ".csi") == 0 ) {
+            idx = bcf_index_load2(fname, fnidx);
+        }
+        if ( !tbx && !idx ) {
+            fprintf(stderr,"Could not load index file '%s'\n", fnidx);
+            ret = 1; goto cleanup;
+        }
+    } else {
+        char *ext = strrchr(fname, '.');
+        if ( ext && strcmp(ext, ".bcf") == 0 ) {
+            idx = bcf_index_load(fname);
+        } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
+            tbx = tbx_index_load(fname);
+        }
      }
  
-    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
-    uint64_t sum = 0;
-    for (i=0; i<nseq; i++)
+    if ( !tbx && !idx ) {
+        fprintf(stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
+        ret = 1; goto cleanup;
+    }
+
+    if ( tbx ) {
+        seq = tbx_seqnames(tbx, &nseq);
+    } else {
+        nseq = hts_idx_nseq(idx);
+    }
+
+    for (tid=0; tid<nseq; tid++)
      {
          uint64_t records, v;
-        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
-        sum+=records;
-        if (stats&2 || !records) continue;
-        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
-        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
-        printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
-    }
-    if (!sum)
+        hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+        sum += records;
+        if ( (stats&total) || !records ) continue;
+        const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
+        if ( ctg_name ) {
+            bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+            int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+            printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+        }
+    }
+    if ( !sum )
      {
          // No counts found.
          // Is this because index version has no stored count data, or no records?
          bcf1_t *rec = bcf_init1();
-        if (bcf_read1(fp, hdr, rec) >= 0)
-        {
+        if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
              fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
-            return 1;
+            ret = 1;
          }
          bcf_destroy1(rec);
      }
-    if (stats&2) printf("%" PRIu64 "\n", sum);
+    if ( (stats&total) && !ret ) {
+        printf("%" PRIu64 "\n", sum);
+    }
+
+cleanup:
      free(seq);
-    if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
+    free(fntemp);
+    if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
      bcf_hdr_destroy(hdr);
      if (tbx)
          tbx_destroy(tbx);
      if (idx)
          hts_idx_destroy(idx);
-    return 0;
+    return ret;
  }
  
  int main_vcfindex(int argc, char *argv[])
@@ -137,6 +215,7 @@ int main_vcfindex(int argc, char *argv[])
          {"nrecords",no_argument,NULL,'n'},
          {"threads",required_argument,NULL,9},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {NULL, 0, NULL, 0}
      };
  
@@ -152,8 +231,8 @@ int main_vcfindex(int argc, char *argv[])
                  min_shift = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
                  break;
-            case 's': stats |= 1; break;
-            case 'n': stats |= 2; break;
+            case 's': stats |= per_contig; break;
+            case 'n': stats |= total; break;
              case 9:
                  n_threads = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
@@ -162,7 +241,7 @@ int main_vcfindex(int argc, char *argv[])
              default: usage();
          }
      }
-    if (stats>2)
+    if (stats > total)
      {
          fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
          return 1;
diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c

index 0b7aeeb1e8bd9ece229e291fcf56d96a757766eb..acbae8958cd35016950536f2f5219d93753b5b3d 100644 (file)
--- a/bcftools/vcfindex.c.pysam.c
+++ b/bcftools/vcfindex.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
  
-    Copyright (C) 2014-2016 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE.  */
  
  #include <stdio.h>
  #include <stdlib.h>
+#include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
  #include <htslib/vcf.h>
@@ -39,6 +40,11 @@ DEALINGS IN THE SOFTWARE.  */
  
  #define BCF_LIDX_SHIFT    14
  
+enum {  // bit flags OR-ed into `stats` by main_vcfindex and tested in vcf_index_stats
+    per_contig = 1,  // set by -s: print one line per contig
+    total = 2        // set by -n: print only the summed record count
+};
+
  static void usage(void)
  {
      fprintf(bcftools_stderr, "\n");
@@ -49,7 +55,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -c, --csi                generate CSI-format index for VCF/BCF files [default]\n");
      fprintf(bcftools_stderr, "    -f, --force              overwrite index if it already exists\n");
      fprintf(bcftools_stderr, "    -m, --min-shift INT      set minimal interval size for CSI indices to 2^INT [14]\n");
-    fprintf(bcftools_stderr, "    -o, --output-file FILE   optional output index file name\n");
+    fprintf(bcftools_stderr, "    -o, --output FILE        optional output index file name\n");
      fprintf(bcftools_stderr, "    -t, --tbi                generate TBI-format index for VCF files\n");
      fprintf(bcftools_stderr, "        --threads INT        use multithreading with INT worker threads [0]\n");
      fprintf(bcftools_stderr, "\n");
@@ -57,70 +63,142 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -n, --nrecords       print number of records based on existing index file\n");
      fprintf(bcftools_stderr, "    -s, --stats          print per contig stats based on existing index file\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int vcf_index_stats(char *fname, int stats)
  {
-    const char **seq;
-    int i, nseq;
+    const char **seq = NULL;
+    int tid, nseq = 0, ret = 0;
      tbx_t *tbx = NULL;
+    bcf_hdr_t *hdr = NULL;
      hts_idx_t *idx = NULL;
+    htsFile *fp = NULL;
+    uint64_t sum = 0;
+    char *fntemp = NULL, *fnidx = NULL;
  
-    htsFile *fp = hts_open(fname,"r");
-    if ( !fp ) { fprintf(bcftools_stderr,"Could not read %s\n", fname); return 1; }
-    bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    if ( !hdr ) { fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); return 1; }
-
-    if ( hts_get_format(fp)->format==vcf )
+    /*
+     * First, has the user provided an index file? If per contig stats
+     * are requested, open the variant file (together with the index file,
+     * if provided), since the contig names can only be retrieved from its
+     * header. Otherwise, use just the corresponding index file to count
+     * the total number of records.
+     */
+    int len = strlen(fname);
+    if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
+        fntemp = strdup(fname);
+        if ( !fntemp ) return 1;
+        fntemp[fnidx-fname] = 0;
+        fname = fntemp;
+        fnidx += strlen(HTS_IDX_DELIM);
+    }
+    else if ( len>4 && (!strcasecmp(".csi",fname+len-4) || !strcasecmp(".tbi",fname+len-4)) )
      {
-        tbx = tbx_index_load(fname);
-        if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+        fnidx  = fname;
+        fntemp = strdup(fname);
+        fname  = fntemp;
+        fname[len-4] = 0;
      }
-    else if ( hts_get_format(fp)->format==bcf )
+
+    if ( stats&per_contig )
      {
-        idx = bcf_index_load(fname);
-        if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+        fp = hts_open(fname,"r");
+        if ( !fp ) {
+            fprintf(bcftools_stderr,"Could not read %s\n", fname);
+            ret = 1; goto cleanup;
+        }
+        hdr = bcf_hdr_read(fp);
+        if ( !hdr ) {
+            fprintf(bcftools_stderr,"Could not read the header: %s\n", fname);
+            ret = 1; goto cleanup;
+        }
+
+        if ( hts_get_format(fp)->format==vcf )
+        {
+            tbx = tbx_index_load2(fname, fnidx);
+            if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
+        }
+        else if ( hts_get_format(fp)->format==bcf )
+        {
+            idx = bcf_index_load2(fname, fnidx);
+            if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
+        }
+        else
+        {
+            fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
+            return 1;
+        }
      }
-    else
+    else if ( fnidx )
      {
-        fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
-        return 1;
+        char *ext = strrchr(fnidx, '.');
+        if ( ext && strcmp(ext, ".tbi") == 0 ) {
+            tbx = tbx_index_load2(fname, fnidx);
+        } else if ( ext && strcmp(ext, ".csi") == 0 ) {
+            idx = bcf_index_load2(fname, fnidx);
+        }
+        if ( !tbx && !idx ) {
+            fprintf(bcftools_stderr,"Could not load index file '%s'\n", fnidx);
+            ret = 1; goto cleanup;
+        }
+    } else {
+        char *ext = strrchr(fname, '.');
+        if ( ext && strcmp(ext, ".bcf") == 0 ) {
+            idx = bcf_index_load(fname);
+        } else if ( ext && (ext-fname) > 4 && strcmp(ext-4, ".vcf.gz") == 0 ) {
+            tbx = tbx_index_load(fname);
+        }
      }
  
-    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
-    uint64_t sum = 0;
-    for (i=0; i<nseq; i++)
+    if ( !tbx && !idx ) {
+        fprintf(bcftools_stderr,"No index file could be found for '%s'. Use 'bcftools index' to create one\n", fname);
+        ret = 1; goto cleanup;
+    }
+
+    if ( tbx ) {
+        seq = tbx_seqnames(tbx, &nseq);
+    } else {
+        nseq = hts_idx_nseq(idx);
+    }
+
+    for (tid=0; tid<nseq; tid++)
      {
          uint64_t records, v;
-        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
-        sum+=records;
-        if (stats&2 || !records) continue;
-        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
-        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
-        fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
-    }
-    if (!sum)
+        hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+        sum += records;
+        if ( (stats&total) || !records ) continue;
+        const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
+        if ( ctg_name ) {
+            bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+            int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+            fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+        }
+    }
+    if ( !sum )
      {
          // No counts found.
          // Is this because index version has no stored count data, or no records?
          bcf1_t *rec = bcf_init1();
-        if (bcf_read1(fp, hdr, rec) >= 0)
-        {
+        if (fp && hdr && rec && bcf_read1(fp, hdr, rec) >= 0) {
              fprintf(bcftools_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
-            return 1;
+            ret = 1;
          }
          bcf_destroy1(rec);
      }
-    if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum);
+    if ( (stats&total) && !ret ) {
+        fprintf(bcftools_stdout, "%" PRIu64 "\n", sum);
+    }
+
+cleanup:
      free(seq);
-    if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
+    free(fntemp);
+    if ( fp && hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__);
      bcf_hdr_destroy(hdr);
      if (tbx)
          tbx_destroy(tbx);
      if (idx)
          hts_idx_destroy(idx);
-    return 0;
+    return ret;
  }
  
  int main_vcfindex(int argc, char *argv[])
@@ -139,6 +217,7 @@ int main_vcfindex(int argc, char *argv[])
          {"nrecords",no_argument,NULL,'n'},
          {"threads",required_argument,NULL,9},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {NULL, 0, NULL, 0}
      };
  
@@ -154,8 +233,8 @@ int main_vcfindex(int argc, char *argv[])
                  min_shift = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
                  break;
-            case 's': stats |= 1; break;
-            case 'n': stats |= 2; break;
+            case 's': stats |= per_contig; break;
+            case 'n': stats |= total; break;
              case 9:
                  n_threads = strtol(optarg,&tmp,10);
                  if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
@@ -164,7 +243,7 @@ int main_vcfindex(int argc, char *argv[])
              default: usage();
          }
      }
-    if (stats>2)
+    if (stats > total)
      {
          fprintf(bcftools_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
          return 1;
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c

index 261841ce6072b4466ef1d4055812f4fdbdb66adf..1d2fab159992c4e3d6989b8357d8ba619dfdf94d 100644 (file)
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -1,6 +1,6 @@
  /*  vcfisec.c -- Create intersections, unions and complements of VCF files.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -141,7 +141,7 @@ void isec_vcf(args_t *args)
      if ( args->targets_list && files->nreaders==1 ) out_std = 1;
      if ( out_std )
      {
-        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
          if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
          if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
@@ -356,7 +356,7 @@ static void init_data(args_t *args)
  
              #define OPEN_FILE(i,j) { \
                  open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
-                args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type));  \
+                args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i]));  \
                  if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
                  if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
                  if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
@@ -397,10 +397,9 @@ static void init_data(args_t *args)
                  fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
              }
              #undef OPEN_FILE
-
-            args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
-            if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
          }
+        args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+        if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
      }
      else {
          if (args->output_fname) {
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c

index 2ef8853bb99974bf6962fcbccb18cbac6050542b..d59d7df1feb8a791e5c89d25f58ebaed848202a9 100644 (file)
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfisec.c -- Create intersections, unions and complements of VCF files.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -143,7 +143,7 @@ void isec_vcf(args_t *args)
      if ( args->targets_list && files->nreaders==1 ) out_std = 1;
      if ( out_std )
      {
-        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
          if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
          if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
@@ -358,7 +358,7 @@ static void init_data(args_t *args)
  
              #define OPEN_FILE(i,j) { \
                  open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
-                args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type));  \
+                args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i]));  \
                  if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
                  if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
                  if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
@@ -399,10 +399,9 @@ static void init_data(args_t *args)
                  fprintf(args->fh_log,"%s\tfor stripped\t%s\n", args->fnames[i], args->files->readers[i].fname);
              }
              #undef OPEN_FILE
-
-            args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
-            if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
          }
+        args->fh_sites = open_file(NULL, "w", "%s/sites.txt", args->prefix);
+        if ( !args->fh_sites ) error("%s/sites.txt: %s\n", args->prefix, strerror(errno));
      }
      else {
          if (args->output_fname) {
@@ -494,7 +493,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "   # Extract records private to A or B comparing by position only\n");
      fprintf(bcftools_stderr, "   bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfisec(int argc, char *argv[])
diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c

index 42c2bd318d4586dd1721c4ce8a4b3bb006963426..637e1b9106020d3b167d6a091596a7f10c1a2bca 100644 (file)
--- a/bcftools/vcfmerge.c
+++ b/bcftools/vcfmerge.c
@@ -1,6 +1,6 @@
  /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <unistd.h>
  #include <getopt.h>
@@ -58,6 +59,8 @@ typedef khash_t(strdict) strdict_t;
  
  #define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
  
+#define PL2PROB_MAX 1024
+
  // For merging INFO Number=A,G,R tags
  typedef struct
  {
@@ -132,6 +135,11 @@ typedef struct
      gvcf_aux_t *gvcf;   // buffer of gVCF lines, for each reader one line
      int nout_smpl;
      kstring_t *str;
+    int32_t *laa;           // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first
+    int nlaa, laa_dirty;    // number of LAA alleles actually used at this site, and was any L* added?
+    int32_t *tmpi, *k2k;
+    double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability
+    int ntmpi, ntmpd, nk2k;
  }
  maux_t;
  
@@ -141,7 +149,7 @@ typedef struct
      maux_t *maux;
      regidx_t *regs;    // apply regions only after the blocks are expanded
      regitr_t *regs_itr;
-    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
+    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
      char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
      faidx_t *gvcf_fai;
      info_rule_t *rules;
@@ -154,6 +162,7 @@ typedef struct
      bcf_hdr_t *out_hdr;
      char **argv;
      int argc, n_threads, record_cmd_line;
+    int local_alleles;    // the value of -L option
  }
  args_t;
  
@@ -262,7 +271,28 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul
          bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
      }
      else
+    {
+        int isrc, idst = 0;
+        #define BRANCH(type_t,is_missing,is_vector_end) { \
+            type_t *ptr = (type_t*) rule->vals; \
+            for (isrc=0; isrc<rule->nvals; isrc++) \
+            { \
+                if ( is_vector_end ) break; \
+                if ( is_missing ) continue; \
+                if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \
+                idst++; \
+            } \
+        }
+        switch (rule->type) {
+            case BCF_HT_INT:  BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, ptr[isrc]==bcf_int32_vector_end); break;
+            case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break;
+            default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+        }
+        #undef BRANCH
+
+        rule->nvals = idst;
          bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+    }
  }
  
  static int info_rules_comp_key2(const void *a, const void *b)
@@ -344,7 +374,7 @@ static void info_rules_init(args_t *args)
          if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
          else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
          else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); 
-        else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+        else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
  
          ss = strchr(ss, '\0'); ss++;
          if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
@@ -366,8 +396,17 @@ static void info_rules_init(args_t *args)
                      bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
                      bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
                      ) ? 1 : 0;
-            if ( is_join && is_agr )
-                error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+            if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR )
+            {
+                bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL);
+                hrec = bcf_hrec_dup(hrec);
+                int i = bcf_hrec_find_key(hrec, "Number");
+                if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag);
+                free(hrec->vals[i]);
+                hrec->vals[i] = strdup(".");
+                bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag);
+                bcf_hdr_add_hrec(args->out_hdr, hrec);
+            }
              if ( !is_join && !is_agr )
                  error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
          }
@@ -689,7 +728,7 @@ maux_t *maux_init(args_t *args)
      assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) );
      if ( args->do_gvcf )
      {
-        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));  // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
          for (i=0; i<ma->n; i++)
              ma->gvcf[i].line = bcf_init1();
      }
@@ -699,6 +738,13 @@ maux_t *maux_init(args_t *args)
      for (i=0; i<ma->n; i++)
          ma->buf[i].rid = -1;
      ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
+    if ( args->local_alleles )
+    {
+        ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles));
+        ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob));
+        for (i=0; i<PL2PROB_MAX; i++)
+            ma->pl2prob[i] = pow(10,-0.1*i);
+    }
      return ma;
  }
  void maux_destroy(maux_t *ma)
@@ -737,6 +783,11 @@ void maux_destroy(maux_t *ma)
      free(ma->smpl_ploidy);
      free(ma->smpl_nGsize);
      free(ma->chr);
+    free(ma->laa);
+    free(ma->tmpi);
+    free(ma->k2k);
+    free(ma->tmpd);
+    free(ma->pl2prob);
      free(ma);
  }
  void maux_expand1(buffer_t *buf, int size)
@@ -1325,6 +1376,171 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
      return max_ploidy;
  }
  
+// Sets ma->laa to local indexes relevant for each sample or missing/vector_end.
+// The indexes are with respect to the source indexes and must be translated as
+// the very last step.
+void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,l, ismpl = 0, nlaa = 0;
+    static int warned = 0;
+
+    hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd); // allele probabilities
+    hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi);    // indexes of the sorted probabilities
+
+    // Let map[] be the mapping from src to output idx. Then k2k[] is mapping from src allele idxs to src allele idxs
+    // reordered so that if i<j then map[k2k[i]] < map[k2k[j]]
+    hts_expand(int,out->n_allele,ma->nk2k,ma->k2k);
+
+    // Determine local alleles: either take all that are present in the reader or use PL to determine the best
+    // subset for each sample. The alleles must be listed in the order of the alleles in the output file.
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        bcf_fmt_t *fmt_ori = ma->fmt_map[files->nreaders*ifmt_PL+i];
+        bcf1_t *line = maux_get_line(args, i);
+        int nsmpl = bcf_hdr_nsamples(hdr);
+        if ( line )
+        {
+            if ( nlaa < line->n_allele - 1 )
+                nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles;
+
+            for (j=0; j<line->n_allele; j++) ma->k2k[j] = j;
+
+            if ( line->n_allele <= args->local_alleles + 1 )
+            {
+                // sort to the output order, insertion sort, ascending 
+                int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+                int *k2k = ma->k2k;
+                int tmp;
+                for (k=1; k<line->n_allele; k++)
+                    for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--)
+                        tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp;
+
+                // fewer than the allowed number of alleles, use all alleles from this file
+                for (j=0; j<nsmpl; j++)
+                {
+                    int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                    for (k=0; k<line->n_allele; k++) ptr[k] = k2k[k];
+                    for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                    ismpl++;
+                }
+                continue;
+            }
+        }
+        if ( !line || !fmt_ori )
+        {
+            // no values, fill in missing values
+            for (j=0; j<nsmpl; j++)
+            {
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                ptr[0] = bcf_int32_missing;
+                for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                ismpl++;
+            }
+            continue;
+        }
+
+        // there are more alternate alleles in the input files than is allowed on output, need to subset
+        if ( ifmt_PL==-1 )
+        {
+            if ( !warned )
+                fprintf(stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n");
+            warned = 1;
+            ma->nlaa = 0;
+            return;
+        }
+
+        if ( !IS_VL_G(hdr, fmt_ori->id) ) error("FORMAT/PL must be defined as Number=G\n");
+        if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n");
+
+        int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+        double *allele_prob = ma->tmpd;
+        int *idx = ma->tmpi;
+        #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+            src_type_t *src = (src_type_t*) fmt_ori->p; \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
+                for (k=0; k<line->n_allele; k++) \
+                    for (l=0; l<=k; l++) \
+                    { \
+                        if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
+                        double prob = ma->pl2prob[pl2prob_idx]; \
+                        allele_prob[k] += prob; \
+                        allele_prob[l] += prob; \
+                        src++; \
+                    } \
+                /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
+                allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
+                int si,sj,tmp; \
+                for (si=0; si<line->n_allele-1; si++) idx[si] = si; \
+                for (si=1; si<line->n_allele-1; si++) \
+                    for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \
+                        tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \
+                /*for debugging only: test order*/ \
+                for (si=1; si<line->n_allele-1; si++) \
+                    assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \
+                allele_prob--; idx--; /* this was to keep REF first */ \
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \
+                ptr[0] = 0; \
+                for (k=1; k<=args->local_alleles && k<line->n_allele; k++) ptr[k] = idx[k]+1; \
+                int kmax = k; \
+                for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \
+                /* insertion sort by indexes to the output order, ascending */ \
+                for (k=1; k<kmax; k++) \
+                    for (l=k; l>0 && map[ptr[l]] < map[ptr[l-1]]; l--) \
+                        tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)
+        {
+            case BCF_BT_INT8:  BRANCH( int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *src); break;
+            case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            default: error("Unexpected case: %d, PL\n", fmt_ori->type);
+        }
+        #undef BRANCH
+    }
+    ma->nlaa = nlaa;
+}
+
+// Translates ma->laa in place from per-sample blocks of (1+args->local_alleles) source
+// allele indexes (REF first) to per-sample blocks of ma->nlaa output allele indexes
+// (REF dropped), using each reader's map[], and stores the result as FORMAT/LAA in out.
+//
+// The compaction is safe to do in place because ma->nlaa never exceeds args->local_alleles,
+// so for every sample the destination block lies at or before the source block.
+void update_local_alleles(args_t *args, bcf1_t *out)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr);
+    for (i=0; i<files->nreaders; i++)
+    {
+        int irec = ma->buf[i].cur;
+        bcf_sr_t *reader = &files->readers[i];
+        int nsmpl = bcf_hdr_nsamples(reader->header);
+        for (k=0; k<nsmpl; k++)
+        {
+            int32_t *src = ma->laa + ismpl*(1+args->local_alleles);
+            int32_t *dst = ma->laa + ismpl*ma->nlaa;
+            j = 0;
+            if ( irec>=0 )
+            {
+                // src[0] is the REF slot, the ALT indexes start at src[1]
+                for (; j<ma->nlaa; j++)
+                {
+                    if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing;
+                    else if ( src[j+1]==bcf_int32_vector_end ) break;
+                    else
+                        dst[j] = ma->buf[i].rec[irec].map[src[j+1]];
+                }
+            }
+            if ( j==0 ) dst[j++] = bcf_int32_missing;
+            // Pad the output block, not the source block: writing to src[] would leave
+            // stale values in the tail of dst[] for every sample but the first
+            for (; j<ma->nlaa; j++) dst[j] = bcf_int32_vector_end;
+            ismpl++;
+        }
+    }
+    bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa);
+}
+
  void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
@@ -1333,7 +1549,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
      static int warned = 0;
  
-    int nsize = 0, msize = sizeof(int32_t);
+    int nsize = 0;
      for (i=0; i<files->nreaders; i++)
      {
          bcf_fmt_t *fmt = fmt_map[i];
@@ -1343,17 +1559,18 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      }
      if ( nsize==0 ) nsize = 1;
  
-    if ( ma->ntmp_arr < nsamples*nsize*msize )
+    size_t msize = sizeof(int32_t)*nsize*nsamples;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nsize*msize;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+        ma->ntmp_arr = msize;
      }
      memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
  
@@ -1509,6 +1726,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
                      int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew);
                      if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret);
                  }
+                if ( nmax < str->l ) nmax = str->l;
                  src += fmt_ori->size;
              }
              continue;
@@ -1520,17 +1738,18 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
                "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key);
      }
      // update the record
-    if ( ma->ntmp_arr < nsamples*nmax )
+    size_t msize = nsamples*nmax;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nmax;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+        ma->ntmp_arr = msize;
      }
      char *tgt = (char*) ma->tmp_arr;
      for (i=0; i<nsamples; i++)
@@ -1542,6 +1761,204 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
      bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax);
  }
  
+// Writes the localized version of a Number=G FORMAT tag: for each sample only the values
+// of genotypes formed by its local alleles (ma->laa) are kept, and the result is stored
+// in the output record under the original tag name prefixed with 'L'.
+//
+//  fmt_map .. per-reader pointers to this tag's FORMAT data, NULL where a reader lacks it
+//  out     .. the merged output record to update
+//  irdr    .. index of a reader whose fmt_map entry is used to look up the tag name and type
+//
+// The tag is skipped with a one-time warning when the temporary buffer would exceed
+// 2147483647 bytes. Sets ma->laa_dirty when the tag was written.
+//
+// Note: only diploid Number=G tags only for now
+void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{
+    int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    bcf_fmt_t *fmt = fmt_map[irdr];
+    const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key;
+    size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2;             // max number of Number=G localized fields
+    // the buffer is shared by the int32 and float paths, size it for the larger of the two
+    size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+    msize *= nsamples*nsize;
+    if ( msize > 2147483647 )
+    {
+        static int warned = 0;
+        if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
+    }
+    // ismpl runs over the output samples across all readers
+    int ismpl = 0;
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        bcf_fmt_t *fmt_ori = fmt_map[i];
+        bcf1_t *line = maux_get_line(args, i);
+        int nsmpl = bcf_hdr_nsamples(hdr);
+
+        if ( !fmt_ori )
+        {
+            // fill missing values
+            // each sample gets one missing value followed by vector_end padding
+            #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+                for (j=0; j<nsmpl; j++) \
+                { \
+                    tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                    tgt_set_missing; \
+                    for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+                    ismpl++; \
+                } \
+            }
+            // the target type follows reader irdr, all integer widths are written as int32
+            switch (fmt->type)
+            {
+                case BCF_BT_INT8:  BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+                default: error("Unexpected case: %d, %s\n", fmt->type, key);
+            }
+            #undef BRANCH
+            continue;
+        }
+        // only vectors with exactly n_allele*(n_allele+1)/2 values per sample are handled
+        if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n");
+
+        // localize
+        // For every pair of the sample's local alleles copy the value at the source genotype index
+        // bcf_alleles2gt(laa[ii],laa[ij]); the unused tail of the sample's block is padded with vector_end
+        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+                int ii,ij,tgt_idx = 0; \
+                for (ii=0; ii<=ma->nlaa; ii++) \
+                { \
+                    if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+                    for (ij=0; ij<=ii; ij++) \
+                    { \
+                        int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \
+                        if ( src_is_missing ) tgt_set_missing; \
+                        else if ( src_is_vector_end ) break; \
+                        else tgt[tgt_idx] = src[src_idx]; \
+                        tgt_idx++; \
+                    } \
+                } \
+                if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+                for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)
+        {
+            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+        }
+        #undef BRANCH
+    }
+    // the output tag name is the source tag name prefixed with 'L'
+    args->tmps.l = 0;
+    kputc('L',&args->tmps);
+    kputs(key,&args->tmps);
+    if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+        bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+    else
+        bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+    ma->laa_dirty = 1;
+}
+// Writes the localized version of a Number=A or Number=R FORMAT tag: for each sample only
+// the values belonging to its local alleles (ma->laa) are kept, and the result is stored
+// in the output record under the original tag name prefixed with 'L'.
+//
+//  fmt_map .. per-reader pointers to this tag's FORMAT data, NULL where a reader lacks it
+//  out     .. the merged output record to update
+//  irdr    .. index of a reader whose fmt_map entry is used to look up the tag name, type and length
+//
+// The tag is skipped with a one-time warning when the temporary buffer would exceed
+// 2147483647 bytes. Sets ma->laa_dirty when the tag was written.
+void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{
+    int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    bcf_fmt_t *fmt = fmt_map[irdr];
+    const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key;
+    // Number=R has one more value per sample than Number=A
+    size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa;
+    // the buffer is shared by the int32 and float paths, size it for the larger of the two
+    size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
+    msize *= nsamples*nsize;
+    if ( msize > 2147483647 )
+    {
+        static int warned = 0;
+        if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
+    }
+    // ismpl runs over the output samples across all readers; ibeg is the first laa[] slot to use,
+    // 0 for Number=R and 1 for Number=A, and is also subtracted from laa[] to get the source index
+    int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 0 : 1;;
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        bcf_fmt_t *fmt_ori = fmt_map[i];
+        int nsmpl = bcf_hdr_nsamples(hdr);
+
+        if ( !fmt_ori )
+        {
+            // fill missing values
+            // each sample gets one missing value followed by vector_end padding
+            #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+                for (j=0; j<nsmpl; j++) \
+                { \
+                    tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                    tgt_set_missing; \
+                    for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+                    ismpl++; \
+                } \
+            }
+            // the target type follows reader irdr, all integer widths are written as int32
+            switch (fmt->type)
+            {
+                case BCF_BT_INT8:  BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+                default: error("Unexpected case: %d, %s\n", fmt->type, key);
+            }
+            #undef BRANCH
+            continue;
+        }
+
+        // localize
+        // For each of the sample's local alleles copy the value at source index laa[ii]-ibeg;
+        // the unused tail of the sample's block is padded with vector_end
+        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+                int ii,tgt_idx = 0; \
+                for (ii=ibeg; ii<=ma->nlaa; ii++) \
+                { \
+                    if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+                    int src_idx = laa[ii] - ibeg; \
+                    if ( src_is_missing ) tgt_set_missing; \
+                    else if ( src_is_vector_end ) break; \
+                    else tgt[tgt_idx] = src[src_idx]; \
+                    tgt_idx++; \
+                } \
+                if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+                for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)
+        {
+            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+        }
+        #undef BRANCH
+    }
+    // the output tag name is the source tag name prefixed with 'L'
+    args->tmps.l = 0;
+    kputc('L',&args->tmps);
+    kputs(key,&args->tmps);
+    if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+        bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+    else
+        bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+    ma->laa_dirty = 1;
+}
  void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
@@ -1579,6 +1996,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
          }
          if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
      }
+    if ( ma->nlaa && length!=BCF_VL_FIXED )
+    {
+        if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i);
+        else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i);
+        return;
+    }
+
      if ( type==BCF_BT_CHAR )
      {
          merge_format_string(args, key, fmt_map, out, length, nsize);
@@ -1586,17 +2010,18 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      }
  
      size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
-    if ( ma->ntmp_arr < nsamples*nsize*msize )
+    msize *= nsamples*nsize;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nsize*msize;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
      }
  
      // Fill the temp array for all samples by collecting values from all files
@@ -1790,7 +2215,7 @@ void merge_format(args_t *args, bcf1_t *out)
      khiter_t kitr;
      strdict_t *tmph = args->tmph;
      kh_clear(strdict, tmph);
-    int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+    int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index
      for (i=0; i<files->nreaders; i++)
      {
          bcf1_t *line = maux_get_line(args,i);
@@ -1820,6 +2245,7 @@ void merge_format(args_t *args, bcf1_t *out)
                          memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
                          ma->nfmt_map = max_ifmt+1;
                      }
+                    if ( key[0]=='P' && key[1]=='L' && key[2]==0  ) { has_PL = ifmt; }
                  }
                  kitr = kh_put(strdict, tmph, key, &ret);
                  kh_value(tmph, kitr) = ifmt;
@@ -1833,6 +2259,12 @@ void merge_format(args_t *args, bcf1_t *out)
          ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
      }
  
+    if ( args->local_alleles )
+    {
+        ma->laa_dirty = ma->nlaa = 0;
+        if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL);
+    }
+
      out->n_sample = bcf_hdr_nsamples(out_hdr);
      if ( has_GT )
          merge_GT(args, ma->fmt_map, out);
@@ -1840,6 +2272,10 @@ void merge_format(args_t *args, bcf1_t *out)
  
      for (i=1; i<=max_ifmt; i++)
          merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+
+    if ( ma->laa_dirty )
+        update_local_alleles(args, out);
+
      out->d.indiv_dirty = 1;
  }
  
@@ -2041,6 +2477,23 @@ void gvcf_flush(args_t *args, int done)
      }
  }
  
+/*
+    Returns 1 if the record looks like a gVCF block, 0 otherwise.
+
+    A record is not a block when it spans a single base (rlen<=1) or when the
+    length of the REF string equals rlen. Otherwise it is a block when it has no
+    ALT allele at all, or when any ALT is one of the symbolic alleles <*>,
+    <NON_REF> or <X>.
+
+    Reads line->d.allele, so the alleles must already be accessible there.
+ */
+static inline int is_gvcf_block(bcf1_t *line)
+{
+    static const char *const block_alleles[] = { "<*>", "<NON_REF>", "<X>" };
+    const int nblock_alleles = (int)(sizeof(block_alleles)/sizeof(block_alleles[0]));
+
+    // the rlen test comes first, REF is examined only for multi-base records
+    if ( line->rlen<=1 || strlen(line->d.allele[0])==line->rlen ) return 0;
+
+    // multi-base record with REF only
+    if ( line->n_allele==1 ) return 1;
+
+    int ial, isym;
+    for (ial=1; ial<line->n_allele; ial++)
+        for (isym=0; isym<nblock_alleles; isym++)
+            if ( !strcmp(line->d.allele[ial],block_alleles[isym]) ) return 1;
+
+    return 0;
+}
+static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
+
  /*
      Check incoming lines for new gVCF blocks, set pointer to the current source
      buffer (gvcf or readers).  In contrast to gvcf_flush, this function can be
@@ -2059,6 +2512,7 @@ void gvcf_stage(args_t *args, int pos)
      maux->gvcf_min = INT_MAX;
      for (i=0; i<files->nreaders; i++)
      {
+        if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0;
          if ( gaux[i].active )
          {
              // gvcf block should not overlap with another record
@@ -2077,7 +2531,7 @@ void gvcf_stage(args_t *args, int pos)
          int irec = maux->buf[i].beg;
          bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
          bcf1_t *line = args->files->readers[i].buffer[irec];
-        int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+        int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
          if ( ret==1 )
          {
              if ( end[0] == line->pos + 1 )  // POS and INFO/END are identical, treat as if a normal w/o INFO/END
@@ -2218,7 +2672,6 @@ void debug_state(args_t *args)
      fprintf(stderr,"\n");
  }
  
-
  /*
     Determine which line should be merged from which reader: go through all
     readers and all buffered lines, expand REF,ALT and try to match lines with
@@ -2227,7 +2680,6 @@ void debug_state(args_t *args)
  int can_merge(args_t *args)
  {
      bcf_srs_t *files = args->files;
-    int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
      maux_t *maux = args->maux;
      gvcf_aux_t *gaux = maux->gvcf;
      char *id = NULL, ref = 'N';
@@ -2240,6 +2692,9 @@ int can_merge(args_t *args)
      }
      maux->var_types = maux->nals = 0;
  
+    // this is only for the `-m none -g` mode, ensure that <*> lines come last
+    #define VCF_GVCF_REF 1
+
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
@@ -2257,12 +2712,17 @@ int can_merge(args_t *args)
              buf->rec[j].skip = SKIP_DIFF;
              ntodo++;
  
+            bcf1_t *line = buf->lines[j];
              if ( args->merge_by_id )
-                id = buf->lines[j]->d.id;
+                id = line->d.id;
              else
              {
-                int var_type = bcf_get_variant_types(buf->lines[j]);
-                maux->var_types |= var_type ? var_type<<1 : 1;
+                int var_type = bcf_get_variant_types(line);
+                maux->var_types |= var_type ? var_type<<2 : 2;
+
+                // for the `-m none -g` mode
+                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
+                    maux->var_types |= VCF_GVCF_REF;
              }
          }
  
@@ -2294,7 +2754,7 @@ int can_merge(args_t *args)
              bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
  
              int line_type = bcf_get_variant_types(line);
-            line_type = line_type ? line_type<<1 : 1;
+            line_type = line_type ? line_type<<2 : 2;
  
              // select relevant lines
              if ( args->merge_by_id )
@@ -2303,6 +2763,12 @@ int can_merge(args_t *args)
              }
              else
              {
+                // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
+                // records come last, otherwise infinite loop is created (#1164)
+                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
+                {
+                    if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
+                }
                  if ( args->collapse==COLLAPSE_NONE && maux->nals )
                  {
                      // All alleles of the tested record must be present in the
@@ -2366,7 +2832,6 @@ int can_merge(args_t *args)
  */
  void stage_line(args_t *args)
  {
-    int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
      bcf_srs_t *files = args->files;
      maux_t *maux = args->maux;
  
@@ -2436,13 +2901,9 @@ void stage_line(args_t *args)
  
  void merge_line(args_t *args)
  {
-    if ( args->regs )
-    {
-        if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
-    }
-
      bcf1_t *out = args->out_line;
      merge_chrom2qual(args, out);
+    if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return;
      merge_filter(args, out);
      merge_info(args, out);
      if ( args->do_gvcf )
@@ -2490,9 +2951,59 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c
      error_errno("[%s] Failed to add program information to header", __func__);
  }
  
+/*
+    Add header definitions of the localized ("L"-prefixed) FORMAT tags.
+
+    For every FORMAT header record defined as Number=G, Number=A or Number=R a new
+    header line is composed from the original one: the ID is prefixed with "L",
+    Number is replaced by ".", and a Description value which starts with a double
+    quote gets the prefix "Localized field: ". The IDX key is not copied, all
+    other keys are copied verbatim. If at least one such line was composed, the
+    FORMAT/LAA definition is appended, followed by the composed lines.
+
+    The composed lines are collected first and appended only after the scan of
+    hdr->hrec has finished (presumably because appending modifies hdr->hrec).
+
+    args .. only args->tmps is used, as a scratch string buffer
+    hdr  .. the header to scan and extend
+
+    Terminates via error() if a line cannot be formatted, duplicated or appended.
+ */
+void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr)
+{
+    char **str = NULL;
+    int i,j, nstr = 0, mstr = 0;
+    for (i=0; i<hdr->nhrec; i++)
+    {
+        if ( hdr->hrec[i]->type!=BCF_HL_FMT ) continue;
+        j = bcf_hrec_find_key(hdr->hrec[i],"ID");
+        if ( j<0 ) continue;
+        char *key = hdr->hrec[i]->vals[j];
+        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+        assert( id>=0 );
+
+        // only tags whose length depends on the number of alleles are localized
+        int vlen = bcf_hdr_id2length(hdr,BCF_HL_FMT,id);
+        if ( vlen!=BCF_VL_G && vlen!=BCF_VL_A && vlen!=BCF_VL_R ) continue;
+        args->tmps.l = 0;
+
+        // e accumulates formatting failures, nout counts the key=value pairs written so far
+        uint32_t e = 0, nout = 0;
+        e |= ksprintf(&args->tmps, "##%s=<", hdr->hrec[i]->key) < 0;
+        for (j=0; j<hdr->hrec[i]->nkeys; j++)
+        {
+            if ( !strcmp("IDX",hdr->hrec[i]->keys[j]) ) continue;
+            if ( nout ) e |= kputc(',',&args->tmps) < 0;
+            if ( !strcmp("ID",hdr->hrec[i]->keys[j]) )
+                e |= ksprintf(&args->tmps,"%s=L%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0;
+            else if ( !strcmp("Number",hdr->hrec[i]->keys[j]) )
+                e |= ksprintf(&args->tmps,"Number=.") < 0;
+            else if ( !strcmp("Description",hdr->hrec[i]->keys[j]) && hdr->hrec[i]->vals[j][0]=='"' )
+                e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", hdr->hrec[i]->vals[j]+1) < 0;
+            else
+                e |= ksprintf(&args->tmps,"%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0;
+            nout++;
+        }
+        e |= ksprintf(&args->tmps,">\n") < 0;
+        if ( e ) error("Failed to format the header line for %s\n", key);
+        nstr++;
+        hts_expand(char*,nstr,mstr,str);
+        str[nstr-1] = strdup(args->tmps.s);
+        if ( !str[nstr-1] ) error("Failed to allocate memory for the header line for %s\n", key);
+    }
+    if ( !nstr ) return;
+    if ( bcf_hdr_append(hdr,"##FORMAT=<ID=LAA,Number=.,Type=Integer,Description=\"Localized alleles: subset of alternate alleles relevant for each sample\">") < 0 )
+        error("Failed to add the FORMAT/LAA header line\n");
+    for (i=0; i<nstr; i++)
+    {
+        // str[i] already ends with a newline
+        if ( bcf_hdr_append(hdr, str[i]) < 0 ) error("Failed to add the header line: %s", str[i]);
+        free(str[i]);
+    }
+    free(str);
+}
  void merge_vcf(args_t *args)
  {
-    args->out_fh  = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    args->out_fh  = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
      args->out_hdr = bcf_hdr_init("w");
@@ -2509,6 +3020,7 @@ void merge_vcf(args_t *args)
              char buf[24]; snprintf(buf,sizeof buf,"%d",i+1);
              merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
          }
+        if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr);
          if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
          if (bcf_hdr_sync(args->out_hdr) < 0)
              error_errno("[%s] Failed to update header", __func__);
@@ -2580,7 +3092,9 @@ static void usage(void)
      fprintf(stderr, "    -g, --gvcf <-|ref.fa>              merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
      fprintf(stderr, "    -i, --info-rules <tag:method,..>   rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
      fprintf(stderr, "    -l, --file-list <file>             read file names from the file\n");
+    fprintf(stderr, "    -L, --local-alleles <int>          EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
      fprintf(stderr, "    -m, --merge <string>               allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+    fprintf(stderr, "        --no-index                     merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
      fprintf(stderr, "        --no-version                   do not append version and command line to the header\n");
      fprintf(stderr, "    -o, --output <file>                write output to a file [standard output]\n");
      fprintf(stderr, "    -O, --output-type <b|u|z|v>        'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
@@ -2608,6 +3122,7 @@ int main_vcfmerge(int argc, char *argv[])
      {
          {"help",no_argument,NULL,'h'},
          {"merge",required_argument,NULL,'m'},
+        {"local-alleles",required_argument,NULL,'L'},
          {"gvcf",required_argument,NULL,'g'},
          {"file-list",required_argument,NULL,'l'},
          {"missing-to-ref",no_argument,NULL,'0'},
@@ -2622,11 +3137,19 @@ int main_vcfmerge(int argc, char *argv[])
          {"regions-file",required_argument,NULL,'R'},
          {"info-rules",required_argument,NULL,'i'},
          {"no-version",no_argument,NULL,8},
+        {"no-index",no_argument,NULL,10},
          {"filter-logic",required_argument,NULL,'F'},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
+    char *tmp;
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
          switch (c) {
+            case 'L':
+                args->local_alleles = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
+                if ( args->local_alleles < 1 )
+                    error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg);
+                break;
              case 'F': 
                  if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
                  else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
@@ -2672,6 +3195,7 @@ int main_vcfmerge(int argc, char *argv[])
              case  3 : args->force_samples = 1; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->no_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -2680,7 +3204,13 @@ int main_vcfmerge(int argc, char *argv[])
      if ( argc==optind && !args->file_list ) usage();
      if ( argc-optind<2 && !args->file_list ) usage();
  
-    args->files->require_index = 1;
+    if ( args->no_index )
+    {
+        if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
+        bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX);
+    }
+    else
+        bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
      if ( args->regions_list )
      {
          if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c

index 651ea5191cbe4de24f6c7a69e029eed65a29f8a2..0f1c94c543edb1bc460187886712ef1b34d7b360 100644 (file)
--- a/bcftools/vcfmerge.c.pysam.c
+++ b/bcftools/vcfmerge.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <string.h>
  #include <strings.h>
+#include <assert.h>
  #include <errno.h>
  #include <unistd.h>
  #include <getopt.h>
@@ -60,6 +61,8 @@ typedef khash_t(strdict) strdict_t;
  
  #define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
  
+#define PL2PROB_MAX 1024
+
  // For merging INFO Number=A,G,R tags
  typedef struct
  {
@@ -134,6 +137,11 @@ typedef struct
      gvcf_aux_t *gvcf;   // buffer of gVCF lines, for each reader one line
      int nout_smpl;
      kstring_t *str;
+    int32_t *laa;           // localized alternate alleles given as input-based indexes in per-sample blocks of (args->local_alleles+1) values, 0 is always first
+    int nlaa, laa_dirty;    // number of LAA alleles actually used at this site, and was any L* added?
+    int32_t *tmpi, *k2k;
+    double *tmpd, *pl2prob; // mapping from phred-score likelihoods (PL) to probability
+    int ntmpi, ntmpd, nk2k;
  }
  maux_t;
  
@@ -143,7 +151,7 @@ typedef struct
      maux_t *maux;
      regidx_t *regs;    // apply regions only after the blocks are expanded
      regitr_t *regs_itr;
-    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
+    int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index;
      char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
      faidx_t *gvcf_fai;
      info_rule_t *rules;
@@ -156,6 +164,7 @@ typedef struct
      bcf_hdr_t *out_hdr;
      char **argv;
      int argc, n_threads, record_cmd_line;
+    int local_alleles;    // the value of -L option
  }
  args_t;
  
@@ -264,7 +273,28 @@ static void info_rules_merge_join(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rul
          bcf_update_info_string(hdr,line,rule->hdr_tag,rule->vals);
      }
      else
+    {
+        int isrc, idst = 0;
+        #define BRANCH(type_t,is_missing,is_vector_end) { \
+            type_t *ptr = (type_t*) rule->vals; \
+            for (isrc=0; isrc<rule->nvals; isrc++) \
+            { \
+                if ( is_vector_end ) break; \
+                if ( is_missing ) continue; \
+                if ( idst!=isrc ) ptr[idst] = ptr[isrc]; \
+                idst++; \
+            } \
+        }
+        switch (rule->type) {
+            case BCF_HT_INT:  BRANCH(int32_t, ptr[isrc]==bcf_int32_missing, ptr[isrc]==bcf_int32_vector_end); break;
+            case BCF_HT_REAL: BRANCH(float, bcf_float_is_missing(ptr[isrc]), bcf_float_is_vector_end(ptr[isrc])); break;
+            default: error("TODO: %s:%d .. type=%d\n", __FILE__,__LINE__, rule->type);
+        }
+        #undef BRANCH
+
+        rule->nvals = idst;
          bcf_update_info(hdr,line,rule->hdr_tag,rule->vals,rule->nvals,rule->type);
+    }
  }
  
  static int info_rules_comp_key2(const void *a, const void *b)
@@ -346,7 +376,7 @@ static void info_rules_init(args_t *args)
          if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
          else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
          else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char); 
-        else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+        else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
  
          ss = strchr(ss, '\0'); ss++;
          if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
@@ -368,8 +398,17 @@ static void info_rules_init(args_t *args)
                      bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_G ||
                      bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)==BCF_VL_R
                      ) ? 1 : 0;
-            if ( is_join && is_agr )
-                error("Cannot -i %s:join on Number=[AGR] tags is not supported.\n", rule->hdr_tag);
+            if ( is_join && bcf_hdr_id2length(args->out_hdr,BCF_HL_INFO,id)!=BCF_VL_VAR )
+            {
+                bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->out_hdr, BCF_HL_INFO, "ID", rule->hdr_tag, NULL);
+                hrec = bcf_hrec_dup(hrec);
+                int i = bcf_hrec_find_key(hrec, "Number");
+                if ( i<0 ) error("Uh, could not find the entry Number in the header record of %s\n",rule->hdr_tag);
+                free(hrec->vals[i]);
+                hrec->vals[i] = strdup(".");
+                bcf_hdr_remove(args->out_hdr,BCF_HL_INFO, rule->hdr_tag);
+                bcf_hdr_add_hrec(args->out_hdr, hrec);
+            }
              if ( !is_join && !is_agr )
                  error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
          }
@@ -691,7 +730,7 @@ maux_t *maux_init(args_t *args)
      assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) );
      if ( args->do_gvcf )
      {
-        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+        ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));  // -Walloc-size-larger-than gives a harmless warning caused by signed integer ma->n
          for (i=0; i<ma->n; i++)
              ma->gvcf[i].line = bcf_init1();
      }
@@ -701,6 +740,13 @@ maux_t *maux_init(args_t *args)
      for (i=0; i<ma->n; i++)
          ma->buf[i].rid = -1;
      ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
+    if ( args->local_alleles )
+    {
+        ma->laa = (int32_t*)malloc(sizeof(*ma->laa)*ma->nout_smpl*(1+args->local_alleles));
+        ma->pl2prob = (double*)malloc(PL2PROB_MAX*sizeof(*ma->pl2prob));
+        for (i=0; i<PL2PROB_MAX; i++)
+            ma->pl2prob[i] = pow(10,-0.1*i);
+    }
      return ma;
  }
  void maux_destroy(maux_t *ma)
@@ -739,6 +785,11 @@ void maux_destroy(maux_t *ma)
      free(ma->smpl_ploidy);
      free(ma->smpl_nGsize);
      free(ma->chr);
+    free(ma->laa);
+    free(ma->tmpi);
+    free(ma->k2k);
+    free(ma->tmpd);
+    free(ma->pl2prob);
      free(ma);
  }
  void maux_expand1(buffer_t *buf, int size)
@@ -1107,7 +1158,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
                  case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break;
                  case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break;
                  case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break;
-                default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+                default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
              }
              #undef BRANCH
          }
@@ -1137,7 +1188,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i
                  case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break;
                  case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break;
                  case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break;
-                default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
+                default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); bcftools_exit(1);
              }
              #undef BRANCH
          }
@@ -1327,6 +1378,171 @@ static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
      return max_ploidy;
  }
  
+/*
+    Sets ma->laa to local indexes relevant for each sample or missing/vector_end.
+    The indexes are with respect to the source indexes and must be translated as
+    the very last step.
+
+    ma->laa is organized in per-sample blocks of (args->local_alleles+1) values.
+    A block starts with 0 (REF), continues with the selected ALT indexes listed in
+    the order of the alleles in the output record, and is padded with
+    bcf_int32_vector_end. A block with no data starts with bcf_int32_missing.
+
+    For each reader:
+      - if the current line has at most args->local_alleles ALT alleles, all of its
+        alleles are used for all of its samples
+      - if there is no current line, or the line has no FORMAT/PL, the samples are
+        set to missing
+      - otherwise the ALT alleles are ranked per sample by allele_prob, the sum of
+        pl2prob[PL] over the genotypes which include the allele, and the top
+        args->local_alleles are kept
+
+    args    .. uses args->files, args->maux and args->local_alleles
+    out     .. the output record, out->n_allele is used to size the work arrays
+    ifmt_PL .. index of FORMAT/PL in ma->fmt_map, or -1 if the tag is not present
+
+    On return ma->nlaa is the number of local ALT alleles to output. It is set to 0
+    when a subset would have to be selected but FORMAT/PL is not available; a
+    warning is printed the first time this happens.
+ */
+void init_local_alleles(args_t *args, bcf1_t *out, int ifmt_PL)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,l, ismpl = 0, nlaa = 0;
+    static int warned = 0;
+
+    hts_expand(double,out->n_allele,ma->ntmpd,ma->tmpd); // allele probabilities
+    hts_expand(int,out->n_allele,ma->ntmpi,ma->tmpi);    // indexes of the sorted probabilities
+
+    // Let map[] be the mapping from src to output idx. Then k2k[] is mapping from src allele idxs to src allele idxs
+    // reordered so that if i<j then map[k2k[i]] < map[k2k[j]]
+    hts_expand(int,out->n_allele,ma->nk2k,ma->k2k);
+
+    // Determine local alleles: either take all that are present in the reader or use PL to determine the best
+    // subset for each sample. The alleles must be listed in the order of the alleles in the output file.
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        // ifmt_PL is -1 when FORMAT/PL is absent, fmt_map must not be indexed in that case
+        bcf_fmt_t *fmt_ori = ifmt_PL>=0 ? ma->fmt_map[files->nreaders*ifmt_PL+i] : NULL;
+        bcf1_t *line = maux_get_line(args, i);
+        int nsmpl = bcf_hdr_nsamples(hdr);
+        if ( line )
+        {
+            // nlaa is the largest number of ALTs seen in any reader, capped at args->local_alleles
+            if ( nlaa < line->n_allele - 1 )
+                nlaa = line->n_allele - 1 <= args->local_alleles ? line->n_allele - 1 : args->local_alleles;
+
+            for (j=0; j<line->n_allele; j++) ma->k2k[j] = j;
+
+            if ( line->n_allele <= args->local_alleles + 1 )
+            {
+                // sort to the output order, insertion sort, ascending 
+                int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+                int *k2k = ma->k2k;
+                int tmp;
+                for (k=1; k<line->n_allele; k++)
+                    for (l=k; l>0 && map[k2k[l]] < map[k2k[l-1]]; l--)
+                        tmp = k2k[l], k2k[l] = k2k[l-1], k2k[l-1] = tmp;
+
+                // fewer than the allowed number of alleles, use all alleles from this file
+                for (j=0; j<nsmpl; j++)
+                {
+                    int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                    for (k=0; k<line->n_allele; k++) ptr[k] = k2k[k];
+                    for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                    ismpl++;
+                }
+                continue;
+            }
+        }
+        // A line with too many alleles and no PL tag at all (ifmt_PL==-1) is handled
+        // by the warning branch below rather than filled with missing values
+        if ( !line || (ifmt_PL>=0 && !fmt_ori) )
+        {
+            // no values, fill in missing values
+            for (j=0; j<nsmpl; j++)
+            {
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl;
+                ptr[0] = bcf_int32_missing;
+                for (k=1; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end;
+                ismpl++;
+            }
+            continue;
+        }
+
+        // there are more alternate alleles in the input files than is allowed on output, need to subset
+        if ( ifmt_PL==-1 )
+        {
+            if ( !warned )
+                fprintf(bcftools_stderr,"Warning: local alleles are determined from FORMAT/PL but the tag is missing, cannot apply --local-alleles\n");
+            warned = 1;
+            ma->nlaa = 0;
+            return;
+        }
+
+        if ( !IS_VL_G(hdr, fmt_ori->id) ) error("FORMAT/PL must be defined as Number=G\n");
+        if ( 2*fmt_ori->n != line->n_allele*(line->n_allele+1) ) error("Todo: haploid PL to LPL\n");
+
+        int *map = ma->buf[i].rec[ma->buf[i].cur].map;
+        double *allele_prob = ma->tmpd;
+        int *idx = ma->tmpi;
+        #define BRANCH(src_type_t, src_is_missing, src_is_vector_end, pl2prob_idx) { \
+            src_type_t *src = (src_type_t*) fmt_ori->p; \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                for (k=0; k<line->n_allele; k++) allele_prob[k] = 0; \
+                /* PL values are read in the order of genotypes (l,k) with l<=k; each value is credited to both alleles */ \
+                for (k=0; k<line->n_allele; k++) \
+                    for (l=0; l<=k; l++) \
+                    { \
+                        if ( src_is_missing || src_is_vector_end ) { src++; continue; } \
+                        double prob = ma->pl2prob[pl2prob_idx]; \
+                        allele_prob[k] += prob; \
+                        allele_prob[l] += prob; \
+                        src++; \
+                    } \
+                /* insertion sort by allele probability, descending order, with the twist that REF (idx=0) always comes first */ \
+                allele_prob++; idx[0] = -1; idx++; /* keep REF first */ \
+                int si,sj,tmp; \
+                for (si=0; si<line->n_allele-1; si++) idx[si] = si; \
+                for (si=1; si<line->n_allele-1; si++) \
+                    for (sj=si; sj>0 && allele_prob[idx[sj]] > allele_prob[idx[sj-1]]; sj--) \
+                        tmp = idx[sj], idx[sj] = idx[sj-1], idx[sj-1] = tmp; \
+                /*for debugging only: test order*/ \
+                for (si=1; si<line->n_allele-1; si++) \
+                    assert( allele_prob[idx[si-1]] >= allele_prob[idx[si]] ); \
+                allele_prob--; idx--; /* this was to keep REF first */ \
+                int32_t *ptr = ma->laa + (1+args->local_alleles)*ismpl; \
+                ptr[0] = 0; \
+                /* idx[1..] are 0-based ranks among the ALT alleles, +1 converts them to allele indexes */ \
+                for (k=1; k<=args->local_alleles && k<line->n_allele; k++) ptr[k] = idx[k]+1; \
+                int kmax = k; \
+                for (; k<=args->local_alleles; k++) ptr[k] = bcf_int32_vector_end; \
+                /* insertion sort by indexes to the output order, ascending */ \
+                for (k=1; k<kmax; k++) \
+                    for (l=k; l>0 && map[ptr[l]] < map[ptr[l-1]]; l--) \
+                        tmp = ptr[l], ptr[l] = ptr[l-1], ptr[l-1] = tmp; \
+                ismpl++; \
+            } \
+        }
+        // pl2prob has PL2PROB_MAX entries: out-of-range PL values are mapped to the last entry
+        switch (fmt_ori->type)
+        {
+            case BCF_BT_INT8:  BRANCH( int8_t, *src==bcf_int8_missing,  *src==bcf_int8_vector_end,  *src>=0 ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *src>=0 && *src<PL2PROB_MAX ? *src : PL2PROB_MAX-1); break;
+            default: error("Unexpected case: %d, PL\n", fmt_ori->type);
+        }
+        #undef BRANCH
+    }
+    ma->nlaa = nlaa;
+}
+
+/*
+    Translate ma->laa from source allele indexes to output allele indexes and
+    write the result as FORMAT/LAA.
+
+    On entry ma->laa holds per-sample blocks of (args->local_alleles+1) values
+    where the first value of each block is the REF slot. The blocks are compacted
+    in place to ma->nlaa values per sample: the REF slot is dropped, the remaining
+    indexes are translated through the reader's allele map
+    (ma->buf[i].rec[irec].map) and unused slots are padded with
+    bcf_int32_vector_end. Samples with no local allele, including all samples of a
+    reader with ma->buf[i].cur<0, get a single bcf_int32_missing value.
+
+    The in-place compaction relies on ma->nlaa not exceeding args->local_alleles,
+    so that a write to dst never overtakes a value of src which is yet to be read.
+ */
+void update_local_alleles(args_t *args, bcf1_t *out)
+{
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    int i,j,k,ismpl=0,nsamples = bcf_hdr_nsamples(args->out_hdr);
+    for (i=0; i<files->nreaders; i++)
+    {
+        int irec = ma->buf[i].cur;
+        bcf_sr_t *reader = &files->readers[i];
+        int nsmpl = bcf_hdr_nsamples(reader->header);
+        for (k=0; k<nsmpl; k++)
+        {
+            int32_t *src = ma->laa + ismpl*(1+args->local_alleles);  // sparse input block
+            int32_t *dst = ma->laa + ismpl*ma->nlaa;                 // compacted output block
+            j = 0;
+            if ( irec>=0 )
+            {
+                for (; j<ma->nlaa; j++)
+                {
+                    // src[j+1]: skip the leading REF slot
+                    if ( src[j+1]==bcf_int32_missing ) dst[j] = bcf_int32_missing;
+                    else if ( src[j+1]==bcf_int32_vector_end ) break;
+                    else
+                        dst[j] = ma->buf[i].rec[irec].map[src[j+1]];
+                }
+            }
+            if ( j==0 ) dst[j++] = bcf_int32_missing;
+            // pad the compacted block; writing to src here would leave stale values in the output
+            for (; j<ma->nlaa; j++) dst[j] = bcf_int32_vector_end;
+            ismpl++;
+        }
+    }
+    bcf_update_format_int32(args->out_hdr, out, "LAA", ma->laa, nsamples*ma->nlaa);
+}
+
  void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
@@ -1335,7 +1551,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr);
      static int warned = 0;
  
-    int nsize = 0, msize = sizeof(int32_t);
+    int nsize = 0;
      for (i=0; i<files->nreaders; i++)
      {
          bcf_fmt_t *fmt = fmt_map[i];
@@ -1345,17 +1561,18 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      }
      if ( nsize==0 ) nsize = 1;
  
-    if ( ma->ntmp_arr < nsamples*nsize*msize )
+    size_t msize = sizeof(int32_t)*nsize*nsamples;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nsize*msize;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+        ma->ntmp_arr = msize;
      }
      memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
  
@@ -1511,6 +1728,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
                      int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew);
                      if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret);
                  }
+                if ( nmax < str->l ) nmax = str->l;
                  src += fmt_ori->size;
              }
              continue;
@@ -1522,17 +1740,18 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
                "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key);
      }
      // update the record
-    if ( ma->ntmp_arr < nsamples*nmax )
+    size_t msize = nsamples*nmax;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nmax;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",msize);
+        ma->ntmp_arr = msize;
      }
      char *tgt = (char*) ma->tmp_arr;
      for (i=0; i<nsamples; i++)
@@ -1544,6 +1763,204 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf
      bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax);
  }
  
+// Write the localized copy (tag name "L"+key) of a Number=G FORMAT tag: per sample, keep only the values of genotypes formed by that sample's entries in ma->laa. Note: only diploid Number=G tags are handled for now
+void merge_localized_numberG_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)
+{
+    int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    bcf_fmt_t *fmt = fmt_map[irdr];  // field of reader irdr; its id gives the tag name and its type selects the fill type for readers lacking the tag
+    const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt_map[irdr]->id].key;
+    size_t nsize = (ma->nlaa+1)*(ma->nlaa+2)/2;             // max number of Number=G localized fields
+    size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);  // bytes per value, the wider of float and int32_t
+    msize *= nsamples*nsize;  // bytes needed for nsize values for each output sample
+    if ( msize > 2147483647 )  // row would exceed INT32_MAX bytes: warn (only once per run) and skip this tag
+    {
+        static int warned = 0;
+        if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )  // grow the shared scratch buffer; ntmp_arr records its size in bytes
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
+    }
+    int ismpl = 0;  // running sample index in the output row, advanced across all readers
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        bcf_fmt_t *fmt_ori = fmt_map[i];  // NULL when reader i does not carry this tag
+        bcf1_t *line = maux_get_line(args, i);
+        int nsmpl = bcf_hdr_nsamples(hdr);
+
+        if ( !fmt_ori )
+        {
+            // fill missing values: each sample of this reader gets one "missing" value followed by nsize-1 "vector end" values
+            #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+                for (j=0; j<nsmpl; j++) \
+                { \
+                    tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                    tgt_set_missing; \
+                    for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+                    ismpl++; \
+                } \
+            }
+            switch (fmt->type)  // all integer widths are written as int32_t
+            {
+                case BCF_BT_INT8:  BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+                default: error("Unexpected case: %d, %s\n", fmt->type, key);
+            }
+            #undef BRANCH
+            continue;
+        }
+        if ( 2*fmt_ori->n!=line->n_allele*(line->n_allele+1) ) error("Todo: localization of missing or haploid Number=G tags\n");  // requires exactly n_allele*(n_allele+1)/2 values per sample
+
+        // localize: for every pair (ii,ij<=ii) of the sample's laa entries copy the source value at bcf_alleles2gt(laa[ii],laa[ij]); the laa row of a sample has 1+args->local_alleles entries and stops at a missing/vector_end entry; unused target slots are set to vector_end
+        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+                int ii,ij,tgt_idx = 0; \
+                for (ii=0; ii<=ma->nlaa; ii++) \
+                { \
+                    if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+                    for (ij=0; ij<=ii; ij++) \
+                    { \
+                        int src_idx = bcf_alleles2gt(laa[ii],laa[ij]); \
+                        if ( src_is_missing ) tgt_set_missing; \
+                        else if ( src_is_vector_end ) break; \
+                        else tgt[tgt_idx] = src[src_idx]; \
+                        tgt_idx++; \
+                    } \
+                } \
+                if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+                for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)  // source values are read in their stored width; integers are widened to int32_t
+        {
+            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+        }
+        #undef BRANCH
+    }
+    args->tmps.l = 0;  // build the output tag name in the reusable string buffer: 'L' followed by the original key
+    kputc('L',&args->tmps);
+    kputs(key,&args->tmps);
+    if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+        bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+    else
+        bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+    ma->laa_dirty = 1;  // flag that a localized tag was written for this record
+}
+void merge_localized_numberAR_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out, int irdr)  // Write the localized copy (tag name "L"+key) of a Number=A or Number=R FORMAT tag, keeping per sample only the values selected by its entries in ma->laa
+{
+    int i,j,k, nsamples = bcf_hdr_nsamples(args->out_hdr);
+    bcf_srs_t *files = args->files;
+    maux_t *ma = args->maux;
+    bcf_fmt_t *fmt = fmt_map[irdr];  // field of reader irdr; its id gives the tag name and Number, its type selects the fill type for readers lacking the tag
+    const char *key = files->readers[irdr].header->id[BCF_DT_ID][fmt->id].key;
+    size_t nsize = IS_VL_R(files->readers[irdr].header, fmt->id) ? ma->nlaa + 1 : ma->nlaa;  // values per sample: nlaa+1 for Number=R, nlaa otherwise
+    size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);  // bytes per value, the wider of float and int32_t
+    msize *= nsamples*nsize;  // bytes needed for nsize values for each output sample
+    if ( msize > 2147483647 )  // row would exceed INT32_MAX bytes: warn (only once per run) and skip this tag
+    {
+        static int warned = 0;
+        if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )  // grow the shared scratch buffer; ntmp_arr records its size in bytes
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
+    }
+    int ismpl = 0, ibeg = IS_VL_R(files->readers[irdr].header, fmt->id) ? 0 : 1;;  // ismpl: running output sample index; ibeg: first laa index used (0 for Number=R, 1 otherwise) - presumably laa[0] refers to REF, TODO confirm
+    for (i=0; i<files->nreaders; i++)
+    {
+        bcf_sr_t *reader = &files->readers[i];
+        bcf_hdr_t *hdr = reader->header;
+        bcf_fmt_t *fmt_ori = fmt_map[i];  // NULL when reader i does not carry this tag
+        int nsmpl = bcf_hdr_nsamples(hdr);
+
+        if ( !fmt_ori )
+        {
+            // fill missing values: each sample of this reader gets one "missing" value followed by nsize-1 "vector end" values
+            #define BRANCH(tgt_type_t, tgt_set_missing, tgt_set_vector_end) { \
+                for (j=0; j<nsmpl; j++) \
+                { \
+                    tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                    tgt_set_missing; \
+                    for (k=1; k<nsize; k++) { tgt++; tgt_set_vector_end; } \
+                    ismpl++; \
+                } \
+            }
+            switch (fmt->type)  // all integer widths are written as int32_t
+            {
+                case BCF_BT_INT8:  BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT16: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_INT32: BRANCH(int32_t, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
+                case BCF_BT_FLOAT: BRANCH(float, bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
+                default: error("Unexpected case: %d, %s\n", fmt->type, key);
+            }
+            #undef BRANCH
+            continue;
+        }
+
+        // localize: for each of the sample's laa entries starting at ibeg copy the source value at index laa[ii]-ibeg; the laa row of a sample has 1+args->local_alleles entries and stops at a missing/vector_end entry; unused target slots are set to vector_end
+        #define BRANCH(tgt_type_t, src_type_t, src_is_missing, src_is_vector_end, tgt_set_missing, tgt_set_vector_end) { \
+            for (j=0; j<nsmpl; j++) \
+            { \
+                src_type_t *src = (src_type_t*) fmt_ori->p + j*fmt_ori->n; \
+                tgt_type_t *tgt = (tgt_type_t *) ma->tmp_arr + ismpl*nsize; \
+                int *laa = ma->laa + (1+args->local_alleles)*ismpl; \
+                int ii,tgt_idx = 0; \
+                for (ii=ibeg; ii<=ma->nlaa; ii++) \
+                { \
+                    if ( laa[ii]==bcf_int32_missing || laa[ii]==bcf_int32_vector_end ) break; \
+                    int src_idx = laa[ii] - ibeg; \
+                    if ( src_is_missing ) tgt_set_missing; \
+                    else if ( src_is_vector_end ) break; \
+                    else tgt[tgt_idx] = src[src_idx]; \
+                    tgt_idx++; \
+                } \
+                if ( !tgt_idx ) { tgt_set_missing; tgt_idx++; } \
+                for (; tgt_idx<nsize; tgt_idx++) tgt_set_vector_end; \
+                ismpl++; \
+            } \
+        }
+        switch (fmt_ori->type)  // source values are read in their stored width; integers are widened to int32_t
+        {
+            case BCF_BT_INT8:  BRANCH(int32_t,  int8_t, src[src_idx]==bcf_int8_missing,  src[src_idx]==bcf_int8_vector_end,  tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT16: BRANCH(int32_t, int16_t, src[src_idx]==bcf_int16_missing, src[src_idx]==bcf_int16_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_INT32: BRANCH(int32_t, int32_t, src[src_idx]==bcf_int32_missing, src[src_idx]==bcf_int32_vector_end, tgt[tgt_idx]=bcf_int32_missing, tgt[tgt_idx]=bcf_int32_vector_end); break;
+            case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(src[src_idx]), bcf_float_is_vector_end(src[src_idx]), bcf_float_set_missing(tgt[tgt_idx]), bcf_float_set_vector_end(tgt[tgt_idx])); break;
+            default: error("Unexpected case: %d, %s\n", fmt_ori->type, key);
+        }
+        #undef BRANCH
+    }
+    args->tmps.l = 0;  // build the output tag name in the reusable string buffer: 'L' followed by the original key
+    kputc('L',&args->tmps);
+    kputs(key,&args->tmps);
+    if ( fmt_map[irdr]->type==BCF_BT_FLOAT )
+        bcf_update_format_float(args->out_hdr, out, args->tmps.s, (float*)ma->tmp_arr, nsamples*nsize);
+    else
+        bcf_update_format_int32(args->out_hdr, out, args->tmps.s, (int32_t*)ma->tmp_arr, nsamples*nsize);
+    ma->laa_dirty = 1;  // flag that a localized tag was written for this record
+}
  void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
  {
      bcf_srs_t *files = args->files;
@@ -1581,6 +1998,13 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
          }
          if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
      }
+    if ( ma->nlaa && length!=BCF_VL_FIXED )
+    {
+        if ( length==BCF_VL_G ) merge_localized_numberG_format_field(args,fmt_map,out,i);
+        else if ( length==BCF_VL_A || length==BCF_VL_R ) merge_localized_numberAR_format_field(args,fmt_map,out,i);
+        return;
+    }
+
      if ( type==BCF_BT_CHAR )
      {
          merge_format_string(args, key, fmt_map, out, length, nsize);
@@ -1588,17 +2012,18 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
      }
  
      size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
-    if ( ma->ntmp_arr < nsamples*nsize*msize )
+    msize *= nsamples*nsize;
+    if ( msize > 2147483647 )
      {
-        ma->ntmp_arr = nsamples*nsize*msize;
-        ma->tmp_arr  = realloc(ma->tmp_arr, ma->ntmp_arr);
-        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
-        if ( ma->ntmp_arr > 2147483647 )
-        {
-            if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr);
-            warned = 1;
-            return;
-        }
+        if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,msize);
+        warned = 1;
+        return;
+    }
+    if ( ma->ntmp_arr < msize )
+    {
+        ma->tmp_arr  = realloc(ma->tmp_arr, msize);
+        if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", msize,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key);
+        ma->ntmp_arr = msize;
      }
  
      // Fill the temp array for all samples by collecting values from all files
@@ -1792,7 +2217,7 @@ void merge_format(args_t *args, bcf1_t *out)
      khiter_t kitr;
      strdict_t *tmph = args->tmph;
      kh_clear(strdict, tmph);
-    int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
+    int i, j, ret, has_GT = 0, has_PL = -1, max_ifmt = 0; // max fmt index
      for (i=0; i<files->nreaders; i++)
      {
          bcf1_t *line = maux_get_line(args,i);
@@ -1822,6 +2247,7 @@ void merge_format(args_t *args, bcf1_t *out)
                          memset(ma->fmt_map+ma->nfmt_map*files->nreaders, 0, (max_ifmt-ma->nfmt_map+1)*files->nreaders*sizeof(bcf_fmt_t*));
                          ma->nfmt_map = max_ifmt+1;
                      }
+                    if ( key[0]=='P' && key[1]=='L' && key[2]==0  ) { has_PL = ifmt; }
                  }
                  kitr = kh_put(strdict, tmph, key, &ret);
                  kh_value(tmph, kitr) = ifmt;
@@ -1835,6 +2261,12 @@ void merge_format(args_t *args, bcf1_t *out)
          ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
      }
  
+    if ( args->local_alleles )
+    {
+        ma->laa_dirty = ma->nlaa = 0;
+        if ( out->n_allele > args->local_alleles + 1 ) init_local_alleles(args, out, has_PL);
+    }
+
      out->n_sample = bcf_hdr_nsamples(out_hdr);
      if ( has_GT )
          merge_GT(args, ma->fmt_map, out);
@@ -1842,6 +2274,10 @@ void merge_format(args_t *args, bcf1_t *out)
  
      for (i=1; i<=max_ifmt; i++)
          merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
+
+    if ( ma->laa_dirty )
+        update_local_alleles(args, out);
+
      out->d.indiv_dirty = 1;
  }
  
@@ -2043,6 +2479,23 @@ void gvcf_flush(args_t *args, int done)
      }
  }
  
+static inline int is_gvcf_block(bcf1_t *line)  // Returns 1 if the record looks like a gVCF block, 0 otherwise
+{
+    if ( line->rlen<=1 ) return 0;  // a block must have rlen greater than one
+    if ( strlen(line->d.allele[0])==line->rlen ) return 0;  // the REF string itself accounts for the whole rlen, not a block
+    if ( line->n_allele==1 ) return 1;  // rlen exceeds the REF string and there is no ALT allele
+
+    int i;
+    for (i=1; i<line->n_allele; i++)  // otherwise one of the ALT alleles must be a recognised symbolic placeholder
+    {
+        if ( !strcmp(line->d.allele[i],"<*>") ) return 1;
+        if ( !strcmp(line->d.allele[i],"<NON_REF>") ) return 1;
+        if ( !strcmp(line->d.allele[i],"<X>") ) return 1;
+    }
+    return 0;
+}
+static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
+
  /*
      Check incoming lines for new gVCF blocks, set pointer to the current source
      buffer (gvcf or readers).  In contrast to gvcf_flush, this function can be
@@ -2061,6 +2514,7 @@ void gvcf_stage(args_t *args, int pos)
      maux->gvcf_min = INT_MAX;
      for (i=0; i<files->nreaders; i++)
      {
+        if ( gaux[i].active && gaux[i].end < pos ) gaux[i].active = 0;
          if ( gaux[i].active )
          {
              // gvcf block should not overlap with another record
@@ -2079,7 +2533,7 @@ void gvcf_stage(args_t *args, int pos)
          int irec = maux->buf[i].beg;
          bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
          bcf1_t *line = args->files->readers[i].buffer[irec];
-        int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+        int ret = is_gvcf_block(line) ? bcf_get_info_int32(hdr,line,"END",&end,&nend) : 0;
          if ( ret==1 )
          {
              if ( end[0] == line->pos + 1 )  // POS and INFO/END are identical, treat as if a normal w/o INFO/END
@@ -2220,7 +2674,6 @@ void debug_state(args_t *args)
      fprintf(bcftools_stderr,"\n");
  }
  
-
  /*
     Determine which line should be merged from which reader: go through all
     readers and all buffered lines, expand REF,ALT and try to match lines with
@@ -2229,7 +2682,6 @@ void debug_state(args_t *args)
  int can_merge(args_t *args)
  {
      bcf_srs_t *files = args->files;
-    int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
      maux_t *maux = args->maux;
      gvcf_aux_t *gaux = maux->gvcf;
      char *id = NULL, ref = 'N';
@@ -2242,6 +2694,9 @@ int can_merge(args_t *args)
      }
      maux->var_types = maux->nals = 0;
  
+    // this is only for the `-m none -g` mode, ensure that <*> lines come last
+    #define VCF_GVCF_REF 1
+
      for (i=0; i<files->nreaders; i++)
      {
          buffer_t *buf = &maux->buf[i];
@@ -2259,12 +2714,17 @@ int can_merge(args_t *args)
              buf->rec[j].skip = SKIP_DIFF;
              ntodo++;
  
+            bcf1_t *line = buf->lines[j];
              if ( args->merge_by_id )
-                id = buf->lines[j]->d.id;
+                id = line->d.id;
              else
              {
-                int var_type = bcf_get_variant_types(buf->lines[j]);
-                maux->var_types |= var_type ? var_type<<1 : 1;
+                int var_type = bcf_get_variant_types(line);
+                maux->var_types |= var_type ? var_type<<2 : 2;
+
+                // for the `-m none -g` mode
+                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf && is_gvcf_block(line) )
+                    maux->var_types |= VCF_GVCF_REF;
              }
          }
  
@@ -2296,7 +2756,7 @@ int can_merge(args_t *args)
              bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
  
              int line_type = bcf_get_variant_types(line);
-            line_type = line_type ? line_type<<1 : 1;
+            line_type = line_type ? line_type<<2 : 2;
  
              // select relevant lines
              if ( args->merge_by_id )
@@ -2305,6 +2765,12 @@ int can_merge(args_t *args)
              }
              else
              {
+                // when merging gVCF in -m none mode, make sure that gVCF blocks with the same POS as variant
+                // records come last, otherwise infinite loop is created (#1164)
+                if ( args->collapse==COLLAPSE_NONE && args->do_gvcf )
+                {
+                    if ( is_gvcf_block(line) && (maux->var_types & (~(VCF_GVCF_REF|2))) ) continue;
+                }
                  if ( args->collapse==COLLAPSE_NONE && maux->nals )
                  {
                      // All alleles of the tested record must be present in the
@@ -2368,7 +2834,6 @@ int can_merge(args_t *args)
  */
  void stage_line(args_t *args)
  {
-    int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
      bcf_srs_t *files = args->files;
      maux_t *maux = args->maux;
  
@@ -2438,13 +2903,9 @@ void stage_line(args_t *args)
  
  void merge_line(args_t *args)
  {
-    if ( args->regs )
-    {
-        if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
-    }
-
      bcf1_t *out = args->out_line;
      merge_chrom2qual(args, out);
+    if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return;
      merge_filter(args, out);
      merge_info(args, out);
      if ( args->do_gvcf )
@@ -2492,9 +2953,59 @@ void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *c
      error_errno("[%s] Failed to add program information to header", __func__);
  }
  
+void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr)  // For every FORMAT header record with Number=G, A or R add a copy with ID prefixed by "L" and Number=., plus the FORMAT/LAA definition
+{
+    char **str = NULL;  // formatted header lines collected in the first pass and appended afterwards
+    int i,j, nstr = 0, mstr = 0;  // nstr: number of collected lines; mstr: allocated capacity of str managed by hts_expand
+    for (i=0; i<hdr->nhrec; i++)
+    {
+        if ( hdr->hrec[i]->type!=BCF_HL_FMT ) continue;  // only FORMAT records are considered
+        j = bcf_hrec_find_key(hdr->hrec[i],"ID");
+        if ( j<0 ) continue;
+        char *key = hdr->hrec[i]->vals[j];
+        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
+        assert( id>=0 );
+        int localize = 0;
+        if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G ) localize = 1;
+        if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A ) localize = 1;
+        if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R ) localize = 1;
+        if ( !localize ) continue;
+        args->tmps.l = 0;  // reset the reusable string buffer before formatting this line
+
+        uint32_t e = 0, nout = 0;  // e: becomes non-zero if any formatting call fails; nout: key=value pairs written so far, used to place commas
+        e |= ksprintf(&args->tmps, "##%s=<", hdr->hrec[i]->key) < 0;
+        for (j=0; j<hdr->hrec[i]->nkeys; j++)
+        {
+            if ( !strcmp("IDX",hdr->hrec[i]->keys[j]) ) continue;  // the IDX key is not copied
+            if ( nout ) e |= kputc(',',&args->tmps) < 0;
+            if ( !strcmp("ID",hdr->hrec[i]->keys[j]) )
+                e |= ksprintf(&args->tmps,"%s=L%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0;  // ID gets the "L" prefix
+            else if ( !strcmp("Number",hdr->hrec[i]->keys[j]) )
+                e |= ksprintf(&args->tmps,"Number=.") < 0;  // the localized tag is declared with Number=.
+            else if ( !strcmp("Description",hdr->hrec[i]->keys[j]) && hdr->hrec[i]->vals[j][0]=='"' )
+                e |= ksprintf(&args->tmps,"Description=\"Localized field: %s", hdr->hrec[i]->vals[j]+1) < 0;  // insert the prefix right after the opening quote of the original description
+            else
+                e |= ksprintf(&args->tmps,"%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]) < 0;  // any other key is copied unchanged
+            nout++;
+        }
+        e |= ksprintf(&args->tmps,">\n") < 0;
+        if ( e ) error("Failed to format the header line for %s\n", key);
+        nstr++;
+        hts_expand(char*,nstr,mstr,str);
+        str[nstr-1] = strdup(args->tmps.s);  // NOTE(review): the strdup result is not checked for NULL
+    }
+    if ( !nstr ) return;  // nothing to localize, header left untouched
+    bcf_hdr_append(hdr,"##FORMAT=<ID=LAA,Number=.,Type=Integer,Description=\"Localized alleles: subset of alternate alleles relevant for each sample\">");
+    for (i=0; i<nstr; i++)  // lines are appended only now, presumably to avoid modifying hdr->hrec while iterating over it - TODO confirm
+    {
+        bcf_hdr_append(hdr, str[i]);
+        free(str[i]);
+    }
+    free(str);
+}
  void merge_vcf(args_t *args)
  {
-    args->out_fh  = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    args->out_fh  = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
      args->out_hdr = bcf_hdr_init("w");
@@ -2511,6 +3022,7 @@ void merge_vcf(args_t *args)
              char buf[24]; snprintf(buf,sizeof buf,"%d",i+1);
              merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples);
          }
+        if ( args->local_alleles ) hdr_add_localized_tags(args, args->out_hdr);
          if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge");
          if (bcf_hdr_sync(args->out_hdr) < 0)
              error_errno("[%s] Failed to update header", __func__);
@@ -2582,7 +3094,9 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -g, --gvcf <-|ref.fa>              merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
      fprintf(bcftools_stderr, "    -i, --info-rules <tag:method,..>   rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
      fprintf(bcftools_stderr, "    -l, --file-list <file>             read file names from the file\n");
+    fprintf(bcftools_stderr, "    -L, --local-alleles <int>          EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
      fprintf(bcftools_stderr, "    -m, --merge <string>               allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+    fprintf(bcftools_stderr, "        --no-index                     merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
      fprintf(bcftools_stderr, "        --no-version                   do not append version and command line to the header\n");
      fprintf(bcftools_stderr, "    -o, --output <file>                write output to a file [standard output]\n");
      fprintf(bcftools_stderr, "    -O, --output-type <b|u|z|v>        'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
@@ -2590,7 +3104,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -R, --regions-file <file>          restrict to regions listed in a file\n");
      fprintf(bcftools_stderr, "        --threads <int>                use multithreading with <int> worker threads [0]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfmerge(int argc, char *argv[])
@@ -2610,6 +3124,7 @@ int main_vcfmerge(int argc, char *argv[])
      {
          {"help",no_argument,NULL,'h'},
          {"merge",required_argument,NULL,'m'},
+        {"local-alleles",required_argument,NULL,'L'},
          {"gvcf",required_argument,NULL,'g'},
          {"file-list",required_argument,NULL,'l'},
          {"missing-to-ref",no_argument,NULL,'0'},
@@ -2624,11 +3139,19 @@ int main_vcfmerge(int argc, char *argv[])
          {"regions-file",required_argument,NULL,'R'},
          {"info-rules",required_argument,NULL,'i'},
          {"no-version",no_argument,NULL,8},
+        {"no-index",no_argument,NULL,10},
          {"filter-logic",required_argument,NULL,'F'},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
+    char *tmp;
+    while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) {
          switch (c) {
+            case 'L':
+                args->local_alleles = strtol(optarg,&tmp,10);
+                if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg);
+                if ( args->local_alleles < 1 )
+                    error("Error: \"--local-alleles %s\" makes no sense, expected value bigger or equal than 1\n", optarg);
+                break;
              case 'F': 
                  if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
                  else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
@@ -2674,6 +3197,7 @@ int main_vcfmerge(int argc, char *argv[])
              case  3 : args->force_samples = 1; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
+            case 10 : args->no_index = 1; break;
              case 'h':
              case '?': usage(); break;
              default: error("Unknown argument: %s\n", optarg);
@@ -2682,7 +3206,13 @@ int main_vcfmerge(int argc, char *argv[])
      if ( argc==optind && !args->file_list ) usage();
      if ( argc-optind<2 && !args->file_list ) usage();
  
-    args->files->require_index = 1;
+    if ( args->no_index )
+    {
+        if ( args->regions_list ) error("Error: cannot combine --no-index with -r/-R\n");
+        bcf_sr_set_opt(args->files,BCF_SR_ALLOW_NO_IDX);
+    }
+    else
+        bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
      if ( args->regions_list )
      {
          if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c

index dcaaba138b3009126967cf2a05ba46677a53d5a1..7b510b109b514a911d37985e75fde5342b2961d4 100644 (file)
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1,6 +1,6 @@
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -26,6 +26,7 @@ THE SOFTWARE.  */
  #include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -38,6 +39,7 @@ THE SOFTWARE.  */
  #include <htslib/khash_str2int.h>
  #include "bcftools.h"
  #include "rbuf.h"
+#include "abuf.h"
  
  #define CHECK_REF_EXIT 1
  #define CHECK_REF_WARN 2
@@ -84,20 +86,25 @@ typedef struct
      int32_t *int32_arr;
      int ntmp_arr1, ntmp_arr2, nint32_arr;
      kstring_t *tmp_str;
-    kstring_t *tmp_als, tmp_als_str;
+    kstring_t *tmp_als, tmp_kstr;
      int ntmp_als;
      rbuf_t rbuf;
      int buf_win;            // maximum distance between two records to consider
      int aln_win;            // the realignment window size (maximum repeat size)
      bcf_srs_t *files;       // using the synced reader only for -r option
-    bcf_hdr_t *hdr;
+    bcf_hdr_t *hdr, *out_hdr;
      cmpals_t cmpals_in, cmpals_out;
      faidx_t *fai;
      struct { int tot, set, swap; } nref;
      char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
      int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
      int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
-    int record_cmd_line, force, force_warned;
+    int record_cmd_line, force, force_warned, keep_sum_ad;
+    abuf_t *abuf;
+    abuf_opt_t atomize;
+    int use_star_allele;
+    char *old_rec_tag;
+    htsFile *out;
  }
  args_t;
  
@@ -136,7 +143,7 @@ static void seq_to_upper(char *seq, int len)
  static void fix_ref(args_t *args, bcf1_t *line)
  {
      int reflen = strlen(line->d.allele[0]);
-    int i, maxlen = reflen, len;
+    int i,j, maxlen = reflen, len;
      for (i=1; i<line->n_allele; i++)
      {
          int len = strlen(line->d.allele[i]);
@@ -149,27 +156,57 @@ static void fix_ref(args_t *args, bcf1_t *line)
  
      args->nref.tot++;
  
-    // is the REF different?
+    // is the REF different? If not, we are done
      if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
  
-    // is the REF allele missing or N?
-    if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') ) 
+    // is the REF allele missing?
+    if ( reflen==1 && line->d.allele[0][0]=='.' ) 
      { 
          line->d.allele[0][0] = ref[0]; 
          args->nref.set++; 
          free(ref);
-        bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          return;
      }
  
-    // does REF contain non-standard bases?
-    if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+    // does REF or ALT contain non-standard bases?
+    int has_non_acgtn = 0;
+    for (i=0; i<line->n_allele; i++)
+    {
+        if ( line->d.allele[i][0]=='<' ) continue;
+        has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i]));
+    }
+    if ( has_non_acgtn )
      {
          args->nref.set++;
-        bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
      }
  
+    // does the REF allele contain N's ?
+    int fix = 0;
+    for (i=0; i<reflen; i++)
+    {
+        if ( line->d.allele[0][i]!='N' ) continue;
+        if ( ref[i]=='N' ) continue;
+        line->d.allele[0][i] = ref[i];
+        fix++;
+        for (j=1; j<line->n_allele; j++)
+        {
+            int len = strlen(line->d.allele[j]);
+            if ( len <= i || line->d.allele[j][i]!='N' ) continue;
+            line->d.allele[j][i] = ref[i];
+            fix++;
+        }
+    }
+    if ( fix )
+    {
+        args->nref.set++;
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
+        if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+    }
+
+
      // is it swapped?
      for (i=1; i<line->n_allele; i++)
      {
@@ -178,45 +215,35 @@ static void fix_ref(args_t *args, bcf1_t *line)
      }
  
      kstring_t str = {0,0,0};
-    if ( i==line->n_allele )
+    if ( i==line->n_allele )    // none of the alternate alleles matches the reference
      {
-        // none of the alternate alleles matches the reference
-        if ( line->n_allele>1 )
-            args->nref.set++;
-        else
-            args->nref.swap++;
-
-        kputs(line->d.allele[0],&str);
-        kputc(',',&str);
+        args->nref.set++;
+        kputsn(ref,reflen,&str);
          for (i=1; i<line->n_allele; i++)
          {
-            kputs(line->d.allele[i],&str);
              kputc(',',&str);
+            kputs(line->d.allele[i],&str);
          }
-        kputc(ref[0],&str);
-        bcf_update_alleles_str(args->hdr,line,str.s);
-        str.l = 0;
+        bcf_update_alleles_str(args->out_hdr,line,str.s);
+        free(ref);
+        free(str.s);
+        return;
      }
-    else
-        args->nref.swap++;
-    free(ref);
  
-    // swap the alleles
-    int j;
+    // one of the alternate alleles matches the reference, assume it's a simple swap
      kputs(line->d.allele[i],&str);
-    for (j=1; j<i; j++)
-    {
-        kputc(',',&str);
-        kputs(line->d.allele[j],&str);
-    }
-    kputc(',',&str);
-    kputs(line->d.allele[0],&str);
-    for (j=i+1; j<line->n_allele; j++)
+    for (j=1; j<line->n_allele; j++)
      {
          kputc(',',&str);
-        kputs(line->d.allele[j],&str);
+        if ( j==i ) 
+            kputs(line->d.allele[0],&str);
+        else
+            kputs(line->d.allele[j],&str);
      }
-    bcf_update_alleles_str(args->hdr,line,str.s);
+    bcf_update_alleles_str(args->out_hdr,line,str.s);
+    args->nref.swap++;
+    free(ref);
+    free(str.s);
  
      // swap genotypes
      int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
@@ -231,7 +258,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
          else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
          else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
      }
-    bcf_update_genotypes(args->hdr,line,gts,ngts);
+    bcf_update_genotypes(args->out_hdr,line,gts,ngts);
  
      // update AC
      int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
@@ -240,10 +267,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
      {
          int32_t *ac = (int32_t*)args->tmp_arr1;
          ac[i-1] = ni;
-        bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+        bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
      }
-    
-    free(str.s);
  }
  
  static void fix_dup_alt(args_t *args, bcf1_t *line)
@@ -268,7 +293,7 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
          if ( !args->tmp_arr1[i] ) continue;
          line->d.allele[j++] = line->d.allele[i];
      }
-    bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+    bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals);
  
  
      // update genotypes
@@ -286,7 +311,36 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
          gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
          changed = 1;
      }
-    if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+    if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
+}
+
+static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)  // store a description of the src record in dst's INFO tag named args->old_rec_tag
+{
+    if ( !args->old_rec_tag ) return;   // no tag name configured, nothing to record
+    // Parameters: dst receives the tag, src supplies the values written, ialt is appended only when positive
+    // Only update if the tag is not present already: there can be multiple normalization steps and an existing value is left untouched
+    int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+    bcf_unpack(dst, BCF_UN_INFO);   // unpack INFO before dst->d.info is scanned below
+    for (i=0; i<dst->n_info; i++)
+    {
+        bcf_info_t *inf = &dst->d.info[i];
+        if ( inf && inf->key == id ) return;    // tag already set on dst, keep it
+    }
+    // Build the value in the shared scratch string as "seqname|pos+1|REF|ALT1,ALT2,..." followed by "|ialt" when ialt>0
+    args->tmp_kstr.l = 0;
+    ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+    for (i=1; i<src->n_allele; i++)
+    {
+        kputs(src->d.allele[i],&args->tmp_kstr);
+        if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);    // comma between ALT alleles, none after the last
+    }
+    if ( ialt>0 )   // callers visible in this file pass 0 from realign() and a 1-based index when splitting
+    {
+        kputc('|',&args->tmp_kstr);
+        kputw(ialt,&args->tmp_kstr);
+    }
+    if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )   // non-zero return is reported through error()
+            error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
  }
  
  #define ERR_DUP_ALLELE       -2
@@ -333,7 +387,7 @@ static int realign(args_t *args, bcf1_t *line)
          if ( line->rlen > 1 )
          {
              line->d.allele[0][1] = 0;
-            bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+            bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          }
          return ERR_OK;
      }
@@ -363,7 +417,7 @@ static int realign(args_t *args, bcf1_t *line)
      }
  
      // trim from right
-    int ori_pos = line->pos;
+    int new_pos = line->pos;
      while (1)
      {
          // is the rightmost base identical in all alleles?
@@ -374,7 +428,7 @@ static int realign(args_t *args, bcf1_t *line)
              if ( als[i].l < min_len ) min_len = als[i].l;
          }
          if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
-        if ( min_len<=1 && line->pos==0 ) break;
+        if ( min_len<=1 && new_pos==0 ) break;
  
          int pad_from_left = 0;
          for (i=0; i<line->n_allele; i++) // trim all alleles
@@ -384,10 +438,10 @@ static int realign(args_t *args, bcf1_t *line)
          }
          if ( pad_from_left )
          {
-            int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
              free(ref);
-            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
-            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1);
+            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
+            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
              replace_iupac_codes(ref,nref);
              for (i=0; i<line->n_allele; i++)
              {
@@ -396,7 +450,7 @@ static int realign(args_t *args, bcf1_t *line)
                  memcpy(als[i].s,ref,npad);
                  als[i].l += npad;
              }
-            line->pos -= npad;
+            new_pos -= npad;
          }
      }
      free(ref);
@@ -422,39 +476,43 @@ static int realign(args_t *args, bcf1_t *line)
              memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
              als[i].l -= ntrim_left;
          }
-        line->pos += ntrim_left;
+        new_pos += ntrim_left;
      }
  
      // Have the alleles changed?
      als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
-    if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+    if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+    set_old_rec_tag(args, line, line, 0);
  
      // Create new block of alleles and update
-    args->tmp_als_str.l = 0;
+    args->tmp_kstr.l = 0;
      for (i=0; i<line->n_allele; i++)
      {
-        if (i>0) kputc(',',&args->tmp_als_str);
-        kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+        if (i>0) kputc(',',&args->tmp_kstr);
+        kputsn(als[i].s,als[i].l,&args->tmp_kstr);
      }
-    args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
-    bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+    args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
+    bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
      args->nchanged++;
  
      // Update INFO/END if necessary
      int new_reflen = strlen(line->d.allele[0]);
-    if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
+    if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
      {
          // bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
+        line->pos = new_pos;
          args->int32_arr[0] = line->pos + new_reflen;
-        bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1);
+        bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1);
      }
+    line->pos = new_pos;
  
      return ERR_OK;
  }
  
  static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
  {
-    #define BRANCH_NUMERIC(type,type_t) \
+    #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \
      { \
          const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
          int ntmp = args->ntmp_arr1 / sizeof(type_t); \
@@ -477,13 +535,13 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
                          tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -499,14 +557,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
                          tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
              } \
-            if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+            if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
+            { \
+                int j; \
+                for (j=1; j<info->len; j++) \
+                    if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \
+                vals[1] = vals[ialt+1]; \
+            } \
+            else \
+            { \
+                if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+            } \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -522,7 +590,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -533,15 +601,15 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
                  vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \
          } \
          else \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
      {
-        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t); break;
-        case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break;
+        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break;
      }
      #undef BRANCH_NUMERIC
  }
@@ -589,7 +657,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_R )
      {
@@ -600,7 +668,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_G )
      {
@@ -615,16 +683,16 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
  }
  static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
  {
      const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
      int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
-    bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+    bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);  // pass the src lookup result on to dst, updating through the output header; ialt is not used here
  }
  
  static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
@@ -650,11 +718,11 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
          }
          gt += ngts;
      }
-    bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+    bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
  }
  static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
  {
-    #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+    #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
      { \
          const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
          int ntmp = args->ntmp_arr1 / sizeof(type_t); \
@@ -663,10 +731,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
          assert( nvals>0 ); \
          type_t *vals = (type_t *) args->tmp_arr1; \
          int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
-        int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+        int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \
          if ( nvals==nsmpl ) /* all values are missing */ \
          { \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
              return; \
          } \
          if ( len==BCF_VL_A ) \
@@ -683,7 +751,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -697,7 +765,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  dst_vals += 1; \
                  src_vals += nvals; \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -713,7 +781,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -721,14 +789,29 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
              } \
              nvals /= nsmpl; \
              type_t *src_vals = vals, *dst_vals = vals; \
-            for (i=0; i<nsmpl; i++) \
+            if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \
              { \
-                dst_vals[0] = src_vals[0]; \
-                dst_vals[1] = src_vals[ialt+1]; \
-                dst_vals += 2; \
-                src_vals += nvals; \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    dst_vals[0] = src_vals[0]; \
+                    for (j=1; j<nvals; j++) \
+                        if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
+                    dst_vals[1] = src_vals[ialt+1]; \
+                    dst_vals += 2; \
+                    src_vals += nvals; \
+                } \
+            } \
+            else \
+            { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    dst_vals[0] = src_vals[0]; \
+                    dst_vals[1] = src_vals[ialt+1]; \
+                    dst_vals += 2; \
+                    src_vals += nvals; \
+                } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -744,7 +827,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
@@ -775,15 +858,15 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  dst_vals += all_haploid ? 2 : 3; \
                  src_vals += nvals; \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
          } \
          else \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
      {
-        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
-        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
+        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
      }
      #undef BRANCH_NUMERIC
  }
@@ -825,7 +908,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else if ( len==BCF_VL_R )
      {
@@ -843,7 +926,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else if ( len==BCF_VL_G )
      {
@@ -871,7 +954,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
                  }
                  if ( args->force )
                  {
-                    bcf_update_format_char(args->hdr,dst,tag,NULL,0);
+                    bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
                      return;
                  }
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
@@ -902,13 +985,12 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else
-        bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
  }
  
-
  static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
  {
      int i;
@@ -941,11 +1023,11 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
  
          // Not quite sure how to handle IDs, they can be assigned to a specific
          // ALT.  For now we leave the ID unchanged for all.
-        bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+        bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : ".");
  
          tmp.l = rlen;
          kputs(line->d.allele[i+1],&tmp);
-        bcf_update_alleles_str(args->hdr,dst,tmp.s);
+        bcf_update_alleles_str(args->out_hdr,dst,tmp.s);
  
          if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
  
@@ -958,6 +1040,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
              else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
              else split_info_string(args, line, info, i, dst);
          }
+        set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
  
          dst->n_sample = line->n_sample;
          for (j=0; j<line->n_fmt; j++)
@@ -1021,7 +1104,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -1045,7 +1128,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      vals[ args->maps[i].map[k] ] = vals2[k]; \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -1079,10 +1162,10 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      } \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
      {
@@ -1095,7 +1178,7 @@ static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t
  {
      const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
      int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-    bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+    bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
  }
  int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
  static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
@@ -1123,7 +1206,7 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
          str.s[str.l] = 0;
          args->tmp_arr1  = (uint8_t*) str.s;
          args->ntmp_arr1 = str.m;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_G )
      {
@@ -1150,12 +1233,12 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
          str.s[str.l] = 0;
          args->tmp_arr1  = (uint8_t*) str.s;
          args->ntmp_arr1 = str.m;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else
      {
          bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-        bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+        bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
      }
  }
  static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
@@ -1198,7 +1281,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
              gt2 += ngts;
          }
      }
-    bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+    bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
  }
  static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
  {
@@ -1251,7 +1334,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals2; \
                  } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -1279,7 +1362,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals2; \
                  } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -1358,10 +1441,10 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals;\
                  }\
              }\
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
      {
@@ -1378,7 +1461,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
      if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
      {
          int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-        bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+        bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret);
          return;
      }
  
@@ -1397,7 +1480,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
          for (i=0; i<nlines; i++)
          {
              int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
-            if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+            if (nret<0) continue; /* format tag does not exist in this record, skip */
              nret /= nsmpl;
              for (k=0; k<nsmpl; k++)
              {
@@ -1444,7 +1527,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
              if ( i ) // we already have a copy
              {
                  nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
-                if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+                if (nret<0) continue; /* format tag does not exist in this record, skip */
                  nret /= nsmpl;
              }
              for (k=0; k<nsmpl; k++)
@@ -1490,7 +1573,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
      }
      args->ntmp_arr2 = str.m;
      args->tmp_arr2  = (uint8_t*)str.s;
-    bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+    bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
  }
  
  char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb);   // see vcfmerge.c
@@ -1511,7 +1594,7 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
              dst->qual = lines[i]->qual;
      }
  
-    bcf_update_id(args->hdr, dst, lines[0]->d.id);
+    bcf_update_id(args->out_hdr, dst, lines[0]->d.id);
  
      // Merge and set the alleles, create a mapping from source allele indexes to dst idxs
      hts_expand0(map_t,nlines,args->mmaps,args->maps);   // a mapping for each line
@@ -1525,20 +1608,20 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
      }
      for (i=1; i<nlines; i++)
      {
-        if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+        if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id);
          args->maps[i].nals = lines[i]->n_allele;
          hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
          args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
          if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1);
      }
-    bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+    bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals);
      for (i=0; i<args->nals; i++)
      {
          free(args->als[i]);
          args->als[i] = NULL;
      }
  
-    if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+    if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
      for (i=1; i<nlines; i++) {
          int j;
          for (j=0; j<lines[i]->d.n_flt; j++) {
@@ -1546,13 +1629,13 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
              // otherwise accumulate FILTERs
              if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
                  if (args->strict_filter) {
-                    bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+                    bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
                      break;
                  }
                  else
                      continue;
              }
-            bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+            bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]);
          }
      }
  
@@ -1722,7 +1805,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              if ( mrows_ready_to_flush(args, args->lines[k]) )
              {
                  while ( (line=mrows_flush(args)) )
-                    if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+                    if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
              }
              int merge = 1;
              if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
@@ -1755,18 +1838,30 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              prev_type |= line_type;
              if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
          }
-        if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      }
      if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
      {
          while ( (line=mrows_flush(args)) )
-            if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+            if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      }
  }
  
  static void init_data(args_t *args)
  {
      args->hdr = args->files->readers[0].header;
+    if ( args->keep_sum_ad )
+    {
+        args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD");
+        if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n");
+    }
+    else
+        args->keep_sum_ad = -1;
+
+    args->out_hdr = bcf_hdr_dup(args->hdr);
+    if ( args->old_rec_tag )
+        bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag); 
+
      rbuf_init(&args->rbuf, 100);
      args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
      if ( args->ref_fname )
@@ -1780,6 +1875,14 @@ static void init_data(args_t *args)
          args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
          args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
      }
+    if ( args->atomize==SPLIT )
+    {
+        args->abuf = abuf_init(args->hdr, SPLIT); 
+        abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
+        if ( args->old_rec_tag )
+            abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+        abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
+    }
  }
  
  static void destroy_data(args_t *args)
@@ -1804,7 +1907,7 @@ static void destroy_data(args_t *args)
      for (i=0; i<args->ntmp_als; i++)
          free(args->tmp_als[i].s);
      free(args->tmp_als);
-    free(args->tmp_als_str.s);
+    free(args->tmp_kstr.s);
      if ( args->tmp_str )
      {
          for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
@@ -1816,15 +1919,16 @@ static void destroy_data(args_t *args)
      free(args->tmp_arr1);
      free(args->tmp_arr2);
      free(args->diploid);
+    if ( args->abuf ) abuf_destroy(args->abuf);
+    bcf_hdr_destroy(args->out_hdr);
      if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
      if ( args->fai ) fai_destroy(args->fai);
      if ( args->mseq ) free(args->seq);
  }
  
  
-static void normalize_line(args_t *args, bcf1_t **line_ptr)
+static void normalize_line(args_t *args, bcf1_t *line)
  {
-    bcf1_t *line = *line_ptr;
      if ( args->fai )
      {
          if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
@@ -1854,8 +1958,8 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr)
      rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
      int i,j;
      i = j = rbuf_append(&args->rbuf);
-    if ( !args->lines[i] ) args->lines[i] = bcf_init1();
-    SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+    if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+    args->lines[i] = bcf_dup(line);
      while ( rbuf_prev(&args->rbuf,&i) )
      {
          if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
@@ -1863,21 +1967,38 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr)
      }
  }
  
+static bcf1_t *next_atomized_line(args_t *args)
+{
+    bcf1_t *rec = NULL;
+    if ( args->atomize==SPLIT )
+    {
+        rec = abuf_flush(args->abuf, 0);
+        if ( rec ) return rec;
+    }
+
+    if ( !bcf_sr_next_line(args->files) ) return NULL;
+
+    if ( args->atomize==SPLIT )
+    {
+        abuf_push(args->abuf,bcf_sr_get_line(args->files,0));
+        return abuf_flush(args->abuf, 0);
+    }
+    return bcf_sr_get_line(args->files,0);
+}
  static void normalize_vcf(args_t *args)
  {
-    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
-    if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+    args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+    if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads )
-        hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
-    if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
-    if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
+    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
+    if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
  
+    bcf1_t *line;
      int prev_rid = -1, prev_pos = -1, prev_type = 0;
-    while ( bcf_sr_next_line(args->files) )
+    while ( (line = next_atomized_line(args)) )
      {
          args->ntotal++;
-
-        bcf1_t *line = args->files->readers[0].buffer[0];
          if ( args->rmdup )
          {
              int line_type = bcf_get_variant_types(line);
@@ -1901,7 +2022,7 @@ static void normalize_vcf(args_t *args)
  
          // still on the same chromosome?
          int i,j,ilast = rbuf_last(&args->rbuf);
-        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
  
          int split = 0;
          if ( args->mrows_op==MROWS_SPLIT )
@@ -1916,13 +2037,13 @@ static void normalize_vcf(args_t *args)
                  args->nsplit++;
                  split_multiallelic_to_biallelics(args, line);
                  for (j=0; j<args->ntmp_lines; j++)
-                    normalize_line(args, &args->tmp_lines[j]);
+                    normalize_line(args, args->tmp_lines[j]);
              }
              else
                  split = 0;
          }
          if ( !split )
-            normalize_line(args, &args->files->readers[0].buffer[0]);
+            normalize_line(args, line);
  
          // find out how many sites to flush
          ilast = rbuf_last(&args->rbuf);
@@ -1932,10 +2053,10 @@ static void normalize_vcf(args_t *args)
              if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
              j++;
          }
-        if ( j>0 ) flush_buffer(args, out, j);
+        if ( j>0 ) flush_buffer(args, args->out, j);
      }
-    flush_buffer(args, out, args->rbuf.n);
-    if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    flush_buffer(args, args->out, args->rbuf.n);
+    if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
  
      fprintf(stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
      if ( args->check_ref & CHECK_REF_FIX )
@@ -1951,23 +2072,27 @@ static void usage(void)
      fprintf(stderr, "Usage:   bcftools norm [options] <in.vcf.gz>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -c, --check-ref <e|w|x|s>         check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
-    fprintf(stderr, "    -D, --remove-duplicates           remove duplicate lines of the same type.\n");
-    fprintf(stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|all|exact\n");
-    fprintf(stderr, "    -f, --fasta-ref <file>            reference sequence\n");
-    fprintf(stderr, "        --force                       try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
-    fprintf(stderr, "    -m, --multiallelics <-|+>[type]   split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
-    fprintf(stderr, "        --no-version                  do not append version and command line to the header\n");
-    fprintf(stderr, "    -N, --do-not-normalize            do not normalize indels (with -m or -c s)\n");
-    fprintf(stderr, "    -o, --output <file>               write output to a file [standard output]\n");
-    fprintf(stderr, "    -O, --output-type <type>          'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
-    fprintf(stderr, "    -r, --regions <region>            restrict to comma-separated list of regions\n");
-    fprintf(stderr, "    -R, --regions-file <file>         restrict to regions listed in a file\n");
-    fprintf(stderr, "    -s, --strict-filter               when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
-    fprintf(stderr, "    -t, --targets <region>            similar to -r but streams rather than index-jumps\n");
-    fprintf(stderr, "    -T, --targets-file <file>         similar to -R but streams rather than index-jumps\n");
-    fprintf(stderr, "        --threads <int>               use multithreading with <int> worker threads [0]\n");
-    fprintf(stderr, "    -w, --site-win <int>              buffer for sorting lines which changed position during realignment [1000]\n");
+    fprintf(stderr, "    -a, --atomize                   Decompose complex variants (e.g. MNVs become consecutive SNVs)\n");
+    fprintf(stderr, "        --atom-overlaps '*'|.       Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n");
+    fprintf(stderr, "    -c, --check-ref e|w|x|s         Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+    fprintf(stderr, "    -D, --remove-duplicates         Remove duplicate lines of the same type.\n");
+    fprintf(stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
+    fprintf(stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
+    fprintf(stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+    fprintf(stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
+    fprintf(stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+    fprintf(stderr, "        --no-version                Do not append version and command line to the header\n");
+    fprintf(stderr, "    -N, --do-not-normalize          Do not normalize indels (with -m or -c s)\n");
+    fprintf(stderr, "        --old-rec-tag STR           Annotate modified records with INFO/STR indicating the original variant\n");
+    fprintf(stderr, "    -o, --output FILE               Write output to a file [standard output]\n");
+    fprintf(stderr, "    -O, --output-type TYPE          'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+    fprintf(stderr, "    -r, --regions REGION            Restrict to comma-separated list of regions\n");
+    fprintf(stderr, "    -R, --regions-file FILE         Restrict to regions listed in a file\n");
+    fprintf(stderr, "    -s, --strict-filter             When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+    fprintf(stderr, "    -t, --targets REGION            Similar to -r but streams rather than index-jumps\n");
+    fprintf(stderr, "    -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n");
+    fprintf(stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Examples:\n");
      fprintf(stderr, "   # normalize and left-align indels\n");
@@ -1995,11 +2120,16 @@ int main_vcfnorm(int argc, char *argv[])
      args->do_indels = 1;
      int region_is_file  = 0;
      int targets_is_file = 0;
+    args->use_star_allele = 1;
  
      static struct option loptions[] =
      {
          {"help",no_argument,NULL,'h'},
          {"force",no_argument,NULL,7},
+        {"atomize",no_argument,NULL,'a'},
+        {"atom-overlaps",required_argument,NULL,11},
+        {"old-rec-tag",required_argument,NULL,12},
+        {"keep-sum",required_argument,NULL,10},
          {"fasta-ref",required_argument,NULL,'f'},
          {"do-not-normalize",no_argument,NULL,'N'},
          {"multiallelics",required_argument,NULL,'m'},
@@ -2019,8 +2149,21 @@ int main_vcfnorm(int argc, char *argv[])
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
          switch (c) {
+            case  10:
+                // possibly generalize this also to INFO/AD and other tags
+                if ( strcasecmp("ad",optarg) )
+                    error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
+                args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
+                break;
+            case 'a': args->atomize = SPLIT; break;
+            case 11 :
+                if ( optarg[0]=='*' ) args->use_star_allele = 1;
+                else if ( optarg[0]=='.' ) args->use_star_allele = 0;
+                else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
+                break;
+            case 12 : args->old_rec_tag = optarg; break;
              case 'N': args->do_indels = 0; break;
              case 'd':
                  if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
@@ -2092,7 +2235,7 @@ int main_vcfnorm(int argc, char *argv[])
      }
      else fname = argv[optind];
  
-    if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n");
+    if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n");
      if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT;
      if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n");
  
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c

index 6125a1b6b185e52d076f0901d63eba83054146e8..e48443fbe88e1c4cb0b9c2c218caad847134c7ac 100644 (file)
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2019 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -28,6 +28,7 @@ THE SOFTWARE.  */
  #include <strings.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -40,6 +41,7 @@ THE SOFTWARE.  */
  #include <htslib/khash_str2int.h>
  #include "bcftools.h"
  #include "rbuf.h"
+#include "abuf.h"
  
  #define CHECK_REF_EXIT 1
  #define CHECK_REF_WARN 2
@@ -86,20 +88,25 @@ typedef struct
      int32_t *int32_arr;
      int ntmp_arr1, ntmp_arr2, nint32_arr;
      kstring_t *tmp_str;
-    kstring_t *tmp_als, tmp_als_str;
+    kstring_t *tmp_als, tmp_kstr;
      int ntmp_als;
      rbuf_t rbuf;
      int buf_win;            // maximum distance between two records to consider
      int aln_win;            // the realignment window size (maximum repeat size)
      bcf_srs_t *files;       // using the synced reader only for -r option
-    bcf_hdr_t *hdr;
+    bcf_hdr_t *hdr, *out_hdr;
      cmpals_t cmpals_in, cmpals_out;
      faidx_t *fai;
      struct { int tot, set, swap; } nref;
      char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
      int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
      int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
-    int record_cmd_line, force, force_warned;
+    int record_cmd_line, force, force_warned, keep_sum_ad;
+    abuf_t *abuf;
+    abuf_opt_t atomize;
+    int use_star_allele;
+    char *old_rec_tag;
+    htsFile *out;
  }
  args_t;
  
@@ -138,7 +145,7 @@ static void seq_to_upper(char *seq, int len)
  static void fix_ref(args_t *args, bcf1_t *line)
  {
      int reflen = strlen(line->d.allele[0]);
-    int i, maxlen = reflen, len;
+    int i,j, maxlen = reflen, len;
      for (i=1; i<line->n_allele; i++)
      {
          int len = strlen(line->d.allele[i]);
@@ -151,27 +158,57 @@ static void fix_ref(args_t *args, bcf1_t *line)
  
      args->nref.tot++;
  
-    // is the REF different?
+    // is the REF different? If not, we are done
      if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
  
-    // is the REF allele missing or N?
-    if ( reflen==1 && (line->d.allele[0][0]=='.' || line->d.allele[0][0]=='N' || line->d.allele[0][0]=='n') ) 
+    // is the REF allele missing?
+    if ( reflen==1 && line->d.allele[0][0]=='.' ) 
      { 
          line->d.allele[0][0] = ref[0]; 
          args->nref.set++; 
          free(ref);
-        bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          return;
      }
  
-    // does REF contain non-standard bases?
-    if ( replace_iupac_codes(line->d.allele[0],strlen(line->d.allele[0])) )
+    // does REF or ALT contain non-standard bases?
+    int has_non_acgtn = 0;
+    for (i=0; i<line->n_allele; i++)
+    {
+        if ( line->d.allele[i][0]=='<' ) continue;
+        has_non_acgtn += replace_iupac_codes(line->d.allele[i],strlen(line->d.allele[i]));
+    }
+    if ( has_non_acgtn )
      {
          args->nref.set++;
-        bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
      }
  
+    // does the REF allele contain N's ?
+    int fix = 0;
+    for (i=0; i<reflen; i++)
+    {
+        if ( line->d.allele[0][i]!='N' ) continue;
+        if ( ref[i]=='N' ) continue;
+        line->d.allele[0][i] = ref[i];
+        fix++;
+        for (j=1; j<line->n_allele; j++)
+        {
+            int len = strlen(line->d.allele[j]);
+            if ( len <= i || line->d.allele[j][i]!='N' ) continue;
+            line->d.allele[j][i] = ref[i];
+            fix++;
+        }
+    }
+    if ( fix )
+    {
+        args->nref.set++;
+        bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
+        if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
+    }
+
+
      // is it swapped?
      for (i=1; i<line->n_allele; i++)
      {
@@ -180,45 +217,35 @@ static void fix_ref(args_t *args, bcf1_t *line)
      }
  
      kstring_t str = {0,0,0};
-    if ( i==line->n_allele )
+    if ( i==line->n_allele )    // none of the alternate alleles matches the reference
      {
-        // none of the alternate alleles matches the reference
-        if ( line->n_allele>1 )
-            args->nref.set++;
-        else
-            args->nref.swap++;
-
-        kputs(line->d.allele[0],&str);
-        kputc(',',&str);
+        args->nref.set++;
+        kputsn(ref,reflen,&str);
          for (i=1; i<line->n_allele; i++)
          {
-            kputs(line->d.allele[i],&str);
              kputc(',',&str);
+            kputs(line->d.allele[i],&str);
          }
-        kputc(ref[0],&str);
-        bcf_update_alleles_str(args->hdr,line,str.s);
-        str.l = 0;
+        bcf_update_alleles_str(args->out_hdr,line,str.s);
+        free(ref);
+        free(str.s);
+        return;
      }
-    else
-        args->nref.swap++;
-    free(ref);
  
-    // swap the alleles
-    int j;
+    // one of the alternate alleles matches the reference, assume it's a simple swap
      kputs(line->d.allele[i],&str);
-    for (j=1; j<i; j++)
-    {
-        kputc(',',&str);
-        kputs(line->d.allele[j],&str);
-    }
-    kputc(',',&str);
-    kputs(line->d.allele[0],&str);
-    for (j=i+1; j<line->n_allele; j++)
+    for (j=1; j<line->n_allele; j++)
      {
          kputc(',',&str);
-        kputs(line->d.allele[j],&str);
+        if ( j==i ) 
+            kputs(line->d.allele[0],&str);
+        else
+            kputs(line->d.allele[j],&str);
      }
-    bcf_update_alleles_str(args->hdr,line,str.s);
+    bcf_update_alleles_str(args->out_hdr,line,str.s);
+    args->nref.swap++;
+    free(ref);
+    free(str.s);
  
      // swap genotypes
      int ntmp = args->ntmp_arr1 / sizeof(int32_t); // reuse tmp_arr declared as uint8_t
@@ -233,7 +260,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
          else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0);
          else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0);
      }
-    bcf_update_genotypes(args->hdr,line,gts,ngts);
+    bcf_update_genotypes(args->out_hdr,line,gts,ngts);
  
      // update AC
      int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp);
@@ -242,10 +269,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
      {
          int32_t *ac = (int32_t*)args->tmp_arr1;
          ac[i-1] = ni;
-        bcf_update_info_int32(args->hdr, line, "AC", ac, nac);
+        bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
      }
-    
-    free(str.s);
  }
  
  static void fix_dup_alt(args_t *args, bcf1_t *line)
@@ -270,7 +295,7 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
          if ( !args->tmp_arr1[i] ) continue;
          line->d.allele[j++] = line->d.allele[i];
      }
-    bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals);
+    bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals);
  
  
      // update genotypes
@@ -288,7 +313,36 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
          gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new);
          changed = 1;
      }
-    if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
+    if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
+}
+
+static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
+{
+    if ( !args->old_rec_tag ) return;
+
+    // only update if the tag is not present already, there can be multiple normalization steps
+    int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
+    bcf_unpack(dst, BCF_UN_INFO);
+    for (i=0; i<dst->n_info; i++)
+    {
+        bcf_info_t *inf = &dst->d.info[i];
+        if ( inf && inf->key == id ) return;
+    }
+
+    args->tmp_kstr.l = 0;
+    ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
+    for (i=1; i<src->n_allele; i++)
+    {
+        kputs(src->d.allele[i],&args->tmp_kstr);
+        if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);
+    }
+    if ( ialt>0 )
+    {
+        kputc('|',&args->tmp_kstr);
+        kputw(ialt,&args->tmp_kstr);
+    }
+    if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
+            error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
  }
  
  #define ERR_DUP_ALLELE       -2
@@ -335,7 +389,7 @@ static int realign(args_t *args, bcf1_t *line)
          if ( line->rlen > 1 )
          {
              line->d.allele[0][1] = 0;
-            bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+            bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
          }
          return ERR_OK;
      }
@@ -365,7 +419,7 @@ static int realign(args_t *args, bcf1_t *line)
      }
  
      // trim from right
-    int ori_pos = line->pos;
+    int new_pos = line->pos;
      while (1)
      {
          // is the rightmost base identical in all alleles?
@@ -376,7 +430,7 @@ static int realign(args_t *args, bcf1_t *line)
              if ( als[i].l < min_len ) min_len = als[i].l;
          }
          if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed
-        if ( min_len<=1 && line->pos==0 ) break;
+        if ( min_len<=1 && new_pos==0 ) break;
  
          int pad_from_left = 0;
          for (i=0; i<line->n_allele; i++) // trim all alleles
@@ -386,10 +440,10 @@ static int realign(args_t *args, bcf1_t *line)
          }
          if ( pad_from_left )
          {
-            int npad = line->pos >= args->aln_win ? args->aln_win : line->pos;
+            int npad = new_pos >= args->aln_win ? args->aln_win : new_pos;
              free(ref);
-            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref);
-            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1);
+            ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref);
+            if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1);
              replace_iupac_codes(ref,nref);
              for (i=0; i<line->n_allele; i++)
              {
@@ -398,7 +452,7 @@ static int realign(args_t *args, bcf1_t *line)
                  memcpy(als[i].s,ref,npad);
                  als[i].l += npad;
              }
-            line->pos -= npad;
+            new_pos -= npad;
          }
      }
      free(ref);
@@ -424,39 +478,43 @@ static int realign(args_t *args, bcf1_t *line)
              memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left);
              als[i].l -= ntrim_left;
          }
-        line->pos += ntrim_left;
+        new_pos += ntrim_left;
      }
  
      // Have the alleles changed?
      als[0].s[ als[0].l ] = 0;  // in order for strcmp to work
-    if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+    if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
+
+    set_old_rec_tag(args, line, line, 0);
  
      // Create new block of alleles and update
-    args->tmp_als_str.l = 0;
+    args->tmp_kstr.l = 0;
      for (i=0; i<line->n_allele; i++)
      {
-        if (i>0) kputc(',',&args->tmp_als_str);
-        kputsn(als[i].s,als[i].l,&args->tmp_als_str);
+        if (i>0) kputc(',',&args->tmp_kstr);
+        kputsn(als[i].s,als[i].l,&args->tmp_kstr);
      }
-    args->tmp_als_str.s[ args->tmp_als_str.l ] = 0;
-    bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s);
+    args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
+    bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
      args->nchanged++;
  
      // Update INFO/END if necessary
      int new_reflen = strlen(line->d.allele[0]);
-    if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
+    if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
      {
          // bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()
+        line->pos = new_pos;
          args->int32_arr[0] = line->pos + new_reflen;
-        bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1);
+        bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1);
      }
+    line->pos = new_pos;
  
      return ERR_OK;
  }
  
  static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
  {
-    #define BRANCH_NUMERIC(type,type_t) \
+    #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing) \
      { \
          const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); \
          int ntmp = args->ntmp_arr1 / sizeof(type_t); \
@@ -479,13 +537,13 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
                          tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -501,14 +559,24 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
                          tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \
              } \
-            if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
+            if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==info->key ) \
+            { \
+                int j; \
+                for (j=1; j<info->len; j++) \
+                    if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) vals[0] += vals[j]; \
+                vals[1] = vals[ialt+1]; \
+            } \
+            else \
+            { \
+                if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
+            } \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -524,7 +592,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -535,15 +603,15 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int
                  vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
                  vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,3); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \
          } \
          else \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
      {
-        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t); break;
-        case BCF_HT_REAL: BRANCH_NUMERIC(float, float); break;
+        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, vals[j]==bcf_int32_vector_end, vals[j]==bcf_int32_missing); break;
+        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(vals[j]), bcf_float_is_missing(vals[j])); break;
      }
      #undef BRANCH_NUMERIC
  }
@@ -591,7 +659,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_R )
      {
@@ -602,7 +670,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_G )
      {
@@ -617,16 +685,16 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i
          STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len);
          if ( len<0 ) return;   // wrong number of fields: skip
          str.s[len] = 0;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
  }
  static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst)
  {
      const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
      int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1);
-    bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+    bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
  }
  
  static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
@@ -652,11 +720,11 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
          }
          gt += ngts;
      }
-    bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+    bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
  }
  static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
  {
-    #define BRANCH_NUMERIC(type,type_t,is_vector_end,set_vector_end) \
+    #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
      { \
          const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
          int ntmp = args->ntmp_arr1 / sizeof(type_t); \
@@ -665,10 +733,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
          assert( nvals>0 ); \
          type_t *vals = (type_t *) args->tmp_arr1; \
          int len = bcf_hdr_id2length(args->hdr,BCF_HL_FMT,fmt->id); \
-        int i, nsmpl = bcf_hdr_nsamples(args->hdr); \
+        int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \
          if ( nvals==nsmpl ) /* all values are missing */ \
          { \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
              return; \
          } \
          if ( len==BCF_VL_A ) \
@@ -685,7 +753,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -699,7 +767,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  dst_vals += 1; \
                  src_vals += nvals; \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -715,7 +783,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \
@@ -723,14 +791,29 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
              } \
              nvals /= nsmpl; \
              type_t *src_vals = vals, *dst_vals = vals; \
-            for (i=0; i<nsmpl; i++) \
+            if ( args->keep_sum_ad >= 0 && args->keep_sum_ad==fmt->id ) \
              { \
-                dst_vals[0] = src_vals[0]; \
-                dst_vals[1] = src_vals[ialt+1]; \
-                dst_vals += 2; \
-                src_vals += nvals; \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    dst_vals[0] = src_vals[0]; \
+                    for (j=1; j<nvals; j++) \
+                        if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
+                    dst_vals[1] = src_vals[ialt+1]; \
+                    dst_vals += 2; \
+                    src_vals += nvals; \
+                } \
+            } \
+            else \
+            { \
+                for (i=0; i<nsmpl; i++) \
+                { \
+                    dst_vals[0] = src_vals[0]; \
+                    dst_vals[1] = src_vals[ialt+1]; \
+                    dst_vals += 2; \
+                    src_vals += nvals; \
+                } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -746,7 +829,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  } \
                  if ( args->force ) \
                  { \
-                    bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \
+                    bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \
                      return; \
                  } \
                  error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \
@@ -777,15 +860,15 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
                  dst_vals += all_haploid ? 2 : 3; \
                  src_vals += nvals; \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \
          } \
          else \
-            bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
      {
-        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, dst_vals[2]=bcf_int32_vector_end); break;
-        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+        case BCF_HT_INT:  BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
+        case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
      }
      #undef BRANCH_NUMERIC
  }
@@ -827,7 +910,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else if ( len==BCF_VL_R )
      {
@@ -845,7 +928,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else if ( len==BCF_VL_G )
      {
@@ -873,7 +956,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
                  }
                  if ( args->force )
                  {
-                    bcf_update_format_char(args->hdr,dst,tag,NULL,0);
+                    bcf_update_format_char(args->out_hdr,dst,tag,NULL,0);
                      return;
                  }
                  error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n",
@@ -904,13 +987,12 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i
              ptr += blen;
          }
          if ( maxlen<blen ) squeeze_format_char(str.s,blen,maxlen,nsmpl);
-        bcf_update_format_char(args->hdr,dst,tag,str.s,nsmpl*maxlen);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen);
      }
      else
-        bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+        bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
  }
  
-
  static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
  {
      int i;
@@ -943,11 +1025,11 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
  
          // Not quite sure how to handle IDs, they can be assigned to a specific
          // ALT.  For now we leave the ID unchanged for all.
-        bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : ".");
+        bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : ".");
  
          tmp.l = rlen;
          kputs(line->d.allele[i+1],&tmp);
-        bcf_update_alleles_str(args->hdr,dst,tmp.s);
+        bcf_update_alleles_str(args->out_hdr,dst,tmp.s);
  
          if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt);
  
@@ -960,6 +1042,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
              else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
              else split_info_string(args, line, info, i, dst);
          }
+        set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
  
          dst->n_sample = line->n_sample;
          for (j=0; j<line->n_fmt; j++)
@@ -1023,7 +1106,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -1047,7 +1130,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      vals[ args->maps[i].map[k] ] = vals2[k]; \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -1081,10 +1164,10 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf
                      } \
                  } \
              } \
-            bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \
          } \
          else \
-            bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \
+            bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key))
      {
@@ -1097,7 +1180,7 @@ static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t
  {
      const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key);
      int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-    bcf_update_info_flag(args->hdr,dst,tag,NULL,ret);
+    bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret);
  }
  int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
  static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst)
@@ -1125,7 +1208,7 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
          str.s[str.l] = 0;
          args->tmp_arr1  = (uint8_t*) str.s;
          args->ntmp_arr1 = str.m;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else if ( len==BCF_VL_G )
      {
@@ -1152,12 +1235,12 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info
          str.s[str.l] = 0;
          args->tmp_arr1  = (uint8_t*) str.s;
          args->ntmp_arr1 = str.m;
-        bcf_update_info_string(args->hdr,dst,tag,str.s);
+        bcf_update_info_string(args->out_hdr,dst,tag,str.s);
      }
      else
      {
          bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-        bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1);
+        bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1);
      }
  }
  static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst)
@@ -1200,7 +1283,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_
              gt2 += ngts;
          }
      }
-    bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl);
+    bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl);
  }
  static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals)
  {
@@ -1253,7 +1336,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals2; \
                  } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else if ( len==BCF_VL_R ) \
          { \
@@ -1281,7 +1364,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals2; \
                  } \
              } \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else if ( len==BCF_VL_G ) \
          { \
@@ -1360,10 +1443,10 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f
                      vals2 += nvals;\
                  }\
              }\
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \
          } \
          else \
-            bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
+            bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \
      }
      switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
      {
@@ -1380,7 +1463,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
      if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G )
      {
          int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1);
-        bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret);
+        bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret);
          return;
      }
  
@@ -1399,7 +1482,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
          for (i=0; i<nlines; i++)
          {
              int nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
-            if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+            if (nret<0) continue; /* format tag does not exist in this record, skip */
              nret /= nsmpl;
              for (k=0; k<nsmpl; k++)
              {
@@ -1446,7 +1529,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
              if ( i ) // we already have a copy
              {
                  nret = bcf_get_format_char(args->hdr,lines[i],tag,&args->tmp_arr1,&args->ntmp_arr1);
-                if (nret<0) continue; /* format tag does not exist in this record, skip */ \
+                if (nret<0) continue; /* format tag does not exist in this record, skip */
                  nret /= nsmpl;
              }
              for (k=0; k<nsmpl; k++)
@@ -1492,7 +1575,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm
      }
      args->ntmp_arr2 = str.m;
      args->tmp_arr2  = (uint8_t*)str.s;
-    bcf_update_format_char(args->hdr,dst,tag,str.s,str.l);
+    bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l);
  }
  
  char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb);   // see vcfmerge.c
@@ -1513,7 +1596,7 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
              dst->qual = lines[i]->qual;
      }
  
-    bcf_update_id(args->hdr, dst, lines[0]->d.id);
+    bcf_update_id(args->out_hdr, dst, lines[0]->d.id);
  
      // Merge and set the alleles, create a mapping from source allele indexes to dst idxs
      hts_expand0(map_t,nlines,args->mmaps,args->maps);   // a mapping for each line
@@ -1527,20 +1610,20 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
      }
      for (i=1; i<nlines; i++)
      {
-        if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id);
+        if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id);
          args->maps[i].nals = lines[i]->n_allele;
          hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map);
          args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals);
          if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1);
      }
-    bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals);
+    bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals);
      for (i=0; i<args->nals; i++)
      {
          free(args->als[i]);
          args->als[i] = NULL;
      }
  
-    if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
+    if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt);
      for (i=1; i<nlines; i++) {
          int j;
          for (j=0; j<lines[i]->d.n_flt; j++) {
@@ -1548,13 +1631,13 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t *
              // otherwise accumulate FILTERs
              if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) {
                  if (args->strict_filter) {
-                    bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
+                    bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt);
                      break;
                  }
                  else
                      continue;
              }
-            bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]);
+            bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]);
          }
      }
  
@@ -1724,7 +1807,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              if ( mrows_ready_to_flush(args, args->lines[k]) )
              {
                  while ( (line=mrows_flush(args)) )
-                    if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+                    if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
              }
              int merge = 1;
              if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY )
@@ -1757,18 +1840,30 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              prev_type |= line_type;
              if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]);
          }
-        if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      }
      if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n )
      {
          while ( (line=mrows_flush(args)) )
-            if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+            if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      }
  }
  
  static void init_data(args_t *args)
  {
      args->hdr = args->files->readers[0].header;
+    if ( args->keep_sum_ad )
+    {
+        args->keep_sum_ad = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"AD");
+        if ( args->keep_sum_ad < 0 ) error("Error: --keep-sum-ad requested but the tag AD is not present\n");
+    }
+    else
+        args->keep_sum_ad = -1;
+
+    args->out_hdr = bcf_hdr_dup(args->hdr);
+    if ( args->old_rec_tag )
+        bcf_hdr_printf(args->out_hdr,"##INFO=<ID=%s,Number=1,Type=String,Description=\"Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX\">",args->old_rec_tag); 
+
      rbuf_init(&args->rbuf, 100);
      args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
      if ( args->ref_fname )
@@ -1782,6 +1877,14 @@ static void init_data(args_t *args)
          args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t));
          args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr));
      }
+    if ( args->atomize==SPLIT )
+    {
+        args->abuf = abuf_init(args->hdr, SPLIT); 
+        abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr);
+        if ( args->old_rec_tag )
+            abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag);
+        abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele);
+    }
  }
  
  static void destroy_data(args_t *args)
@@ -1806,7 +1909,7 @@ static void destroy_data(args_t *args)
      for (i=0; i<args->ntmp_als; i++)
          free(args->tmp_als[i].s);
      free(args->tmp_als);
-    free(args->tmp_als_str.s);
+    free(args->tmp_kstr.s);
      if ( args->tmp_str )
      {
          for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
@@ -1818,15 +1921,16 @@ static void destroy_data(args_t *args)
      free(args->tmp_arr1);
      free(args->tmp_arr2);
      free(args->diploid);
+    if ( args->abuf ) abuf_destroy(args->abuf);
+    bcf_hdr_destroy(args->out_hdr);
      if ( args->mrow_out ) bcf_destroy1(args->mrow_out);
      if ( args->fai ) fai_destroy(args->fai);
      if ( args->mseq ) free(args->seq);
  }
  
  
-static void normalize_line(args_t *args, bcf1_t **line_ptr)
+static void normalize_line(args_t *args, bcf1_t *line)
  {
-    bcf1_t *line = *line_ptr;
      if ( args->fai )
      {
          if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line);
@@ -1856,8 +1960,8 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr)
      rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines);
      int i,j;
      i = j = rbuf_append(&args->rbuf);
-    if ( !args->lines[i] ) args->lines[i] = bcf_init1();
-    SWAP(bcf1_t*, (*line_ptr), args->lines[i]);
+    if ( args->lines[i] ) bcf_destroy(args->lines[i]);
+    args->lines[i] = bcf_dup(line);
      while ( rbuf_prev(&args->rbuf,&i) )
      {
          if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]);
@@ -1865,21 +1969,38 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr)
      }
  }
  
+static bcf1_t *next_atomized_line(args_t *args)   // Returns the next record for normalize_vcf() to process, or NULL once the reader has no further line
+{
+    bcf1_t *rec = NULL;
+    if ( args->atomize==SPLIT )   // args->abuf is only created by init_data() when atomize==SPLIT
+    {
+        rec = abuf_flush(args->abuf, 0);   // presumably hands back one record still queued from an earlier abuf_push() - TODO confirm against abuf.h
+        if ( rec ) return rec;   // serve the buffered record before reading any new input
+    }
+
+    if ( !bcf_sr_next_line(args->files) ) return NULL;   // nothing buffered and the synced reader has no more lines
+
+    if ( args->atomize==SPLIT )
+    {
+        abuf_push(args->abuf,bcf_sr_get_line(args->files,0));   // give the freshly read line of reader 0 to the atomization buffer
+        return abuf_flush(args->abuf, 0);   // NOTE(review): a NULL here would end the caller's while-loop early; assumes every pushed line yields at least one record - confirm
+    }
+    return bcf_sr_get_line(args->files,0);   // not atomizing: return reader 0's current line as is
+}
  static void normalize_vcf(args_t *args)
  {
-    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
-    if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+    args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+    if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
      if ( args->n_threads )
-        hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
-    if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
-    if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+        hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
+    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm");
+    if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
  
+    bcf1_t *line;
      int prev_rid = -1, prev_pos = -1, prev_type = 0;
-    while ( bcf_sr_next_line(args->files) )
+    while ( (line = next_atomized_line(args)) )
      {
          args->ntotal++;
-
-        bcf1_t *line = args->files->readers[0].buffer[0];
          if ( args->rmdup )
          {
              int line_type = bcf_get_variant_types(line);
@@ -1903,7 +2024,7 @@ static void normalize_vcf(args_t *args)
  
          // still on the same chromosome?
          int i,j,ilast = rbuf_last(&args->rbuf);
-        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome
+        if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome
  
          int split = 0;
          if ( args->mrows_op==MROWS_SPLIT )
@@ -1918,13 +2039,13 @@ static void normalize_vcf(args_t *args)
                  args->nsplit++;
                  split_multiallelic_to_biallelics(args, line);
                  for (j=0; j<args->ntmp_lines; j++)
-                    normalize_line(args, &args->tmp_lines[j]);
+                    normalize_line(args, args->tmp_lines[j]);
              }
              else
                  split = 0;
          }
          if ( !split )
-            normalize_line(args, &args->files->readers[0].buffer[0]);
+            normalize_line(args, line);
  
          // find out how many sites to flush
          ilast = rbuf_last(&args->rbuf);
@@ -1934,10 +2055,10 @@ static void normalize_vcf(args_t *args)
              if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break;
              j++;
          }
-        if ( j>0 ) flush_buffer(args, out, j);
+        if ( j>0 ) flush_buffer(args, args->out, j);
      }
-    flush_buffer(args, out, args->rbuf.n);
-    if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
+    flush_buffer(args, args->out, args->rbuf.n);
+    if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname);
  
      fprintf(bcftools_stderr,"Lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped);
      if ( args->check_ref & CHECK_REF_FIX )
@@ -1953,23 +2074,27 @@ static void usage(void)
      fprintf(bcftools_stderr, "Usage:   bcftools norm [options] <in.vcf.gz>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -c, --check-ref <e|w|x|s>         check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
-    fprintf(bcftools_stderr, "    -D, --remove-duplicates           remove duplicate lines of the same type.\n");
-    fprintf(bcftools_stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|all|exact\n");
-    fprintf(bcftools_stderr, "    -f, --fasta-ref <file>            reference sequence\n");
-    fprintf(bcftools_stderr, "        --force                       try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
-    fprintf(bcftools_stderr, "    -m, --multiallelics <-|+>[type]   split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
-    fprintf(bcftools_stderr, "        --no-version                  do not append version and command line to the header\n");
-    fprintf(bcftools_stderr, "    -N, --do-not-normalize            do not normalize indels (with -m or -c s)\n");
-    fprintf(bcftools_stderr, "    -o, --output <file>               write output to a file [standard output]\n");
-    fprintf(bcftools_stderr, "    -O, --output-type <type>          'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
-    fprintf(bcftools_stderr, "    -r, --regions <region>            restrict to comma-separated list of regions\n");
-    fprintf(bcftools_stderr, "    -R, --regions-file <file>         restrict to regions listed in a file\n");
-    fprintf(bcftools_stderr, "    -s, --strict-filter               when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
-    fprintf(bcftools_stderr, "    -t, --targets <region>            similar to -r but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "    -T, --targets-file <file>         similar to -R but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "        --threads <int>               use multithreading with <int> worker threads [0]\n");
-    fprintf(bcftools_stderr, "    -w, --site-win <int>              buffer for sorting lines which changed position during realignment [1000]\n");
+    fprintf(bcftools_stderr, "    -a, --atomize                   Decompose complex variants (e.g. MNVs become consecutive SNVs)\n");
+    fprintf(bcftools_stderr, "        --atom-overlaps '*'|.       Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n");
+    fprintf(bcftools_stderr, "    -c, --check-ref e|w|x|s         Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
+    fprintf(bcftools_stderr, "    -D, --remove-duplicates         Remove duplicate lines of the same type.\n");
+    fprintf(bcftools_stderr, "    -d, --rm-dup TYPE               Remove duplicate snps|indels|both|all|exact\n");
+    fprintf(bcftools_stderr, "    -f, --fasta-ref FILE            Reference sequence\n");
+    fprintf(bcftools_stderr, "        --force                     Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
+    fprintf(bcftools_stderr, "        --keep-sum TAG,..           Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
+    fprintf(bcftools_stderr, "    -m, --multiallelics -|+TYPE     Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+    fprintf(bcftools_stderr, "        --no-version                Do not append version and command line to the header\n");
+    fprintf(bcftools_stderr, "    -N, --do-not-normalize          Do not normalize indels (with -m or -c s)\n");
+    fprintf(bcftools_stderr, "        --old-rec-tag STR           Annotate modified records with INFO/STR indicating the original variant\n");
+    fprintf(bcftools_stderr, "    -o, --output FILE               Write output to a file [standard output]\n");
+    fprintf(bcftools_stderr, "    -O, --output-type TYPE          'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+    fprintf(bcftools_stderr, "    -r, --regions REGION            Restrict to comma-separated list of regions\n");
+    fprintf(bcftools_stderr, "    -R, --regions-file FILE         Restrict to regions listed in a file\n");
+    fprintf(bcftools_stderr, "    -s, --strict-filter             When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
+    fprintf(bcftools_stderr, "    -t, --targets REGION            Similar to -r but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "    -T, --targets-file FILE         Similar to -R but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "        --threads INT               Use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "    -w, --site-win INT              Buffer for sorting lines which changed position during realignment [1000]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "   # normalize and left-align indels\n");
@@ -1978,7 +2103,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "   # split multi-allelic sites\n");
      fprintf(bcftools_stderr, "   bcftools norm -m- in.vcf\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfnorm(int argc, char *argv[])
@@ -1997,11 +2122,16 @@ int main_vcfnorm(int argc, char *argv[])
      args->do_indels = 1;
      int region_is_file  = 0;
      int targets_is_file = 0;
+    args->use_star_allele = 1;
  
      static struct option loptions[] =
      {
          {"help",no_argument,NULL,'h'},
          {"force",no_argument,NULL,7},
+        {"atomize",no_argument,NULL,'a'},
+        {"atom-overlaps",required_argument,NULL,11},
+        {"old-rec-tag",required_argument,NULL,12},
+        {"keep-sum",required_argument,NULL,10},
          {"fasta-ref",required_argument,NULL,'f'},
          {"do-not-normalize",no_argument,NULL,'N'},
          {"multiallelics",required_argument,NULL,'m'},
@@ -2021,8 +2151,21 @@ int main_vcfnorm(int argc, char *argv[])
          {NULL,0,NULL,0}
      };
      char *tmp;
-    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) {
          switch (c) {
+            case  10:
+                // possibly generalize this also to INFO/AD and other tags
+                if ( strcasecmp("ad",optarg) )
+                    error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n");
+                args->keep_sum_ad = 1;  // this will be set to the header id or -1 in init_data
+                break;
+            case 'a': args->atomize = SPLIT; break;
+            case 11 :
+                if ( optarg[0]=='*' ) args->use_star_allele = 1;
+                else if ( optarg[0]=='.' ) args->use_star_allele = 0;
+                else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
+                break;
+            case 12 : args->old_rec_tag = optarg; break;
              case 'N': args->do_indels = 0; break;
              case 'd':
                  if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
@@ -2094,7 +2237,7 @@ int main_vcfnorm(int argc, char *argv[])
      }
      else fname = argv[optind];
  
-    if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n");
+    if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n");
      if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT;
      if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n");
  
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c

index a161529a04cc0cac884180dff77cd410379d35ff..c4ea52d61f4358a23ea376f513eeef0de13467f2 100644 (file)
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -1,6 +1,6 @@
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -148,7 +148,7 @@ typedef struct _args_t
      char **plugin_paths;
  
      char **argv, *output_fname, *regions_list, *targets_list;
-    int argc, drop_header, verbose, record_cmd_line;
+    int argc, drop_header, verbose, record_cmd_line, plist_only;
  }
  args_t;
  
@@ -178,7 +178,7 @@ static void add_plugin_paths(args_t *args, const char *path)
                  args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
                  args->plugin_paths[args->nplugin_paths] = dir;
                  args->nplugin_paths++;
-                if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
+                if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
              }
              else
              {
@@ -220,6 +220,8 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
      if ( fname[0]=='/' ) is_absolute_path = 1;
  #endif
+
+    kstring_t err = {0,0,0};
      if ( !is_absolute_path )
      {
          int i;
@@ -231,16 +233,14 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
              handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
  #endif
-            if ( args->verbose > 1 )
-            {
-                if ( !handle )
+            if ( !handle )
  #ifdef _WIN32
-                    fprintf(stderr,"%s:\n\tLoadLibraryA   .. %lu\n", tmp, GetLastError());
+                ksprintf(&err,"LoadLibraryA   .. %lu\n", GetLastError());
  #else
-                    fprintf(stderr,"%s:\n\tdlopen   .. %s\n", tmp, dlerror());
+                ksprintf(&err,"%s:\n\tdlopen   .. %s\n", tmp,dlerror());
  #endif
-                else fprintf(stderr,"%s:\n\tplugin open   .. ok\n", tmp);
-            }
+            else if ( args->verbose > 1 )
+                fprintf(stderr,"%s:\n\tplugin open   .. ok\n", tmp);
              free(tmp);
              if ( handle ) return handle;
          }
@@ -251,33 +251,46 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
      handle = dlopen(fname, RTLD_NOW);
  #endif
-    if ( args->verbose > 1 )
-    {
-        if ( !handle )
+    if ( !handle )
  #ifdef _WIN32
-            fprintf(stderr,"%s:\n\tLoadLibraryA   .. %lu\n", fname, GetLastError());
+        ksprintf(&err,"LoadLibraryA   .. %lu\n", GetLastError());
  #else
-            fprintf(stderr,"%s:\n\tdlopen   .. %s\n", fname, dlerror());
+        ksprintf(&err,"%s:\n\tdlopen   .. %s\n", fname,dlerror());
  #endif
-        else fprintf(stderr,"%s:\n\tplugin open   .. ok\n", fname);
-    }
+    else if ( args->verbose > 1 )
+        fprintf(stderr,"%s:\n\tplugin open   .. ok\n", fname);
+
+    if ( !handle && (!args->plist_only || args->verbose>1) )
+        fprintf(stderr,"%s",err.s);
+    free(err.s);
  
      return handle;
  }
  
-static void print_plugin_usage_hint(void)
+static void print_plugin_usage_hint(const char *name)
  {
-    fprintf(stderr, "\nNo functional bcftools plugins were found");
+    if ( name )
+        fprintf(stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name);
+    else
+        fprintf(stderr, "\nNo functional bcftools plugins were found");
      if ( !getenv("BCFTOOLS_PLUGINS") )
-        fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+    {
+        fprintf(stderr,". The environment variable BCFTOOLS_PLUGINS is not set");
+#ifdef PLUGINPATH
+        fprintf(stderr,"\nand no usable plugins were found in %s", PLUGINPATH);
+#endif
+        fprintf(stderr,".\n\n");
+    }
      else
+    {
          fprintf(stderr,
                  " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
                  "- Is the plugin path correct?\n\n"
-                "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+                "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n"
                  "\n",
                  getenv("BCFTOOLS_PLUGINS")
                 );
+    }
  }
  
  static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
@@ -289,7 +302,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
      {
          if ( exit_on_error )
          {
-            print_plugin_usage_hint();
+            print_plugin_usage_hint(fname);
              error("Could not load \"%s\".\n\n", fname);
          }
          return -1;
@@ -410,12 +423,9 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
      return 0;
  }
  
-static void init_plugin(args_t *args)
+static void check_version(args_t *args)
  {
      static int warned_bcftools = 0, warned_htslib = 0;
-
-    int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
-    if ( ret<0 ) error("The plugin exited with an error.\n");
      const char *bver, *hver;
      args->plugin.version(&bver, &hver);
      if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
@@ -428,6 +438,13 @@ static void init_plugin(args_t *args)
          fprintf(stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
          warned_htslib = 1;
      }
+}
+
+static void init_plugin(args_t *args)
+{
+    int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+    if ( ret<0 ) error("The plugin exited with an error.\n");
+    check_version(args);
      args->drop_header += ret;
  }
  
@@ -487,7 +504,7 @@ static int list_plugins(args_t *args)
          if ( args->verbose ) printf("\n");
      }
      else
-        print_plugin_usage_hint();
+        print_plugin_usage_hint(NULL);
      free(str.s);
      return nplugins ? 0 : 1;
  }
@@ -505,7 +522,7 @@ static void init_data(args_t *args)
      if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
      if ( !args->drop_header )
      {
-        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
          if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
@@ -592,10 +609,9 @@ int main_plugin(int argc, char *argv[])
      args->n_threads = 0;
      args->record_cmd_line = 1;
      args->nplugin_paths = -1;
-    int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+    int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
  
      if ( argc==1 ) usage(args);
-
      char *plugin_name = NULL;
      if ( argv[1][0]!='-' )
      {
@@ -606,6 +622,7 @@ int main_plugin(int argc, char *argv[])
          load_plugin(args, plugin_name, 1, &args->plugin);
          if ( args->plugin.run )
          {
+            check_version(args);
              int ret = args->plugin.run(argc, argv);
              destroy_data(args);
              free(args);
@@ -646,13 +663,17 @@ int main_plugin(int argc, char *argv[])
                      default: error("The output type \"%s\" not recognised\n", optarg);
                  };
                  break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
              case 'T': args->targets_list = optarg; targets_is_file = 1; break;
-            case 'l': plist_only = 1; break;
+            case 'l': args->plist_only = 1; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case '?':
@@ -660,8 +681,8 @@ int main_plugin(int argc, char *argv[])
              default: error("Unknown argument: %s\n", optarg);
          }
      }
-    if ( plist_only )  return list_plugins(args);
-    if ( usage_only && ! plugin_name ) usage(args);
+    if ( args->plist_only )  return list_plugins(args);
+    if ( !plugin_name ) usage(args);
  
      if ( version_only )
      {
@@ -682,7 +703,7 @@ int main_plugin(int argc, char *argv[])
      }
  
      char *fname = NULL;
-    if ( optind>=argc || argv[optind][0]=='-' )
+    if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) )
      {
          args->plugin.argc = argc - optind + 1;
          args->plugin.argv = argv + optind - 1;
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c

index 3b63c8c67c3441b1d58f0037872be5c58496ea58..2143a0a0471bc1eaa88a6551a427fe4cf3984499 100644 (file)
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -150,7 +150,7 @@ typedef struct _args_t
      char **plugin_paths;
  
      char **argv, *output_fname, *regions_list, *targets_list;
-    int argc, drop_header, verbose, record_cmd_line;
+    int argc, drop_header, verbose, record_cmd_line, plist_only;
  }
  args_t;
  
@@ -180,7 +180,7 @@ static void add_plugin_paths(args_t *args, const char *path)
                  args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
                  args->plugin_paths[args->nplugin_paths] = dir;
                  args->nplugin_paths++;
-                if ( args->verbose > 1 ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir);
+                if ( args->verbose > 1 && strcmp(".",dir) ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir);
              }
              else
              {
@@ -222,6 +222,8 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
      if ( fname[0]=='/' ) is_absolute_path = 1;
  #endif
+
+    kstring_t err = {0,0,0};
      if ( !is_absolute_path )
      {
          int i;
@@ -233,16 +235,14 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
              handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
  #endif
-            if ( args->verbose > 1 )
-            {
-                if ( !handle )
+            if ( !handle )
  #ifdef _WIN32
-                    fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA   .. %lu\n", tmp, GetLastError());
+                ksprintf(&err,"LoadLibraryA   .. %lu\n", GetLastError());
  #else
-                    fprintf(bcftools_stderr,"%s:\n\tdlopen   .. %s\n", tmp, dlerror());
+                ksprintf(&err,"%s:\n\tdlopen   .. %s\n", tmp,dlerror());
  #endif
-                else fprintf(bcftools_stderr,"%s:\n\tplugin open   .. ok\n", tmp);
-            }
+            else if ( args->verbose > 1 )
+                fprintf(bcftools_stderr,"%s:\n\tplugin open   .. ok\n", tmp);
              free(tmp);
              if ( handle ) return handle;
          }
@@ -253,33 +253,46 @@ static void *dlopen_plugin(args_t *args, const char *fname)
  #else
      handle = dlopen(fname, RTLD_NOW);
  #endif
-    if ( args->verbose > 1 )
-    {
-        if ( !handle )
+    if ( !handle )
  #ifdef _WIN32
-            fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA   .. %lu\n", fname, GetLastError());
+        ksprintf(&err,"LoadLibraryA   .. %lu\n", GetLastError());
  #else
-            fprintf(bcftools_stderr,"%s:\n\tdlopen   .. %s\n", fname, dlerror());
+        ksprintf(&err,"%s:\n\tdlopen   .. %s\n", fname,dlerror());
  #endif
-        else fprintf(bcftools_stderr,"%s:\n\tplugin open   .. ok\n", fname);
-    }
+    else if ( args->verbose > 1 )
+        fprintf(bcftools_stderr,"%s:\n\tplugin open   .. ok\n", fname);
+
+    if ( !handle && (!args->plist_only || args->verbose>1) )
+        fprintf(bcftools_stderr,"%s",err.s);
+    free(err.s);
  
      return handle;
  }
  
-static void print_plugin_usage_hint(void)
+static void print_plugin_usage_hint(const char *name)
  {
-    fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found");
+    if ( name )
+        fprintf(bcftools_stderr, "\nThe bcftools plugin \"%s\" was not found or is not functional", name);
+    else
+        fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found");
      if ( !getenv("BCFTOOLS_PLUGINS") )
-        fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n");
+    {
+        fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set");
+#ifdef PLUGINPATH
+        fprintf(bcftools_stderr,"\nand no usable plugins were found in %s", PLUGINPATH);
+#endif
+        fprintf(bcftools_stderr,".\n\n");
+    }
      else
+    {
          fprintf(bcftools_stderr,
                  " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n"
                  "- Is the plugin path correct?\n\n"
-                "- Run \"bcftools plugin -lv\" for more detailed error output.\n"
+                "- Run \"bcftools plugin -l\" or \"bcftools plugin -lvv\" for a list of available plugins.\n"
                  "\n",
                  getenv("BCFTOOLS_PLUGINS")
                 );
+    }
  }
  
  static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugin_t *plugin)
@@ -291,7 +304,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
      {
          if ( exit_on_error )
          {
-            print_plugin_usage_hint();
+            print_plugin_usage_hint(fname);
              error("Could not load \"%s\".\n\n", fname);
          }
          return -1;
@@ -412,12 +425,9 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi
      return 0;
  }
  
-static void init_plugin(args_t *args)
+static void check_version(args_t *args)
  {
      static int warned_bcftools = 0, warned_htslib = 0;
-
-    int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
-    if ( ret<0 ) error("The plugin exited with an error.\n");
      const char *bver, *hver;
      args->plugin.version(&bver, &hver);
      if ( strcmp(bver,bcftools_version()) && !warned_bcftools )
@@ -430,6 +440,13 @@ static void init_plugin(args_t *args)
          fprintf(bcftools_stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver);
          warned_htslib = 1;
      }
+}
+
+static void init_plugin(args_t *args)
+{
+    int ret = args->plugin.init(args->plugin.argc,args->plugin.argv,args->hdr,args->hdr_out);
+    if ( ret<0 ) error("The plugin exited with an error.\n");
+    check_version(args);
      args->drop_header += ret;
  }
  
@@ -489,7 +506,7 @@ static int list_plugins(args_t *args)
          if ( args->verbose ) fprintf(bcftools_stdout, "\n");
      }
      else
-        print_plugin_usage_hint();
+        print_plugin_usage_hint(NULL);
      free(str.s);
      return nplugins ? 0 : 1;
  }
@@ -507,7 +524,7 @@ static void init_data(args_t *args)
      if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
      if ( !args->drop_header )
      {
-        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+        args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
          if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
          if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
          if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
@@ -560,7 +577,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   -v, --verbose               print verbose information, -vv increases verbosity\n");
      fprintf(bcftools_stderr, "   -V, --version               print version string and exit\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  static int is_verbose(int argc, char *argv[])
@@ -594,10 +611,9 @@ int main_plugin(int argc, char *argv[])
      args->n_threads = 0;
      args->record_cmd_line = 1;
      args->nplugin_paths = -1;
-    int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0;
+    int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
  
      if ( argc==1 ) usage(args);
-
      char *plugin_name = NULL;
      if ( argv[1][0]!='-' )
      {
@@ -608,6 +624,7 @@ int main_plugin(int argc, char *argv[])
          load_plugin(args, plugin_name, 1, &args->plugin);
          if ( args->plugin.run )
          {
+            check_version(args);
              int ret = args->plugin.run(argc, argv);
              destroy_data(args);
              free(args);
@@ -648,13 +665,17 @@ int main_plugin(int argc, char *argv[])
                      default: error("The output type \"%s\" not recognised\n", optarg);
                  };
                  break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
              case 'T': args->targets_list = optarg; targets_is_file = 1; break;
-            case 'l': plist_only = 1; break;
+            case 'l': args->plist_only = 1; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case  8 : args->record_cmd_line = 0; break;
              case '?':
@@ -662,8 +683,8 @@ int main_plugin(int argc, char *argv[])
              default: error("Unknown argument: %s\n", optarg);
          }
      }
-    if ( plist_only )  return list_plugins(args);
-    if ( usage_only && ! plugin_name ) usage(args);
+    if ( args->plist_only )  return list_plugins(args);
+    if ( !plugin_name ) usage(args);
  
      if ( version_only )
      {
@@ -684,7 +705,7 @@ int main_plugin(int argc, char *argv[])
      }
  
      char *fname = NULL;
-    if ( optind>=argc || argv[optind][0]=='-' )
+    if ( optind>=argc || (argv[optind][0]=='-' && argv[optind][1]) )
      {
          args->plugin.argc = argc - optind + 1;
          args->plugin.argv = argv + optind - 1;
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c

index 806ecf1cca1bd422fc1433b7cb070620452bd729..6568c8208ce9c8a6fababf7ef78ba342ee1171e5 100644 (file)
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -1,6 +1,6 @@
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -226,7 +226,7 @@ static void usage(void)
      fprintf(stderr, "    -H, --print-header                print header\n");
      fprintf(stderr, "    -i, --include <expr>              select sites for which the expression is true (see man page for details)\n");
      fprintf(stderr, "    -l, --list-samples                print the list of samples and exit\n");
-    fprintf(stderr, "    -o, --output-file <file>          output file name [stdout]\n");
+    fprintf(stderr, "    -o, --output <file>               output file name [stdout]\n");
      fprintf(stderr, "    -r, --regions <region>            restrict to comma-separated list of regions\n");
      fprintf(stderr, "    -R, --regions-file <file>         restrict to regions listed in a file\n");
      fprintf(stderr, "    -s, --samples <list>              list of samples to include\n");
@@ -257,6 +257,7 @@ int main_vcfquery(int argc, char *argv[])
          {"exclude",1,0,'e'},
          {"format",1,0,'f'},
          {"output-file",1,0,'o'},
+        {"output",1,0,'o'},
          {"regions",1,0,'r'},
          {"regions-file",1,0,'R'},
          {"targets",1,0,'t'},
@@ -296,8 +297,12 @@ int main_vcfquery(int argc, char *argv[])
                      args->format_str = str.s;
                      break;
                  }
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c

index 66afb086900e91fb447d573f0b9a70f8116bad91..fc264b7f771236871584b8799f0f18e1ababd2ed 100644 (file)
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -228,7 +228,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -H, --print-header                print header\n");
      fprintf(bcftools_stderr, "    -i, --include <expr>              select sites for which the expression is true (see man page for details)\n");
      fprintf(bcftools_stderr, "    -l, --list-samples                print the list of samples and exit\n");
-    fprintf(bcftools_stderr, "    -o, --output-file <file>          output file name [bcftools_stdout]\n");
+    fprintf(bcftools_stderr, "    -o, --output <file>               output file name [bcftools_stdout]\n");
      fprintf(bcftools_stderr, "    -r, --regions <region>            restrict to comma-separated list of regions\n");
      fprintf(bcftools_stderr, "    -R, --regions-file <file>         restrict to regions listed in a file\n");
      fprintf(bcftools_stderr, "    -s, --samples <list>              list of samples to include\n");
@@ -241,7 +241,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "Examples:\n");
      fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfquery(int argc, char *argv[])
@@ -259,6 +259,7 @@ int main_vcfquery(int argc, char *argv[])
          {"exclude",1,0,'e'},
          {"format",1,0,'f'},
          {"output-file",1,0,'o'},
+        {"output",1,0,'o'},
          {"regions",1,0,'r'},
          {"regions-file",1,0,'R'},
          {"targets",1,0,'t'},
@@ -298,8 +299,12 @@ int main_vcfquery(int argc, char *argv[])
                      args->format_str = str.s;
                      break;
                  }
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'r': args->regions_list = optarg; break;
              case 'R': args->regions_list = optarg; regions_is_file = 1; break;
              case 't': args->targets_list = optarg; break;
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c

index 1c822cbcaa90cf80c13a344de7258c3ff3cfff1f..8e95c9a79cf1e271247a066d24a2ee0022490164 100644 (file)
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -1,6 +1,6 @@
  /*  vcfroh.c -- HMM model for detecting runs of autozygosity.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -103,7 +103,7 @@ typedef struct _args_t
      int ntot;                   // some stats to detect if things didn't go wrong
      int nno_af;                 // number of sites rejected because AF could not be determined
      int nfiltered;              // .. because of filters
-    int nnot_biallelic, ndup;
+    int nno_alt, nmultiallelic, ndup;
      smpl_t *smpl;               // HMM data for each sample
      smpl_ilist_t *af_smpl;      // list of samples to estimate AF from (--estimate-AF)
      smpl_ilist_t *roh_smpl;     // list of samples to analyze (--samples, --samples-file)
@@ -111,6 +111,7 @@ typedef struct _args_t
      int af_from_PL;             // estimate AF from FMT/PL rather than FMT/GT
      char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
      int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+    int include_noalt_sites;
      BGZF *out;
      kstring_t str;
  
@@ -548,6 +549,7 @@ static void flush_viterbi(args_t *args, int ismpl)
                      {
                          smpl->rg.state = 1;
                          smpl->rg.beg = smpl->sites[i];
+                        smpl->rg.end = smpl->sites[i];
                          smpl->rg.rid = args->prev_rid;
                          smpl->rg.qual  = qual;
                          smpl->rg.nqual = 1;
@@ -656,8 +658,10 @@ static void flush_viterbi(args_t *args, int ismpl)
      }
  }
  
-int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
  {
+    if ( tgt->nals < 2 )
+        error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s);
      if ( tgt->nals != line->n_allele ) return -1;    // number of alleles does not match
  
      int i;
@@ -837,7 +841,7 @@ int process_line(args_t *args, bcf1_t *line, int ial)
      else if ( args->af_fname ) 
      {
          // Read AF from a file
-        ret = read_AF(args->files->targets, line, &alt_freq);
+        ret = read_AF(args, args->files->targets, line, &alt_freq);
      }
      else if ( args->dflt_AF > 0 )
      {
@@ -997,33 +1001,32 @@ static void vcfroh(args_t *args, bcf1_t *line)
  
      // Skip unwanted lines, for simplicity we consider only biallelic sites 
      if ( line->rid == args->skip_rid ) return;
-    if ( line->n_allele==1 ) { args->nnot_biallelic++; return; }   // no ALT allele
-    if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; }   // cannot be bi-allelic, even with <*>
  
      // This can be raw callable VCF with the symbolic unseen allele <*>
-    int ial = 0;
+    int ial = 0, nalt = line->n_allele - 1;
      for (i=1; i<line->n_allele; i++)
-        if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
-    if ( ial==0 )    // normal VCF, the symbolic allele is not present
      {
-        if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; }   // not biallelic
-        ial = 1;
+        if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("<NON_REF>",line->d.allele[i]) ) nalt--;
+        else if ( !ial ) ial = i;
      }
-    else
+
+    if ( !nalt ) // no ALT allele
      {
-        if ( line->n_allele!=3 ) return;    // not biallelic
-        ial = ial==1 ? 2 : 1;               // <*> can come in any order
+        args->nno_alt++;
+        if ( !args->include_noalt_sites ) return;
+    }
+    else if ( nalt>1 )
+    {
+        args->nmultiallelic++;
+        return;
      }
+
      if ( args->snps_only && !bcf_is_snp(line) ) return;
  
      // Initialize genetic map
      int skip_rid = 0;
      if ( args->prev_rid<0 )
-    {
-        args->prev_rid = line->rid;
-        args->prev_pos = line->pos;
          skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
-    }
  
      // New chromosome?
      if ( args->prev_rid!=line->rid )
@@ -1071,7 +1074,7 @@ static void usage(args_t *args)
      fprintf(stderr, "General Options:\n");
      fprintf(stderr, "        --AF-dflt <float>              if AF is not known, use this allele frequency [skip]\n");
      fprintf(stderr, "        --AF-tag <TAG>                 use TAG for allele frequency\n");
-    fprintf(stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
+    fprintf(stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
      fprintf(stderr, "    -b  --buffer-size <int[,int]>      buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
      fprintf(stderr, "                                           If the first number is negative, it is interpreted as the maximum memory to\n");
      fprintf(stderr, "                                           use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
@@ -1082,6 +1085,7 @@ static void usage(args_t *args)
      fprintf(stderr, "                                           Safe value to use is 30 to account for GT errors.\n");
      fprintf(stderr, "        --include <expr>               select sites for which the expression is true\n");
      fprintf(stderr, "    -i, --ignore-homref                skip hom-ref genotypes (0/0)\n");
+    fprintf(stderr, "        --include-noalt                include sites with no ALT allele (ignored by default)\n");
      fprintf(stderr, "    -I, --skip-indels                  skip indels as their genotypes are enriched for errors\n");
      fprintf(stderr, "    -m, --genetic-map <file>           genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
      fprintf(stderr, "                                           is replaced with chromosome name\n");
@@ -1122,6 +1126,7 @@ int main_vcfroh(int argc, char *argv[])
          {"AF-dflt",1,0,2},
          {"include",1,0,3},
          {"exclude",1,0,4},
+        {"include-noalt",0,0,5},
          {"buffer-size",1,0,'b'},
          {"ignore-homref",0,0,'i'},
          {"estimate-AF",1,0,'e'},
@@ -1154,8 +1159,13 @@ int main_vcfroh(int argc, char *argv[])
                  args->dflt_AF = strtod(optarg,&tmp);
                  if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
                  break;
-            case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
-            case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+            case  3 :
+                if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case  4 :
+                if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 5: args->include_noalt_sites = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O': 
                  if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
@@ -1257,7 +1267,7 @@ int main_vcfroh(int argc, char *argv[])
          fprintf(stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
      else
          fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
-    fprintf(stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
+    fprintf(stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup);
      if ( nmin==0 )
      {
          fprintf(stderr,"No usable sites were found.\n");
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c

index 33defa488633647f611d23a03c5f5ac259ff4f4e..b742faac47afc0770776936fbf94abf3615129a3 100644 (file)
--- a/bcftools/vcfroh.c.pysam.c
+++ b/bcftools/vcfroh.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfroh.c -- HMM model for detecting runs of autozygosity.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -105,7 +105,7 @@ typedef struct _args_t
      int ntot;                   // some stats to detect if things didn't go wrong
      int nno_af;                 // number of sites rejected because AF could not be determined
      int nfiltered;              // .. because of filters
-    int nnot_biallelic, ndup;
+    int nno_alt, nmultiallelic, ndup;
      smpl_t *smpl;               // HMM data for each sample
      smpl_ilist_t *af_smpl;      // list of samples to estimate AF from (--estimate-AF)
      smpl_ilist_t *roh_smpl;     // list of samples to analyze (--samples, --samples-file)
@@ -113,6 +113,7 @@ typedef struct _args_t
      int af_from_PL;             // estimate AF from FMT/PL rather than FMT/GT
      char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
      int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+    int include_noalt_sites;
      BGZF *out;
      kstring_t str;
  
@@ -550,6 +551,7 @@ static void flush_viterbi(args_t *args, int ismpl)
                      {
                          smpl->rg.state = 1;
                          smpl->rg.beg = smpl->sites[i];
+                        smpl->rg.end = smpl->sites[i];
                          smpl->rg.rid = args->prev_rid;
                          smpl->rg.qual  = qual;
                          smpl->rg.nqual = 1;
@@ -658,8 +660,10 @@ static void flush_viterbi(args_t *args, int ismpl)
      }
  }
  
-int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
+int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
  {
+    if ( tgt->nals < 2 )
+        error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s);
      if ( tgt->nals != line->n_allele ) return -1;    // number of alleles does not match
  
      int i;
@@ -769,7 +773,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr
              case BCF_BT_INT8:  BRANCH(int8_t); break;
              case BCF_BT_INT16: BRANCH(int16_t); break;
              case BCF_BT_INT32: BRANCH(int32_t); break;
-            default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+            default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
          }
          #undef BRANCH
      }
@@ -799,7 +803,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr
              case BCF_BT_INT8:  BRANCH(int8_t); break;
              case BCF_BT_INT16: BRANCH(int16_t); break;
              case BCF_BT_INT32: BRANCH(int32_t); break;
-            default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+            default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
          }
          #undef BRANCH
      }
@@ -839,7 +843,7 @@ int process_line(args_t *args, bcf1_t *line, int ial)
      else if ( args->af_fname ) 
      {
          // Read AF from a file
-        ret = read_AF(args->files->targets, line, &alt_freq);
+        ret = read_AF(args, args->files->targets, line, &alt_freq);
      }
      else if ( args->dflt_AF > 0 )
      {
@@ -941,7 +945,7 @@ int process_line(args_t *args, bcf1_t *line, int ial)
                  case BCF_BT_INT8:  BRANCH(int8_t); break;
                  case BCF_BT_INT16: BRANCH(int16_t); break;
                  case BCF_BT_INT32: BRANCH(int32_t); break;
-                default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+                default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); bcftools_exit(1);
              }
              #undef BRANCH
          }
@@ -999,33 +1003,32 @@ static void vcfroh(args_t *args, bcf1_t *line)
  
      // Skip unwanted lines, for simplicity we consider only biallelic sites 
      if ( line->rid == args->skip_rid ) return;
-    if ( line->n_allele==1 ) { args->nnot_biallelic++; return; }   // no ALT allele
-    if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; }   // cannot be bi-allelic, even with <*>
  
      // This can be raw callable VCF with the symbolic unseen allele <*>
-    int ial = 0;
+    int ial = 0, nalt = line->n_allele - 1;
      for (i=1; i<line->n_allele; i++)
-        if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
-    if ( ial==0 )    // normal VCF, the symbolic allele is not present
      {
-        if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; }   // not biallelic
-        ial = 1;
+        if ( !strcmp("<*>",line->d.allele[i]) || !strcmp("<NON_REF>",line->d.allele[i]) ) nalt--;
+        else if ( !ial ) ial = i;
      }
-    else
+
+    if ( !nalt ) // no ALT allele
      {
-        if ( line->n_allele!=3 ) return;    // not biallelic
-        ial = ial==1 ? 2 : 1;               // <*> can come in any order
+        args->nno_alt++;
+        if ( !args->include_noalt_sites ) return;
+    }
+    else if ( nalt>1 )
+    {
+        args->nmultiallelic++;
+        return;
      }
+
      if ( args->snps_only && !bcf_is_snp(line) ) return;
  
      // Initialize genetic map
      int skip_rid = 0;
      if ( args->prev_rid<0 )
-    {
-        args->prev_rid = line->rid;
-        args->prev_pos = line->pos;
          skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
-    }
  
      // New chromosome?
      if ( args->prev_rid!=line->rid )
@@ -1073,7 +1076,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "General Options:\n");
      fprintf(bcftools_stderr, "        --AF-dflt <float>              if AF is not known, use this allele frequency [skip]\n");
      fprintf(bcftools_stderr, "        --AF-tag <TAG>                 use TAG for allele frequency\n");
-    fprintf(bcftools_stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
+    fprintf(bcftools_stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
      fprintf(bcftools_stderr, "    -b  --buffer-size <int[,int]>      buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
      fprintf(bcftools_stderr, "                                           If the first number is negative, it is interpreted as the maximum memory to\n");
      fprintf(bcftools_stderr, "                                           use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
@@ -1084,6 +1087,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "                                           Safe value to use is 30 to account for GT errors.\n");
      fprintf(bcftools_stderr, "        --include <expr>               select sites for which the expression is true\n");
      fprintf(bcftools_stderr, "    -i, --ignore-homref                skip hom-ref genotypes (0/0)\n");
+    fprintf(bcftools_stderr, "        --include-noalt                include sites with no ALT allele (ignored by default)\n");
      fprintf(bcftools_stderr, "    -I, --skip-indels                  skip indels as their genotypes are enriched for errors\n");
      fprintf(bcftools_stderr, "    -m, --genetic-map <file>           genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
      fprintf(bcftools_stderr, "                                           is replaced with chromosome name\n");
@@ -1103,7 +1107,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -H, --az-to-hw <float>             P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
      fprintf(bcftools_stderr, "    -V, --viterbi-training <float>     estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfroh(int argc, char *argv[])
@@ -1124,6 +1128,7 @@ int main_vcfroh(int argc, char *argv[])
          {"AF-dflt",1,0,2},
          {"include",1,0,3},
          {"exclude",1,0,4},
+        {"include-noalt",0,0,5},
          {"buffer-size",1,0,'b'},
          {"ignore-homref",0,0,'i'},
          {"estimate-AF",1,0,'e'},
@@ -1156,8 +1161,13 @@ int main_vcfroh(int argc, char *argv[])
                  args->dflt_AF = strtod(optarg,&tmp);
                  if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
                  break;
-            case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
-            case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
+            case  3 :
+                if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case  4 :
+                if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 5: args->include_noalt_sites = 1; break;
              case 'o': args->output_fname = optarg; break;
              case 'O': 
                  if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
@@ -1259,7 +1269,7 @@ int main_vcfroh(int argc, char *argv[])
          fprintf(bcftools_stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
      else
          fprintf(bcftools_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
-    fprintf(bcftools_stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
+    fprintf(bcftools_stderr,"Number of lines filtered/no AF/no alt/multiallelic/dup: %d/%d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nno_alt,args->nmultiallelic,args->ndup);
      if ( nmin==0 )
      {
          fprintf(bcftools_stderr,"No usable sites were found.\n");
diff --git a/bcftools/vcfsom.c b/bcftools/vcfsom.c

index ed864227938c54fe7eeeff0f9ea091e0246aa766..db01d24fd55c6e44576c90677fd2d03b71250071 100644 (file)
--- a/bcftools/vcfsom.c
+++ b/bcftools/vcfsom.c
@@ -1,6 +1,6 @@
  /*  vcfsom.c -- SOM (Self-Organizing Map) filtering.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2020 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -165,15 +166,16 @@ void annots_reader_close(args_t *args)
  static void som_write_map(char *prefix, som_t **som, int nsom)
  {
      FILE *fp = open_file(NULL,"w","%s.som",prefix);
-    fwrite("SOMv1",5,1,fp);
-    fwrite(&nsom,sizeof(int),1,fp);
+    size_t nw;
+    if ( (nw=fwrite("SOMv1",5,1,fp))!=5 ) error("Failed to write 5 bytes\n");
+    if ( (nw=fwrite(&nsom,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
      int i;
      for (i=0; i<nsom; i++)
      {
-        fwrite(&som[i]->size,sizeof(int),1,fp);
-        fwrite(&som[i]->kdim,sizeof(int),1,fp);
-        fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
-        fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+        if ( (nw=fwrite(&som[i]->size,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+        if ( (nw=fwrite(&som[i]->kdim,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+        if ( (nw=fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp))!=sizeof(double)*som[i]->size*som[i]->kdim ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size*som[i]->kdim);
+        if ( (nw=fwrite(som[i]->c,sizeof(double),som[i]->size,fp))!=sizeof(double)*som[i]->size ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size);
      }
      if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
  }
diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c

index b8368f6091cb3b2837f35b48d75a5ea428be9308..effd3521066f8d5efc2238ad0107d8d63af81455 100644 (file)
--- a/bcftools/vcfsom.c.pysam.c
+++ b/bcftools/vcfsom.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfsom.c -- SOM (Self-Organizing Map) filtering.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2020 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ THE SOFTWARE.  */
  #include <stdio.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <ctype.h>
  #include <string.h>
  #include <errno.h>
@@ -167,15 +168,16 @@ void annots_reader_close(args_t *args)
  static void som_write_map(char *prefix, som_t **som, int nsom)
  {
      FILE *fp = open_file(NULL,"w","%s.som",prefix);
-    fwrite("SOMv1",5,1,fp);
-    fwrite(&nsom,sizeof(int),1,fp);
+    size_t nw;
+    if ( (nw=fwrite("SOMv1",5,1,fp))!=5 ) error("Failed to write 5 bytes\n");
+    if ( (nw=fwrite(&nsom,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
      int i;
      for (i=0; i<nsom; i++)
      {
-        fwrite(&som[i]->size,sizeof(int),1,fp);
-        fwrite(&som[i]->kdim,sizeof(int),1,fp);
-        fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp);
-        fwrite(som[i]->c,sizeof(double),som[i]->size,fp);
+        if ( (nw=fwrite(&som[i]->size,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+        if ( (nw=fwrite(&som[i]->kdim,sizeof(int),1,fp))!=sizeof(int) ) error("Failed to write %zu bytes\n",sizeof(int));
+        if ( (nw=fwrite(som[i]->w,sizeof(double),som[i]->size*som[i]->kdim,fp))!=sizeof(double)*som[i]->size*som[i]->kdim ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size*som[i]->kdim);
+        if ( (nw=fwrite(som[i]->c,sizeof(double),som[i]->size,fp))!=sizeof(double)*som[i]->size ) error("Failed to write %zu bytes\n",sizeof(double)*som[i]->size);
      }
      if ( fclose(fp) ) error("%s.som: fclose failed\n",prefix);
  }
@@ -638,7 +640,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "    -n, --ntrain-sites <int>           effective number of training sites [number of good sites]\n");
      fprintf(bcftools_stderr, "    -r, --random-seed <int>            random seed, 0 for time() [1]\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfsom(int argc, char *argv[])
diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c

index 99aa59868d186bade40a37fd53bc007bb31d2aa5..7ec13fb4d23e6938c7bc04427ee72064931b17c4 100644 (file)
--- a/bcftools/vcfsort.c
+++ b/bcftools/vcfsort.c
@@ -1,6 +1,6 @@
  /*  vcfsort.c -- sort subcommand
  
-   Copyright (C) 2017 Genome Research Ltd.
+   Copyright (C) 2017-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -227,7 +227,7 @@ void merge_blocks(args_t *args)
          blk_read(args, bhp, args->hdr, blk);
      }
  
-    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      while ( bhp->ndat )
      {
@@ -252,19 +252,23 @@ static void usage(args_t *args)
      fprintf(stderr, "Usage:   bcftools sort [OPTIONS] <FILE.vcf>\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -m, --max-mem <float>[kMG]    maximum memory to use [768M]\n");    // using metric units, 1M=1e6
-    fprintf(stderr, "    -o, --output-file <file>      output file name [stdout]\n");
-    fprintf(stderr, "    -O, --output-type <b|u|z|v>   b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
-    fprintf(stderr, "    -T, --temp-dir <dir>          temporary files [/tmp/bcftools-sort.XXXXXX]\n");
+    fprintf(stderr, "    -m, --max-mem FLOAT[kMG]    maximum memory to use [768M]\n");    // using metric units, 1M=1e6
+    fprintf(stderr, "    -o, --output FILE           output file name [stdout]\n");
+    fprintf(stderr, "    -O, --output-type b|u|z|v   b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+#ifdef _WIN32
+    fprintf(stderr, "    -T, --temp-dir DIR          temporary files [/bcftools.XXXXXX]\n");
+#else
+    fprintf(stderr, "    -T, --temp-dir DIR          temporary files [/tmp/bcftools.XXXXXX]\n");
+#endif
      fprintf(stderr, "\n");
      exit(1);
  }
  
-size_t parse_mem_string(char *str) 
+size_t parse_mem_string(const char *str) 
  {
      char *tmp;
      double mem = strtod(str, &tmp);
-    if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+    if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str);
      if ( !strcasecmp("k",tmp) ) mem *= 1000;
      else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
      else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
@@ -274,21 +278,8 @@ size_t parse_mem_string(char *str)
  void mkdir_p(const char *fmt, ...);
  static void init(args_t *args)
  {
-#ifdef _WIN32
-    char tmp_path[MAX_PATH];
-    int ret = GetTempPath(MAX_PATH, tmp_path);
-    if (!ret || ret > MAX_PATH)
-        error("Could not get the path to the temporary folder\n");
-    if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH)
-        error("Full path to the temporary folder is too long\n");
-    strcat(tmp_path, "/bcftools-sort.XXXXXX");
-    args->tmp_dir = strdup(tmp_path);
-#else
-    args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX");
-#endif
-    size_t len = strlen(args->tmp_dir);
-    if ( !strcmp("XXXXXX",args->tmp_dir+len-6) )
-    {
+    args->tmp_dir = init_tmp_prefix(args->tmp_dir);
+
  #ifdef _WIN32
          int ret = mkdir(mktemp(args->tmp_dir), 0700);
          if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno));
@@ -298,10 +289,6 @@ static void init(args_t *args)
          int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR);
          if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno));
  #endif
-    }
-    else {
-        mkdir_p("%s/",args->tmp_dir);
-    }
  
      fprintf(stderr,"Writing to %s\n", args->tmp_dir);
  }
@@ -326,6 +313,7 @@ int main_sort(int argc, char *argv[])
          {"temp-dir",required_argument,NULL,'T'},
          {"output-type",required_argument,NULL,'O'},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {"help",no_argument,NULL,'h'},
          {0,0,0,0}
      };
diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c

index 542fc28ccbc4b6264d3fedbebd868461f87cfa63..1fd74d3ea6caf787e2e43ee06ea4da7c08295126 100644 (file)
--- a/bcftools/vcfsort.c.pysam.c
+++ b/bcftools/vcfsort.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfsort.c -- sort subcommand
  
-   Copyright (C) 2017 Genome Research Ltd.
+   Copyright (C) 2017-2021 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -90,7 +90,7 @@ void clean_files_and_throw(args_t *args, const char *format, ...)
      vfprintf(bcftools_stderr, format, ap);
      va_end(ap);
      clean_files(args);
-    exit(-1);
+    bcftools_exit(-1);
  }
  
  int cmp_bcf_pos(const void *aptr, const void *bptr)
@@ -229,7 +229,7 @@ void merge_blocks(args_t *args)
          blk_read(args, bhp, args->hdr, blk);
      }
  
-    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
+    htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
      if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
      while ( bhp->ndat )
      {
@@ -254,19 +254,23 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "Usage:   bcftools sort [OPTIONS] <FILE.vcf>\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -m, --max-mem <float>[kMG]    maximum memory to use [768M]\n");    // using metric units, 1M=1e6
-    fprintf(bcftools_stderr, "    -o, --output-file <file>      output file name [bcftools_stdout]\n");
-    fprintf(bcftools_stderr, "    -O, --output-type <b|u|z|v>   b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
-    fprintf(bcftools_stderr, "    -T, --temp-dir <dir>          temporary files [/tmp/bcftools-sort.XXXXXX]\n");
+    fprintf(bcftools_stderr, "    -m, --max-mem FLOAT[kMG]    maximum memory to use [768M]\n");    // using metric units, 1M=1e6
+    fprintf(bcftools_stderr, "    -o, --output FILE           output file name [bcftools_stdout]\n");
+    fprintf(bcftools_stderr, "    -O, --output-type b|u|z|v   b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+#ifdef _WIN32
+    fprintf(bcftools_stderr, "    -T, --temp-dir DIR          temporary files [/bcftools.XXXXXX]\n");
+#else
+    fprintf(bcftools_stderr, "    -T, --temp-dir DIR          temporary files [/tmp/bcftools.XXXXXX]\n");
+#endif
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
-size_t parse_mem_string(char *str) 
+size_t parse_mem_string(const char *str) 
  {
      char *tmp;
      double mem = strtod(str, &tmp);
-    if ( tmp==str ) error("Could not parse: --max-mem %s\n", str);
+    if ( tmp==str ) error("Could not parse the memory string: \"%s\"\n", str);
      if ( !strcasecmp("k",tmp) ) mem *= 1000;
      else if ( !strcasecmp("m",tmp) ) mem *= 1000*1000;
      else if ( !strcasecmp("g",tmp) ) mem *= 1000*1000*1000;
@@ -276,21 +280,8 @@ size_t parse_mem_string(char *str)
  void mkdir_p(const char *fmt, ...);
  static void init(args_t *args)
  {
-#ifdef _WIN32
-    char tmp_path[MAX_PATH];
-    int ret = GetTempPath(MAX_PATH, tmp_path);
-    if (!ret || ret > MAX_PATH)
-        error("Could not get the path to the temporary folder\n");
-    if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH)
-        error("Full path to the temporary folder is too long\n");
-    strcat(tmp_path, "/bcftools-sort.XXXXXX");
-    args->tmp_dir = strdup(tmp_path);
-#else
-    args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX");
-#endif
-    size_t len = strlen(args->tmp_dir);
-    if ( !strcmp("XXXXXX",args->tmp_dir+len-6) )
-    {
+    args->tmp_dir = init_tmp_prefix(args->tmp_dir);
+
  #ifdef _WIN32
          int ret = mkdir(mktemp(args->tmp_dir), 0700);
          if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno));
@@ -300,10 +291,6 @@ static void init(args_t *args)
          int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR);
          if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno));
  #endif
-    }
-    else {
-        mkdir_p("%s/",args->tmp_dir);
-    }
  
      fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir);
  }
@@ -328,6 +315,7 @@ int main_sort(int argc, char *argv[])
          {"temp-dir",required_argument,NULL,'T'},
          {"output-type",required_argument,NULL,'O'},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {"help",no_argument,NULL,'h'},
          {0,0,0,0}
      };
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c

index ffa367d7fa0b99122d30356de40c541b99cea04a..601c557d9b030a4731686c291d5c52e06f7b450e 100644 (file)
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -1,6 +1,6 @@
  /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
  
-    Copyright (C) 2012-2017 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -31,6 +31,7 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <math.h>
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
@@ -40,6 +41,7 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "filter.h"
  #include "bin.h"
+#include "dist.h"
  
  // Logic of the filters: include or exclude sites which match the filters?
  #define FLT_INCLUDE 1
@@ -57,7 +59,7 @@ typedef struct
      float min, max;
      uint64_t *vals_ts, *vals_tv;
      void *val;
-    int nbins, type, m_val;
+    int nbins, type, m_val, idx;
  }
  user_stats_t;
  
@@ -81,7 +83,9 @@ typedef struct
      #endif
      int ts_alt1, tv_alt1;
      #if QUAL_STATS
-        int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+        // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin
+        // Only SNPs that are the 1st alternate allele are counted
+        dist_t *qual_ts, *qual_tv, *qual_indels;
      #endif
      int *insertions, *deletions, m_indel;   // maximum indel length
      int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
@@ -186,13 +190,6 @@ static inline int idist_i2bin(idist_t *d, int i)
      return i-1+d->min;
  }
  
-static inline int clip_nonnegative(float x, int limit)
-{
-    if (x >= limit || isnan(x)) return limit - 1;
-    else if (x <= 0.0) return 0;
-    else return (int) x;
-}
-
  #define IC_DBG 0
  #if IC_DBG
  static void _indel_ctx_print1(_idc1_t *idc)
@@ -349,12 +346,29 @@ static void add_user_stats(args_t *args, char *str)
      args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
      user_stats_t *usr = &args->usr[args->nusr-1];
      memset(usr,0,sizeof(*usr));
-    usr->min  = 0;
-    usr->max  = 1;
+    usr->min   = 0;
+    usr->max   = 1;
      usr->nbins = 100;
+    usr->idx   = 0;
  
      char *tmp = str;
      while ( *tmp && *tmp!=':' ) tmp++;
+
+    // Tag with an index or just tag? (e.g. PV4[1] vs DP)
+    if ( tmp > str && tmp[-1]==']' )
+    {
+        char *ptr = tmp;
+        while ( ptr>str && *ptr!='[' ) ptr--;
+        if ( *ptr=='[' )
+        {
+            char *ptr2;
+            usr->idx = strtol(ptr+1, &ptr2, 10);
+            if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx);
+            if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str);
+            *ptr = 0;
+        }
+    }
+
      usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
      memcpy(usr->tag,str,tmp-str);
  
@@ -465,10 +479,9 @@ static void init_stats(args_t *args)
          int j;
          for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
          #if QUAL_STATS
-            stats->qual_ts     = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_tv     = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_snps   = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+            stats->qual_ts     = dist_init(5);
+            stats->qual_tv     = dist_init(5);
+            stats->qual_indels = dist_init(5);
          #endif
          if ( args->files->n_smpl )
          {
@@ -548,10 +561,9 @@ static void destroy_stats(args_t *args)
          for (j=0; j<3; j++)
              if (stats->af_repeats[j]) free(stats->af_repeats[j]);
          #if QUAL_STATS
-            if (stats->qual_ts) free(stats->qual_ts);
-            if (stats->qual_tv) free(stats->qual_tv);
-            if (stats->qual_snps) free(stats->qual_snps);
-            if (stats->qual_indels) free(stats->qual_indels);
+            if (stats->qual_ts) dist_destroy(stats->qual_ts);
+            if (stats->qual_tv) dist_destroy(stats->qual_tv);
+            if (stats->qual_indels) dist_destroy(stats->qual_indels);
          #endif
          #if HWE_STATS
              free(stats->af_hwe);
@@ -678,8 +690,8 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      bcf1_t *line = reader->buffer[0];
  
      #if QUAL_STATS
-        int iqual = clip_nonnegative(line->qual, args->m_qual);
-        stats->qual_indels[iqual]++;
+        int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
+        dist_insert(stats->qual_indels, iqual);
      #endif
  
      // Check if the indel is near an exon for the frameshift statistics
@@ -780,7 +792,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
  
  static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
  {
-    int i;
+    int i, nval;
      for (i=0; i<stats->nusr; i++)
      {
          user_stats_t *usr = &stats->usr[i];
@@ -788,13 +800,15 @@ static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
          float val;
          if ( usr->type==BCF_HT_REAL )
          {
-            if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
-            val = ((float*)usr->val)[0];
+            if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+            if ( usr->idx >= nval ) continue;
+            val = ((float*)usr->val)[usr->idx];
          }
          else
          {
-            if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
-            val = ((int32_t*)usr->val)[0];
+            if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+            if ( usr->idx >= nval ) continue;
+            val = ((int32_t*)usr->val)[usr->idx];
          }
          int idx;
          if ( val<=usr->min ) idx = 0;
@@ -813,8 +827,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      if ( ref<0 ) return;
  
      #if QUAL_STATS
-        int iqual = clip_nonnegative(line->qual, args->m_qual);
-        stats->qual_snps[iqual]++;
+        int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
      #endif
  
      int i;
@@ -833,7 +846,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
              {
                  stats->ts_alt1++;
                  #if QUAL_STATS
-                    stats->qual_ts[iqual]++;
+                    dist_insert(stats->qual_ts,iqual);
                  #endif
                  do_user_stats(stats, reader, 1);
              }
@@ -845,7 +858,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
              {
                  stats->tv_alt1++;
                  #if QUAL_STATS
-                    stats->qual_tv[iqual]++;
+                    dist_insert(stats->qual_tv,iqual);
                  #endif
                  do_user_stats(stats, reader, 0);
              }
@@ -1354,21 +1367,50 @@ static void print_stats(args_t *args)
          }
      }
      #if QUAL_STATS
-        printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+        printf("# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
          for (id=0; id<args->nstats; id++)
          {
              stats_t *stats = &args->stats[id];
-            for (i=0; i<args->m_qual; i++)
+            int ndist_ts = dist_nbins(stats->qual_ts);
+            int ndist_tv = dist_nbins(stats->qual_tv);
+            int ndist_in = dist_nbins(stats->qual_indels);
+            int ndist_max = ndist_ts;
+            if ( ndist_max < ndist_tv ) ndist_max = ndist_tv;
+            if ( ndist_max < ndist_in ) ndist_max = ndist_in;
+            uint32_t beg, end;
+            uint32_t nts, ntv, nin;
+            for (i=0; i<ndist_max; i++)
              {
-                if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0  ) continue;
-                printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+                nts = ntv = nin = 0;
+                float qval = -1;
+                if ( i < ndist_ts )
+                {
+                    nts = dist_get(stats->qual_ts, i, &beg, &end);
+                    qval = beg>0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( i < ndist_tv )
+                {
+                    ntv = dist_get(stats->qual_tv, i, &beg, &end);
+                    if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( i < ndist_in )
+                {
+                    nin = dist_get(stats->qual_indels, i, &beg, &end);
+                    if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( nts+ntv+nin==0 ) continue;
+
+                printf("QUAL\t%d\t",id);
+                if ( qval==-1 ) printf(".");
+                else printf("%.1f",qval);
+                printf("\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin);
              }
          }
      #endif
      for (i=0; i<args->nusr; i++)
      {
-        printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
-            args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+        printf("# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+            args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx);
          for (id=0; id<args->nstats; id++)
          {
              user_stats_t *usr = &args->stats[id].usr[i];
@@ -1377,8 +1419,8 @@ static void print_stats(args_t *args)
              {
                  if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue;   // skip empty bins
                  float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
-                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
-                printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n";
+                printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
              }
          }
      }
@@ -1482,10 +1524,10 @@ static void print_stats(args_t *args)
                  printf("# NRD and discordance is calculated as follows:\n");
                  printf("#   m .. number of matches\n");
                  printf("#   x .. number of mismatches\n");
-                printf("#   NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
-                printf("#   RR discordance = xRR / (xRR + mRR)\n");
-                printf("#   RA discordance = xRA / (xRA + mRA)\n");
-                printf("#   AA discordance = xAA / (xAA + mAA)\n");
+                printf("#   NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+                printf("#   RR discordance = 100 * xRR / (xRR + mRR)\n");
+                printf("#   RA discordance = 100 * xRA / (xRA + mRA)\n");
+                printf("#   AA discordance = 100 * xAA / (xAA + mAA)\n");
                  printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
              }
              else
@@ -1704,26 +1746,27 @@ static void usage(void)
      fprintf(stderr, "Usage:   bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "        --af-bins <list>               allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
-    fprintf(stderr, "        --af-tag <string>              allele frequency tag to use, by default estimated from AN,AC or GT\n");
-    fprintf(stderr, "    -1, --1st-allele-only              include only 1st allele at multiallelic sites\n");
-    fprintf(stderr, "    -c, --collapse <string>            treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
-    fprintf(stderr, "    -d, --depth <int,int,int>          depth distribution: min,max,bin size [0,500,1]\n");
-    fprintf(stderr, "    -e, --exclude <expr>               exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "    -E, --exons <file.gz>              tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
-    fprintf(stderr, "    -f, --apply-filters <list>         require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
-    fprintf(stderr, "    -F, --fasta-ref <file>             faidx indexed reference sequence file to determine INDEL context\n");
-    fprintf(stderr, "    -i, --include <expr>               select sites for which the expression is true (see man page for details)\n");
-    fprintf(stderr, "    -I, --split-by-ID                  collect stats for sites with ID separately (known vs novel)\n");
-    fprintf(stderr, "    -r, --regions <region>             restrict to comma-separated list of regions\n");
-    fprintf(stderr, "    -R, --regions-file <file>          restrict to regions listed in a file\n");
-    fprintf(stderr, "    -s, --samples <list>               list of samples for sample stats, \"-\" to include all samples\n");
-    fprintf(stderr, "    -S, --samples-file <file>          file of samples to include\n");
-    fprintf(stderr, "    -t, --targets <region>             similar to -r but streams rather than index-jumps\n");
-    fprintf(stderr, "    -T, --targets-file <file>          similar to -R but streams rather than index-jumps\n");
-    fprintf(stderr, "    -u, --user-tstv <TAG[:min:max:n]>  collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
-    fprintf(stderr, "        --threads <int>                use multithreading with <int> worker threads [0]\n");
-    fprintf(stderr, "    -v, --verbose                      produce verbose per-site and per-sample output\n");
+    fprintf(stderr, "        --af-bins LIST               Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+    fprintf(stderr, "        --af-tag STRING              Allele frequency tag to use, by default estimated from AN,AC or GT\n");
+    fprintf(stderr, "    -1, --1st-allele-only            Include only 1st allele at multiallelic sites\n");
+    fprintf(stderr, "    -c, --collapse STRING            Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+    fprintf(stderr, "    -d, --depth INT,INT,INT          Depth distribution: min,max,bin size [0,500,1]\n");
+    fprintf(stderr, "    -e, --exclude EXPR               Exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "    -E, --exons FILE.gz              Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n");
+    fprintf(stderr, "    -f, --apply-filters LIST         Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+    fprintf(stderr, "    -F, --fasta-ref FILE             Faidx indexed reference sequence file to determine INDEL context\n");
+    fprintf(stderr, "    -i, --include EXPR               Select sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "    -I, --split-by-ID                Collect stats for sites with ID separately (known vs novel)\n");
+    fprintf(stderr, "    -r, --regions REGION             Restrict to comma-separated list of regions\n");
+    fprintf(stderr, "    -R, --regions-file FILE          Restrict to regions listed in a file\n");
+    fprintf(stderr, "    -s, --samples LIST               List of samples for sample stats, \"-\" to include all samples\n");
+    fprintf(stderr, "    -S, --samples-file FILE          File of samples to include\n");
+    fprintf(stderr, "    -t, --targets REGION             Similar to -r but streams rather than index-jumps\n");
+    fprintf(stderr, "    -T, --targets-file FILE          Similar to -R but streams rather than index-jumps\n");
+    fprintf(stderr, "    -u, --user-tstv TAG[:min:max:n]  Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+    fprintf(stderr, "                                       A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
+    fprintf(stderr, "        --threads INT                Use multithreading with <int> worker threads [0]\n");
+    fprintf(stderr, "    -v, --verbose                    Produce verbose per-site and per-sample output\n");
      fprintf(stderr, "\n");
      exit(1);
  }
@@ -1795,8 +1838,12 @@ int main_vcfstats(int argc, char *argv[])
              case 's': args->samples_list = optarg; break;
              case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
              case 'I': args->split_by_id = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case 'h':
              case '?': usage(); break;
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c

index c52d016568d35e765ff1f89e56938d18bbdb22f2..050a68ada5fa4b3073164c77d97f7e4b8eb68170 100644 (file)
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
  
-    Copyright (C) 2012-2017 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -33,6 +33,7 @@ THE SOFTWARE.  */
  #include <stdarg.h>
  #include <unistd.h>
  #include <getopt.h>
+#include <assert.h>
  #include <math.h>
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
@@ -42,6 +43,7 @@ THE SOFTWARE.  */
  #include "bcftools.h"
  #include "filter.h"
  #include "bin.h"
+#include "dist.h"
  
  // Logic of the filters: include or exclude sites which match the filters?
  #define FLT_INCLUDE 1
@@ -59,7 +61,7 @@ typedef struct
      float min, max;
      uint64_t *vals_ts, *vals_tv;
      void *val;
-    int nbins, type, m_val;
+    int nbins, type, m_val, idx;
  }
  user_stats_t;
  
@@ -83,7 +85,9 @@ typedef struct
      #endif
      int ts_alt1, tv_alt1;
      #if QUAL_STATS
-        int *qual_ts, *qual_tv, *qual_snps, *qual_indels;
+        // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin
+        // Only SNPs that are the 1st alternate allele are counted
+        dist_t *qual_ts, *qual_tv, *qual_indels;
      #endif
      int *insertions, *deletions, m_indel;   // maximum indel length
      int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
@@ -188,13 +192,6 @@ static inline int idist_i2bin(idist_t *d, int i)
      return i-1+d->min;
  }
  
-static inline int clip_nonnegative(float x, int limit)
-{
-    if (x >= limit || isnan(x)) return limit - 1;
-    else if (x <= 0.0) return 0;
-    else return (int) x;
-}
-
  #define IC_DBG 0
  #if IC_DBG
  static void _indel_ctx_print1(_idc1_t *idc)
@@ -351,12 +348,29 @@ static void add_user_stats(args_t *args, char *str)
      args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr);
      user_stats_t *usr = &args->usr[args->nusr-1];
      memset(usr,0,sizeof(*usr));
-    usr->min  = 0;
-    usr->max  = 1;
+    usr->min   = 0;
+    usr->max   = 1;
      usr->nbins = 100;
+    usr->idx   = 0;
  
      char *tmp = str;
      while ( *tmp && *tmp!=':' ) tmp++;
+
+    // Tag with an index or just tag? (e.g. PV4[1] vs DP)
+    if ( tmp > str && tmp[-1]==']' )
+    {
+        char *ptr = tmp;
+        while ( ptr>str && *ptr!='[' ) ptr--;
+        if ( *ptr=='[' )
+        {
+            char *ptr2;
+            usr->idx = strtol(ptr+1, &ptr2, 10);
+            if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx);
+            if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str);
+            *ptr = 0;
+        }
+    }
+
      usr->tag = (char*)calloc(tmp-str+2,sizeof(char));
      memcpy(usr->tag,str,tmp-str);
  
@@ -467,10 +481,9 @@ static void init_stats(args_t *args)
          int j;
          for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int));
          #if QUAL_STATS
-            stats->qual_ts     = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_tv     = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_snps   = (int*) calloc(args->m_qual,sizeof(int));
-            stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int));
+            stats->qual_ts     = dist_init(5);
+            stats->qual_tv     = dist_init(5);
+            stats->qual_indels = dist_init(5);
          #endif
          if ( args->files->n_smpl )
          {
@@ -550,10 +563,9 @@ static void destroy_stats(args_t *args)
          for (j=0; j<3; j++)
              if (stats->af_repeats[j]) free(stats->af_repeats[j]);
          #if QUAL_STATS
-            if (stats->qual_ts) free(stats->qual_ts);
-            if (stats->qual_tv) free(stats->qual_tv);
-            if (stats->qual_snps) free(stats->qual_snps);
-            if (stats->qual_indels) free(stats->qual_indels);
+            if (stats->qual_ts) dist_destroy(stats->qual_ts);
+            if (stats->qual_tv) dist_destroy(stats->qual_tv);
+            if (stats->qual_indels) dist_destroy(stats->qual_indels);
          #endif
          #if HWE_STATS
              free(stats->af_hwe);
@@ -680,8 +692,8 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      bcf1_t *line = reader->buffer[0];
  
      #if QUAL_STATS
-        int iqual = clip_nonnegative(line->qual, args->m_qual);
-        stats->qual_indels[iqual]++;
+        int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
+        dist_insert(stats->qual_indels, iqual);
      #endif
  
      // Check if the indel is near an exon for the frameshift statistics
@@ -782,7 +794,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
  
  static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
  {
-    int i;
+    int i, nval;
      for (i=0; i<stats->nusr; i++)
      {
          user_stats_t *usr = &stats->usr[i];
@@ -790,13 +802,15 @@ static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts)
          float val;
          if ( usr->type==BCF_HT_REAL )
          {
-            if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
-            val = ((float*)usr->val)[0];
+            if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+            if ( usr->idx >= nval ) continue;
+            val = ((float*)usr->val)[usr->idx];
          }
          else
          {
-            if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue;
-            val = ((int32_t*)usr->val)[0];
+            if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue;
+            if ( usr->idx >= nval ) continue;
+            val = ((int32_t*)usr->val)[usr->idx];
          }
          int idx;
          if ( val<=usr->min ) idx = 0;
@@ -815,8 +829,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
      if ( ref<0 ) return;
  
      #if QUAL_STATS
-        int iqual = clip_nonnegative(line->qual, args->m_qual);
-        stats->qual_snps[iqual]++;
+        int iqual = (isnan(line->qual) || line->qual<0) ? 0 : 1 + (int)(line->qual*10);
      #endif
  
      int i;
@@ -835,7 +848,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
              {
                  stats->ts_alt1++;
                  #if QUAL_STATS
-                    stats->qual_ts[iqual]++;
+                    dist_insert(stats->qual_ts,iqual);
                  #endif
                  do_user_stats(stats, reader, 1);
              }
@@ -847,7 +860,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
              {
                  stats->tv_alt1++;
                  #if QUAL_STATS
-                    stats->qual_tv[iqual]++;
+                    dist_insert(stats->qual_tv,iqual);
                  #endif
                  do_user_stats(stats, reader, 0);
              }
@@ -872,7 +885,7 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int
          case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
          case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
          case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break;
+        default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); bcftools_exit(1); break;
      }
      #undef BRANCH_INT
  
@@ -1020,7 +1033,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
              case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
              case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
              case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
          }
          #undef BRANCH_INT
      }
@@ -1051,7 +1064,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
              case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
              case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
              case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
-            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break;
+            default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
          }
          #undef BRANCH_INT
      }
@@ -1356,21 +1369,50 @@ static void print_stats(args_t *args)
          }
      }
      #if QUAL_STATS
-        fprintf(bcftools_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
+        fprintf(bcftools_stdout, "# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n");
          for (id=0; id<args->nstats; id++)
          {
              stats_t *stats = &args->stats[id];
-            for (i=0; i<args->m_qual; i++)
+            int ndist_ts = dist_nbins(stats->qual_ts);
+            int ndist_tv = dist_nbins(stats->qual_tv);
+            int ndist_in = dist_nbins(stats->qual_indels);
+            int ndist_max = ndist_ts;
+            if ( ndist_max < ndist_tv ) ndist_max = ndist_tv;
+            if ( ndist_max < ndist_in ) ndist_max = ndist_in;
+            uint32_t beg, end;
+            uint32_t nts, ntv, nin;
+            for (i=0; i<ndist_max; i++)
              {
-                if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0  ) continue;
-                fprintf(bcftools_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]);
+                nts = ntv = nin = 0;
+                float qval = -1;
+                if ( i < ndist_ts )
+                {
+                    nts = dist_get(stats->qual_ts, i, &beg, &end);
+                    qval = beg>0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( i < ndist_tv )
+                {
+                    ntv = dist_get(stats->qual_tv, i, &beg, &end);
+                    if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( i < ndist_in )
+                {
+                    nin = dist_get(stats->qual_indels, i, &beg, &end);
+                    if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1;
+                }
+                if ( nts+ntv+nin==0 ) continue;
+
+                fprintf(bcftools_stdout, "QUAL\t%d\t",id);
+                if ( qval==-1 ) fprintf(bcftools_stdout, ".");
+                else fprintf(bcftools_stdout, "%.1f",qval);
+                fprintf(bcftools_stdout, "\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin);
              }
          }
      #endif
      for (i=0; i<args->nusr; i++)
      {
-        fprintf(bcftools_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
-            args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag);
+        fprintf(bcftools_stdout, "# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n",
+            args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx);
          for (id=0; id<args->nstats; id++)
          {
              user_stats_t *usr = &args->stats[id].usr[i];
@@ -1379,8 +1421,8 @@ static void print_stats(args_t *args)
              {
                  if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue;   // skip empty bins
                  float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1);
-                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n";
-                fprintf(bcftools_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
+                const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n";
+                fprintf(bcftools_stdout, fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]);
              }
          }
      }
@@ -1484,10 +1526,10 @@ static void print_stats(args_t *args)
                  fprintf(bcftools_stdout, "# NRD and discordance is calculated as follows:\n");
                  fprintf(bcftools_stdout, "#   m .. number of matches\n");
                  fprintf(bcftools_stdout, "#   x .. number of mismatches\n");
-                fprintf(bcftools_stdout, "#   NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
-                fprintf(bcftools_stdout, "#   RR discordance = xRR / (xRR + mRR)\n");
-                fprintf(bcftools_stdout, "#   RA discordance = xRA / (xRA + mRA)\n");
-                fprintf(bcftools_stdout, "#   AA discordance = xAA / (xAA + mAA)\n");
+                fprintf(bcftools_stdout, "#   NRD = 100 * (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n");
+                fprintf(bcftools_stdout, "#   RR discordance = 100 * xRR / (xRR + mRR)\n");
+                fprintf(bcftools_stdout, "#   RA discordance = 100 * xRA / (xRA + mRA)\n");
+                fprintf(bcftools_stdout, "#   AA discordance = 100 * xAA / (xAA + mAA)\n");
                  fprintf(bcftools_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
              }
              else
@@ -1706,28 +1748,29 @@ static void usage(void)
      fprintf(bcftools_stderr, "Usage:   bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "        --af-bins <list>               allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
-    fprintf(bcftools_stderr, "        --af-tag <string>              allele frequency tag to use, by default estimated from AN,AC or GT\n");
-    fprintf(bcftools_stderr, "    -1, --1st-allele-only              include only 1st allele at multiallelic sites\n");
-    fprintf(bcftools_stderr, "    -c, --collapse <string>            treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
-    fprintf(bcftools_stderr, "    -d, --depth <int,int,int>          depth distribution: min,max,bin size [0,500,1]\n");
-    fprintf(bcftools_stderr, "    -e, --exclude <expr>               exclude sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "    -E, --exons <file.gz>              tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
-    fprintf(bcftools_stderr, "    -f, --apply-filters <list>         require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
-    fprintf(bcftools_stderr, "    -F, --fasta-ref <file>             faidx indexed reference sequence file to determine INDEL context\n");
-    fprintf(bcftools_stderr, "    -i, --include <expr>               select sites for which the expression is true (see man page for details)\n");
-    fprintf(bcftools_stderr, "    -I, --split-by-ID                  collect stats for sites with ID separately (known vs novel)\n");
-    fprintf(bcftools_stderr, "    -r, --regions <region>             restrict to comma-separated list of regions\n");
-    fprintf(bcftools_stderr, "    -R, --regions-file <file>          restrict to regions listed in a file\n");
-    fprintf(bcftools_stderr, "    -s, --samples <list>               list of samples for sample stats, \"-\" to include all samples\n");
-    fprintf(bcftools_stderr, "    -S, --samples-file <file>          file of samples to include\n");
-    fprintf(bcftools_stderr, "    -t, --targets <region>             similar to -r but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "    -T, --targets-file <file>          similar to -R but streams rather than index-jumps\n");
-    fprintf(bcftools_stderr, "    -u, --user-tstv <TAG[:min:max:n]>  collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
-    fprintf(bcftools_stderr, "        --threads <int>                use multithreading with <int> worker threads [0]\n");
-    fprintf(bcftools_stderr, "    -v, --verbose                      produce verbose per-site and per-sample output\n");
+    fprintf(bcftools_stderr, "        --af-bins LIST               Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+    fprintf(bcftools_stderr, "        --af-tag STRING              Allele frequency tag to use, by default estimated from AN,AC or GT\n");
+    fprintf(bcftools_stderr, "    -1, --1st-allele-only            Include only 1st allele at multiallelic sites\n");
+    fprintf(bcftools_stderr, "    -c, --collapse STRING            Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+    fprintf(bcftools_stderr, "    -d, --depth INT,INT,INT          Depth distribution: min,max,bin size [0,500,1]\n");
+    fprintf(bcftools_stderr, "    -e, --exclude EXPR               Exclude sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "    -E, --exons FILE.gz              Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n");
+    fprintf(bcftools_stderr, "    -f, --apply-filters LIST         Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+    fprintf(bcftools_stderr, "    -F, --fasta-ref FILE             Faidx indexed reference sequence file to determine INDEL context\n");
+    fprintf(bcftools_stderr, "    -i, --include EXPR               Select sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "    -I, --split-by-ID                Collect stats for sites with ID separately (known vs novel)\n");
+    fprintf(bcftools_stderr, "    -r, --regions REGION             Restrict to comma-separated list of regions\n");
+    fprintf(bcftools_stderr, "    -R, --regions-file FILE          Restrict to regions listed in a file\n");
+    fprintf(bcftools_stderr, "    -s, --samples LIST               List of samples for sample stats, \"-\" to include all samples\n");
+    fprintf(bcftools_stderr, "    -S, --samples-file FILE          File of samples to include\n");
+    fprintf(bcftools_stderr, "    -t, --targets REGION             Similar to -r but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "    -T, --targets-file FILE          Similar to -R but streams rather than index-jumps\n");
+    fprintf(bcftools_stderr, "    -u, --user-tstv TAG[:min:max:n]  Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+    fprintf(bcftools_stderr, "                                       A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
+    fprintf(bcftools_stderr, "        --threads INT                Use multithreading with <int> worker threads [0]\n");
+    fprintf(bcftools_stderr, "    -v, --verbose                    Produce verbose per-site and per-sample output\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfstats(int argc, char *argv[])
@@ -1797,8 +1840,12 @@ int main_vcfstats(int argc, char *argv[])
              case 's': args->samples_list = optarg; break;
              case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
              case 'I': args->split_by_id = 1; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case  9 : args->n_threads = strtol(optarg, 0, 0); break;
              case 'h':
              case '?': usage(); break;
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c

index 4117d100d182bd87ff87fdf10d7ce1706b7f8a38..ce4c8108c7f3c48add6c1caacbd17f79430a7909 100644 (file)
--- a/bcftools/vcfview.c
+++ b/bcftools/vcfview.c
@@ -1,6 +1,6 @@
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -221,12 +221,10 @@ static void init_data(args_t *args)
      }
  
      // setup output
+    const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);   // base write mode: taken from a recognised output file name suffix, otherwise from the -O output type
      char modew[8];
-    strcpy(modew, "w");
+    strcpy(modew,tmp);
-    if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
-    if (args->output_type==FT_BCF) strcat(modew, "bu");         // uncompressed BCF
-    else if (args->output_type & FT_BCF) strcat(modew, "b");    // compressed BCF
-    else if (args->output_type & FT_GZ) strcat(modew,"z");      // compressed VCF
+    if (args->clevel >= 0 && args->clevel <= 9) snprintf(modew, sizeof(modew), "%c%d%s", tmp[0], args->clevel, tmp + 1);   // splice the level in after the first mode char; printing it at modew+1 would cut off the format suffix already copied from tmp
      args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
      if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
      if ( args->n_threads > 0)
@@ -501,7 +499,7 @@ static void usage(args_t *args)
      fprintf(stderr, "    -h/H, --header-only/--no-header     print the header only/suppress the header in VCF output\n");
      fprintf(stderr, "    -l,   --compression-level [0-9]     compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
      fprintf(stderr, "          --no-version                  do not append version and command line to the header\n");
-    fprintf(stderr, "    -o,   --output-file <file>          output file name [stdout]\n");
+    fprintf(stderr, "    -o,   --output <file>               output file name [stdout]\n");
      fprintf(stderr, "    -O,   --output-type <b|u|z|v>       b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
      fprintf(stderr, "    -r, --regions <region>              restrict to comma-separated list of regions\n");
      fprintf(stderr, "    -R, --regions-file <file>           restrict to regions listed in a file\n");
@@ -575,6 +573,7 @@ int main_vcfview(int argc, char *argv[])
          {"force-samples",no_argument,NULL,1},
          {"output-type",required_argument,NULL,'O'},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {"types",required_argument,NULL,'v'},
          {"exclude-types",required_argument,NULL,'V'},
          {"targets",required_argument,NULL,'t'},
@@ -639,9 +638,12 @@ int main_vcfview(int argc, char *argv[])
                  break;
              case 'v': args->include_types = optarg; break;
              case 'V': args->exclude_types = optarg; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
-
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'c':
              {
                  args->min_ac_type = ALLELE_NONREF;
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c

index 77643b7d86c965418f37c37a340f3a8a33a792d6..75b3e64719dc16b4644f226c8a5b258d06682df2 100644 (file)
--- a/bcftools/vcfview.c.pysam.c
+++ b/bcftools/vcfview.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2018 Genome Research Ltd.
+    Copyright (C) 2013-2021 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -166,7 +166,7 @@ static void init_data(args_t *args)
      if (args->include_types || args->exclude_types) {
          if (args->include_types && args->exclude_types) {
              fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n");
-            exit(1);
+            bcftools_exit(1);
          }
          char **type_list = 0;
          int m = 0, n = 0;
@@ -197,7 +197,7 @@ static void init_data(args_t *args)
                  else {
                      fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
                      fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
-                    exit(1);
+                    bcftools_exit(1);
                  }
              }
          }
@@ -213,7 +213,7 @@ static void init_data(args_t *args)
                  else {
                      fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]);
                      fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n");
-                    exit(1);
+                    bcftools_exit(1);
                  }
              }
          }
@@ -223,12 +223,10 @@ static void init_data(args_t *args)
      }
  
      // setup output
+    const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);   // base write mode: taken from a recognised output file name suffix, otherwise from the -O output type
      char modew[8];
-    strcpy(modew, "w");
+    strcpy(modew,tmp);
-    if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
-    if (args->output_type==FT_BCF) strcat(modew, "bu");         // uncompressed BCF
-    else if (args->output_type & FT_BCF) strcat(modew, "b");    // compressed BCF
-    else if (args->output_type & FT_GZ) strcat(modew,"z");      // compressed VCF
+    if (args->clevel >= 0 && args->clevel <= 9) snprintf(modew, sizeof(modew), "%c%d%s", tmp[0], args->clevel, tmp + 1);   // splice the level in after the first mode char; printing it at modew+1 would cut off the format suffix already copied from tmp
      args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
      if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
      if ( args->n_threads > 0)
@@ -302,7 +300,7 @@ int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line)
                  case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_vector_end); break;
                  case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
                  case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
-                default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
+                default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); bcftools_exit(1); break;
              }
              #undef BRANCH_INT
              if (!sample_phased) {
@@ -503,7 +501,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -h/H, --header-only/--no-header     print the header only/suppress the header in VCF output\n");
      fprintf(bcftools_stderr, "    -l,   --compression-level [0-9]     compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
      fprintf(bcftools_stderr, "          --no-version                  do not append version and command line to the header\n");
-    fprintf(bcftools_stderr, "    -o,   --output-file <file>          output file name [bcftools_stdout]\n");
+    fprintf(bcftools_stderr, "    -o,   --output <file>               output file name [bcftools_stdout]\n");
      fprintf(bcftools_stderr, "    -O,   --output-type <b|u|z|v>       b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
      fprintf(bcftools_stderr, "    -r, --regions <region>              restrict to comma-separated list of regions\n");
      fprintf(bcftools_stderr, "    -R, --regions-file <file>           restrict to regions listed in a file\n");
@@ -533,7 +531,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "    -v/V, --types/--exclude-types <list>        select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
      fprintf(bcftools_stderr, "    -x/X, --private/--exclude-private           select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
      fprintf(bcftools_stderr, "\n");
-    exit(1);
+    bcftools_exit(1);
  }
  
  int main_vcfview(int argc, char *argv[])
@@ -577,6 +575,7 @@ int main_vcfview(int argc, char *argv[])
          {"force-samples",no_argument,NULL,1},
          {"output-type",required_argument,NULL,'O'},
          {"output-file",required_argument,NULL,'o'},
+        {"output",required_argument,NULL,'o'},
          {"types",required_argument,NULL,'v'},
          {"exclude-types",required_argument,NULL,'V'},
          {"targets",required_argument,NULL,'t'},
@@ -641,9 +640,12 @@ int main_vcfview(int argc, char *argv[])
                  break;
              case 'v': args->include_types = optarg; break;
              case 'V': args->exclude_types = optarg; break;
-            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
-            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
-
+            case 'e':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i':
+                if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
+                args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
              case 'c':
              {
                  args->min_ac_type = ALLELE_NONREF;
diff --git a/bcftools/vcmp.c b/bcftools/vcmp.c

index 7d3b0f9ea81518217168b97709e3c8b0db513c3d..dbdc4b7ac7404d52da1d166919fac5c81bafb4de 100644 (file)
--- a/bcftools/vcmp.c
+++ b/bcftools/vcmp.c
@@ -1,6 +1,6 @@
  /*  vcmp.c -- reference allele utility functions.
  
-    Copyright (C) 2013 Genome Research Ltd.
+    Copyright (C) 2013-2015, 2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/vcmp.c.pysam.c b/bcftools/vcmp.c.pysam.c

index 00435bd744cf6b8892dfcd88f97f8f192930f6a0..18a6813d0b81ec7089bd68ded7124d3ab2e06ec3 100644 (file)
--- a/bcftools/vcmp.c.pysam.c
+++ b/bcftools/vcmp.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcmp.c -- reference allele utility functions.
  
-    Copyright (C) 2013 Genome Research Ltd.
+    Copyright (C) 2013-2015, 2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/vcmp.h b/bcftools/vcmp.h

index 9c6370ce206e98e28af7b42e3a668988b74a6ae9..03234b4b2008209e1ff8c6f80e6b2368b3fcde49 100644 (file)
--- a/bcftools/vcmp.h
+++ b/bcftools/vcmp.h
@@ -1,6 +1,6 @@
  /*  vcmp.h -- reference allele utility functions.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2015 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/version.c b/bcftools/version.c

index 19cec91ac8157a93c80f2f2373c936843d9d146d..d06889726da2a1655996c2d27ce9ecad0ea1a3c5 100644 (file)
--- a/bcftools/version.c
+++ b/bcftools/version.c
@@ -1,6 +1,6 @@
  /*  version.c -- report version numbers for plugins.
  
-    Copyright (C) 2014 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <stdarg.h>
  #include <stdlib.h>
  #include <stdio.h>
+#include <strings.h>
  #include <errno.h>
  #include <htslib/hts.h>
  #include "bcftools.h"
@@ -60,7 +61,6 @@ void error_errno(const char *format, ...)
      exit(-1);
  }
  
-
  const char *hts_bcf_wmode(int file_type)
  {
      if ( file_type == FT_BCF ) return "wbu";    // uncompressed BCF
@@ -69,4 +69,14 @@ const char *hts_bcf_wmode(int file_type)
      return "w";                                 // uncompressed VCF
  }
  
+const char *hts_bcf_wmode2(int file_type, char *fname)   // choose the write mode: a recognised fname suffix takes precedence over file_type
+{
+    if ( !fname ) return hts_bcf_wmode(file_type);   // no file name given: file_type alone decides
+    int len = strlen(fname);
+    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);   // suffix tests are case-insensitive
+    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
+    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);   // .vcf.bgz is handled the same as .vcf.gz
+    return hts_bcf_wmode(file_type);   // unrecognised suffix: fall back to file_type
+}
  
diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c

index 01dad071e33f22e94b001267c0b58156e52a1b36..37fa8289679c80fa27338c1a2eafcc8189e9055d 100644 (file)
--- a/bcftools/version.c.pysam.c
+++ b/bcftools/version.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  version.c -- report version numbers for plugins.
  
-    Copyright (C) 2014 Genome Research Ltd.
+    Copyright (C) 2014-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <stdarg.h>
  #include <stdlib.h>
  #include <stdio.h>
+#include <strings.h>
  #include <errno.h>
  #include <htslib/hts.h>
  #include "bcftools.h"
@@ -44,7 +45,7 @@ void error(const char *format, ...)
      va_start(ap, format);
      vfprintf(bcftools_stderr, format, ap);
      va_end(ap);
-    exit(-1);
+    bcftools_exit(-1);
  }
  
  void error_errno(const char *format, ...)
@@ -59,10 +60,9 @@ void error_errno(const char *format, ...)
      } else {
          fprintf(bcftools_stderr, "\n");
      }
-    exit(-1);
+    bcftools_exit(-1);
  }
  
-
  const char *hts_bcf_wmode(int file_type)
  {
      if ( file_type == FT_BCF ) return "wbu";    // uncompressed BCF
@@ -71,4 +71,14 @@ const char *hts_bcf_wmode(int file_type)
      return "w";                                 // uncompressed VCF
  }
  
+const char *hts_bcf_wmode2(int file_type, char *fname)   // choose the write mode: a recognised fname suffix takes precedence over file_type
+{
+    if ( !fname ) return hts_bcf_wmode(file_type);   // no file name given: file_type alone decides
+    int len = strlen(fname);
+    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ);   // suffix tests are case-insensitive
+    if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF);
+    if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ);
+    if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ);   // .vcf.bgz is handled the same as .vcf.gz
+    return hts_bcf_wmode(file_type);   // unrecognised suffix: fall back to file_type
+}
  
diff --git a/bcftools/version.sh b/bcftools/version.sh

index 7232440c637116238b1ba55f31cc94dbfd1daa5e..52b1e08b784960104e39d1972b082b4a7e3b4ef7 100755 (executable)
--- a/bcftools/version.sh
+++ b/bcftools/version.sh
@@ -1,7 +1,30 @@
  #!/bin/sh
+# version.sh 
+#
+#     Author : Petr Danecek <pd3@sanger.ac.uk>
+#
+#     Copyright (C) 2018-2021 Genome Research Ltd.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
  
  # Master version, for use in tarballs or non-git source copies
-VERSION=1.10.2
+VERSION=1.13
  
  # If we have a git clone, then check against the current tag
  if [ -e .git ]
diff --git a/cy_build.py b/cy_build.py

index fae705557abb73c77767b40d3da6938af7488a2d..aff41a034d4e8ec471341b1fc9dce2a485f7ae9b 100644 (file)
--- a/cy_build.py
+++ b/cy_build.py
@@ -31,6 +31,7 @@ def is_pip_install():
  class CyExtension(Extension):
      def __init__(self, *args, **kwargs):
          self._init_func = kwargs.pop("init_func", None)
+        self._prebuild_func = kwargs.pop("prebuild_func", None)  # optional hook; cy_build_ext calls it as func(ext, force) right before building the extension
          Extension.__init__(self, *args, **kwargs)
  
      def extend_includes(self, includes):
@@ -82,5 +83,8 @@ class cy_build_ext(build_ext):
                  ext.extra_link_args = []
  
              ext.extra_link_args += ['-Wl,-rpath,$ORIGIN']
-                                    
+
+        if isinstance(ext, CyExtension) and ext._prebuild_func:
+            ext._prebuild_func(ext, self.force)
+
          build_ext.build_extension(self, ext)
diff --git a/devtools/import.py b/devtools/import.py

index f54138bf82307e1186e10a96d61c28c959a04619..ea357923b0b1ab6c70f6079395c3eb65656826f5 100644 (file)
--- a/devtools/import.py
+++ b/devtools/import.py
@@ -40,7 +40,7 @@ EXCLUDE = {
      "htslib": (
          'htslib/tabix.c', 'htslib/bgzip.c',
          'htslib/htsfile.c',
-        "test"),
+        "test", "tests"),
  }
  
  
@@ -93,6 +93,10 @@ def _update_pysam_files(cf, destdir):
                  else:
                      lines = re.sub(r"int main\(", "int {}_{}_main(".format(
                          basename, subname), lines)
+                if basename == "samtools":
+                    lines = re.sub(r"main_(reheader)\(",
+                                   r"samtools_main_\1(", lines)
+                lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines)
                  lines = re.sub("stderr", "{}_stderr".format(basename), lines)
                  lines = re.sub("stdout", "{}_stdout".format(basename), lines)
                  lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
@@ -103,9 +107,6 @@ def _update_pysam_files(cf, destdir):
                  fn = os.path.basename(filename)
                  # some specific fixes:
                  SPECIFIC_SUBSTITUTIONS = {
-                    "bamtk.c": (
-                        'else if (strcmp(argv[1], "tview") == 0)',
-                        '//else if (strcmp(argv[1], "tview") == 0)'),
                      "bam_md.c": (
                          'sam_open_format("-", mode_w',
                          'sam_open_format({}_stdout_fn, mode_w'.format(basename)),
@@ -120,6 +121,10 @@ def _update_pysam_files(cf, destdir):
                      lines = lines.replace(
                          SPECIFIC_SUBSTITUTIONS[fn][0],
                          SPECIFIC_SUBSTITUTIONS[fn][1])
+                if fn == "bamtk.c":
+                    lines = re.sub(r'(#include "version.h")', r'\1\n#include "samtools_config_vars.h"', lines)
+                    lines = re.sub(r'(else if.*"tview")', r'//\1', lines)
+
                  outfile.write(lines)
  
      with open(os.path.join("import", "pysam.h")) as inf, \
@@ -224,9 +229,25 @@ if len(sys.argv) >= 1:
                      outf.write(line)
          os.rename(tmpfilename, filename)
  
+    def _update_version_doc_file(dest, value, filename):
+        tmpfilename = filename + ".tmp"
+        with open(filename, encoding="utf-8") as inf:
+            with open(tmpfilename, "w", encoding="utf-8") as outf:
+                for line in inf:
+                    if " wraps " in line:
+                        # hide the sentence's fullstop from the main regexp
+                        line = re.sub(r'\.$', ',DOT', line)
+                        line = re.sub(r'{}-[^*,]*'.format(dest),
+                                      '{}-{}'.format(dest, value), line)
+                        line = re.sub(',DOT', '.', line)
+                    outf.write(line)
+        os.rename(tmpfilename, filename)
+
      version = _getVersion(srcdir)
      _update_version_file("__{}_version__".format(dest), version, "pysam/version.py")
      _update_version_file(C_VERSION[dest], version + " (pysam)", "pysam/version.h")
+    _update_version_doc_file(dest, version, "README.rst")
+    _update_version_doc_file(dest, version, "doc/index.rst")
  
      sys.exit(0)
  
diff --git a/devtools/install-CGAT-tools.sh b/devtools/install-CGAT-tools.sh

index 27eb481a354d95a88911b3ec0e1348da027755c3..e45d39159934cfdd91c8fd541bfa8f4df036a136 100755 (executable)
--- a/devtools/install-CGAT-tools.sh
+++ b/devtools/install-CGAT-tools.sh
@@ -80,7 +80,7 @@ else
  fi # if-OS
  } # install_os_packages
  
-# funcion to install Python dependencies
+# function to install Python dependencies
  install_python_deps() {
  
  if [ "$OS" == "ubuntu" -o "$OS" == "sl" ] ; then
@@ -185,12 +185,13 @@ python setup.py install
  # problems in the compilation test.
  cd tests
  
-# create auxilliary data
+# create auxiliary data
  echo
  echo 'building test data'
  echo 
  make -C pysam_data all
  make -C cbcf_data all
+make -C tabix_data all
  
  # run nosetests
  # -s: do not capture stdout, conflicts with pysam.dispatch
diff --git a/devtools/run_tests_travis.sh b/devtools/run_tests_travis.sh

index 9ad41a709c078ce76f49b51eaabbc5c405ca3446..1f14fc34d527f6e352f23e427696390b2a00d222 100755 (executable)
--- a/devtools/run_tests_travis.sh
+++ b/devtools/run_tests_travis.sh
@@ -37,8 +37,8 @@ conda config --add channels conda-forge
  
  # pin versions, so that tests do not fail when pysam/htslib out of step
  # add htslib dependencies
-# NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488
-conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses
+# NB: force conda-forge:blas due to conda/conda#7548
+conda install -y "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" xz curl bzip2 "conda-forge::blas=*=openblas"
  
  # As HTSLIB_MODE is (defaulted to) 'shared', ensure we don't pick up
  # the external headers from the Conda-installed htslib package.
@@ -60,12 +60,13 @@ echo "============ installing via setup.py from repository ============"
  echo
  python setup.py install || exit
  
-# create auxilliary data
+# create auxiliary data
  echo
  echo 'building test data'
  echo
  make -C tests/pysam_data
  make -C tests/cbcf_data
+make -C tests/tabix_data
  
  # echo any limits that are in place
  ulimit -a
diff --git a/doc/api.rst b/doc/api.rst

index 3f2c042deefc071e6757a7b521afae1574b52718..6246c353a6ca789776ad127792604c21905cc936 100644 (file)
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -1,7 +1,4 @@
-======================================================
-pysam - An interface for reading and writing SAM files
-======================================================
-
+============
  Introduction
  ============
  
@@ -24,7 +21,7 @@ iteration returns a :class:`~pysam.AlignedSegment` object which
  represents a single read along with its fields and optional tags::
  
     for read in samfile.fetch('chr1', 100, 120):
-       print read
+       print read
  
     samfile.close()
  
@@ -41,8 +38,8 @@ You can also write to a :class:`~pysam.AlignmentFile`::
     samfile = pysam.AlignmentFile("ex1.bam", "rb")
     pairedreads = pysam.AlignmentFile("allpaired.bam", "wb", template=samfile)
     for read in samfile.fetch():
-       if read.is_paired:
-               pairedreads.write(read)
+       if read.is_paired:
+           pairedreads.write(read)
  
     pairedreads.close()
     samfile.close()
@@ -130,11 +127,12 @@ More detailed usage instructions is at :ref:`usage`.
  
         The pysam website containing documentation
  
+===
  API
  ===
  
  SAM/BAM/CRAM files
--------------------
+==================
  
  Objects of type :class:`~pysam.AlignmentFile` allow working with
  BAM/SAM formatted files.
@@ -162,7 +160,7 @@ a SAM/BAM file.
  
  
  Tabix files
------------
+===========
  
  :class:`~pysam.TabixFile` opens tabular files that have been
  indexed with tabix_.
@@ -191,14 +189,14 @@ To iterate over tabix files, use :func:`~pysam.tabix_iterator`:
     :members:
  
  
-Fasta files
------------
+FASTA files
+===========
  
  .. autoclass:: pysam.FastaFile
     :members:
  
-Fastq files
------------
+FASTQ files
+===========
  
  .. autoclass:: pysam.FastxFile
     :members:
@@ -208,8 +206,8 @@ Fastq files
     :members:
  
  
-VCF files
----------
+VCF/BCF files
+=============
  
  .. autoclass:: pysam.VariantFile
     :members:
@@ -224,7 +222,7 @@ VCF files
     :members:
  
  HTSFile
--------
+=======
  
  HTSFile is the base class for :class:`pysam.AlignmentFile` and
  :class:`pysam.VariantFile`.
diff --git a/doc/benchmarking.rst b/doc/benchmarking.rst

index 1ec0d43c59c7c2307ee07b4de46720a47b083201..8fc054ae8c9e76d0d70eec723a74c1be7d5e3944 100644 (file)
--- a/doc/benchmarking.rst
+++ b/doc/benchmarking.rst
@@ -1,3 +1,5 @@
+.. _Benchmarking:
+
  ============
  Benchmarking
  ============
diff --git a/doc/conf.py b/doc/conf.py

index 375aa559ffd39b7f90cddade0dde9181ed4af699..39b6f45b01e2bdfd59d3064c3805dc932e82e1d3 100644 (file)
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -11,13 +11,13 @@
  # All configuration values have a default; values that are commented out
  # serve to show the default.
  
-import sys, os, glob
+import sys, os, sysconfig
  
  # If extensions (or modules to document with autodoc) are in another directory,
  # add these directories to sys.path here. If the directory is relative to the
  # documentation root, use os.path.abspath to make it absolute, like shown here.
-_libdir = "../build/lib.%s-%s-%s.%s" % (os.uname()[0].lower(), os.uname()[4],
-                                        sys.version_info[0], sys.version_info[1])
+_pyversion = sysconfig.get_python_version()
+_libdir = "../build/lib.%s-%s" % (sysconfig.get_platform(), _pyversion)
  if os.path.exists(_libdir):
      sys.path.insert(0, os.path.abspath(_libdir))
  
@@ -32,7 +32,7 @@ extensions = ['sphinx.ext.autodoc',
                'sphinx.ext.intersphinx',
                'sphinx.ext.napoleon']
  
-intersphinx_mapping = {'python': ('http://docs.python.org/3.5', None)}
+intersphinx_mapping = {'python': ('https://docs.python.org/%s' % _pyversion, None)}
  
  # Add any paths that contain templates here, relative to this directory.
  templates_path = ['_templates']
@@ -48,7 +48,7 @@ master_doc = 'index'
  
  # General information about the project.
  project = u'pysam'
-copyright = u'2009, Andreas Heger, Kevin Jacobs et al.'
+copyright = u'2009–2021, Andreas Heger, Kevin Jacobs, et al'
  
  # Included at the end of each rst file
  rst_epilog = '''
@@ -61,6 +61,8 @@ rst_epilog = '''
  .. _Galaxy: https://main.g2.bx.psu.edu/
  .. _cython: http://cython.org/
  .. _python: http://python.org/
+.. _pypi: https://pypi.org/
+.. _pip: https://pip.pypa.io/
  .. _pyximport: http://www.prescod.net/pyximport/
  .. _conda: https://conda.io/docs/
  .. _bioconda: https://bioconda.github.io/
@@ -201,8 +203,8 @@ htmlhelp_basename = 'samtoolsdoc'
  # Grouping the document tree into LaTeX files. List of tuples
  # (source start file, target name, title, author, documentclass [howto/manual]).
  latex_documents = [
-    ('index', 'pysam.tex', ur'pysam documentation',
-     ur'Andreas Heger, Kevin Jacobs et al.', 'manual'),
+    ('index', 'pysam.tex', u'pysam documentation',
+     u'Andreas Heger, Kevin Jacobs, et al.', 'manual'),
  ]
  
  # The name of an image file (relative to this directory) to place at the top of
diff --git a/doc/developer.rst b/doc/developer.rst

index 09ae832e1f9f6b639d66f6a48658188f20cf0483..ca49fdc9312928b3015f5d0212e8b181f232e19c 100644 (file)
--- a/doc/developer.rst
+++ b/doc/developer.rst
@@ -12,7 +12,7 @@ directories:
     Code specific to pysam
  
  :file:`doc`
-   The documentation. To build the latest documention type::
+   The documentation. To build the latest documentation type::
  
         make -C doc html
  
@@ -46,6 +46,17 @@ run::
  
     pytest tests
  
+Most tests use test data from the :file:`tests/*_data` directories.
+Some of these test data files are generated from other files in these
+directories, which is done by running ``make`` in each directory::
+
+   make -C tests/pysam_data
+   # etc
+
+Alternatively, if any :file:`tests/*_data/all.stamp` file is not already
+present, running the unit tests should generate that directory's data
+files automatically.
+
  Benchmarking
  ============
  
diff --git a/doc/faq.rst b/doc/faq.rst

index 62fe11dd3f4b929834f0aacf68902f910fa8d5c1..fc39b6065fbcc7838244416247ce3b7ca4d54dc8 100644 (file)
--- a/doc/faq.rst
+++ b/doc/faq.rst
@@ -5,17 +5,18 @@ FAQ
  How should I cite pysam
  =======================
  
-Pysam has not been published in print. When refering pysam, please
+Pysam has not been published in print. When referring to pysam, please
  use the github URL: https://github.com/pysam-developers/pysam. 
  As pysam is a wrapper around htslib and the samtools package, I
-suggest cite `Li et al (2009) <http://www.ncbi.nlm.nih.gov/pubmed/19505943>`.
+suggest citing [Li.2009]_, [Bonfield.2021]_, and/or [Danecek.2021]_,
+as appropriate.
  
  Is pysam thread-safe?
  =====================
  
  Pysam is a mix of python and C code. Instructions within python are
  generally made thread-safe through python's `global interpreter lock`_
-(GIL_). This ensures that python data structures will always be in a
+(:dfn:`GIL`). This ensures that python data structures will always be in a
  consistent state. 
  
  If an external function outside python is called, the programmer has a
@@ -28,7 +29,7 @@ Alternatively, the GIL can be released while the external function is
  called. This will allow other threads to run concurrently. This can be
  beneficial if the external function is expected to halt, for example
  when waiting for data to read or write. However, to achieve
-thread-safety, the external function needs to implememented with
+thread-safety, the external function needs to be implemented with
  thread-safety in mind. This means that there can be no shared state
  between threads, or if there is shared, it needs to be controlled to
  prevent any access conflicts.
@@ -38,7 +39,7 @@ I/O intensive tasks. This is generally fine, but thread-safety of all
  parts have not been fully tested. 
  
  A related issue is when different threads read from the same file
-objec - or the same thread uses two iterators over a file. There is
+object - or the same thread uses two iterators over a file. There is
  only a single file-position for each opened file. To prevent this from
  hapeding, use the option ``multiple_iterator=True`` when calling
  a fetch() method. This will return an iterator on a newly opened
@@ -141,7 +142,7 @@ I can't call AlignmentFile.fetch on a file without index
  
  :meth:`~pysam.AlignmentFile.fetch` requires an index when
  iterating over a SAM/BAM file. To iterate over a file without
-index, use the ``until_eof=True`::
+index, use the ``until_eof=True``::
  
      bf = pysam.AlignmentFile(fname, "rb")
      for r in bf.fetch(until_eof=True):
diff --git a/doc/glossary.rst b/doc/glossary.rst

index 4e9fa57e664337f87d42e911d723a1e9cce58f10..03892701339e2d80f43f6e1cd724150e64121c5b 100644 (file)
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -48,6 +48,11 @@ Glossary
         Binary SAM format. BAM files are binary formatted, indexed and
         allow random access.
  
+   CRAM
+       CRAM is a binary format representing the same sequence alignment
+       information as SAM and BAM, but offering significantly better
+       lossless compression than BAM.
+
     TAM
         Text SAM file. TAM files are human readable files of
         tab-separated fields. TAM files do not allow random access.
@@ -106,6 +111,14 @@ Glossary
     BCF
        Binary :term:`VCF`
  
+   FASTA
+      Simple text format containing sequence data, with only the bare
+      minimum of metadata. Typically used for reference sequence data.
+
+   FASTQ
+      Simple text format containing sequence data and associated base
+      qualities.
+
     tabix
        Utility in the htslib package to index :term:`bgzip` compressed
        files.
diff --git a/doc/index.rst b/doc/index.rst

index 4e18b7627ca3e288ac59659aec69d34e565e5a87..15de2ca66c9bee8dcc7fa7a4a11f130c2fc406da 100644 (file)
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,8 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as
  using cython and a high-level, pythonic API for convenient access to
  the data within genomic file formats. 
  
-The current version wraps *htslib-1.10.2*, *samtools-1.10* and
-*bcftools-1.10.2*.
+The current version wraps *htslib-1.13*, *samtools-1.13*, and *bcftools-1.13*.
  
  To install the latest release, type::
  
@@ -54,9 +53,21 @@ Contents:
  References
  ----------
  
-.. [Li2009] The Sequence Alignment/Map format and SAMtools. Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup.
-           Bioinformatics. 2009 Aug 15;25(16):2078-9. Epub 2009 Jun 8.
-           `PMID: 19505943 <http://www.ncbi.nlm.nih.gov/pubmed/19505943?dopt=Abstract>`_
+.. [Li.2009] *The Sequence Alignment/Map format and SAMtools.*
+   Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup.
+   Bioinformatics. 2009 Aug 15;25(16):2078-9.
+   Epub 2009 Jun 8 `btp352 <https://doi.org/10.1093/bioinformatics/btp352>`_.
+   PMID: `19505943 <https://pubmed.ncbi.nlm.nih.gov/19505943>`_.
+
+.. [Bonfield.2021] *HTSlib: C library for reading/writing high-throughput sequencing data.*
+   Bonfield JK, Marshall J, Danecek P, Li H, Ohan V, Whitwham A, Keane T, Davies RM.
+   GigaScience (2021) 10(2) `giab007 <https://doi.org/10.1093/gigascience/giab007>`_.
+   PMID: `33594436 <https://pubmed.ncbi.nlm.nih.gov/33594436>`_.
+
+.. [Danecek.2021] *Twelve years of SAMtools and BCFtools.*
+   Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H.
+   GigaScience (2021) 10(2) `giab008 <https://doi.org/10.1093/gigascience/giab008>`_.
+   PMID: `33590861 <https://pubmed.ncbi.nlm.nih.gov/33590861>`_.
  
  .. seealso::
   
diff --git a/doc/installation.rst b/doc/installation.rst

index 535f4bce376c95d9fa56cac4ba11a3c4a3627a1e..a286c2733f44e353c7fd90d70fdf5d3e84b3c919 100644 (file)
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -46,7 +46,7 @@ features. If these fail, for example due to missing library
  dependencies (`libcurl`, `libcrypto`), it will fall back to
  conservative defaults.
  
-Options can be passed to the configure script explicitely by
+Options can be passed to the configure script explicitly by
  setting the environment variable `HTSLIB_CONFIGURE_OPTIONS`.
  For example::
  
diff --git a/doc/release.rst b/doc/release.rst

index 06c602bea8fdd9fee860dfe98e1398b47e6694b3..966ee6ab67764ee767258e543833e8df009b4726 100644 (file)
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,48 @@
  Release notes
  =============
  
+Release 0.17.0
+==============
+
+This release wraps htslib/samtools/bcftools version 1.13. Corresponding
+to new samtools commands, `pysam.samtools` now has additional functions
+`ampliconclip`, `ampliconstats`, `fqimport`, and `version`.
+
+Bugs fixed:
+
+* [#447] The maximum QNAME length is fully restored to 254
+* [#506, #958, #1000] Don't crash the Python interpreter on ``pysam.bcftools.*()`` errors
+* [#603] count_coverage: ignore reads that have no SEQ field
+* [#928] Fix ``pysam.bcftools.mpileup()`` segmentation fault
+* [#983] Add win32/\*.[ch] to MANIFEST.in
+* [#994] Raise exception in ``get_tid()`` if header could not be parsed
+* [#995] Choose TBI/CSI in ``tabix_index()`` via both min_shift and csi
+* [#996] ``AlignmentFile.fetch()`` now works with large chromosomes longer than 2\ :sup:`29` bases
+* [#1019] Fix Sphinx documentation generation by avoiding Python 2 ``ur'string'`` syntax
+* [#1035] Improved handling of file iteration errors
+* [#1038] ``tabix_index()`` no longer leaks file descriptors
+* [#1040] ``print(aligned_segment)`` now prints the correct TLEN value
+  (it also now prints RNAME/RNEXT more clearly and prints POS/PNEXT 1-based)
+* *setup.py* no longer uses ``setup(use_2to3)`` for compatibility with setuptools >= v58.0.0
+
+New facilities:
+
+* [PR #963] Additional VCF classes are exposed to pysam programmers
+* [#998, PR #1001] Add ``get/set_encoding_error_handler()`` to control UTF-8 conversion
+* [PR #1012] Running ``python setup.py sdist`` now automatically runs cythonize
+* Running tests with ``pytest`` now automatically runs ``make`` to generate test data
+
+Documentation improvements:
+
+* [#726] Clarify get_forward_sequence/get_forward_qualities documentation
+* [#865] Improved example
+* [#968] ``get_index_statistics`` parameters
+* [#986] Clarify ``VariantFile.fetch`` start/stop region parameters are 0-based and half-open.
+* [#990] Corrected ``PileupColumn.get_query_sequences`` documentation
+* [#999] Fix documentation for ``AlignmentFile.get_reference_length()``
+* [#1002] Document the default min_base_quality for ``pileup()``
+
+
  Release 0.16.0
  ==============
  
@@ -149,7 +191,7 @@ Backwards incompatible changes:
  
    The rationale for this change is to have consistency between
    AlignmentFile and VariantFile.
-             
+
  * AlignmentFile and FastaFile now raise IOError instead of OSError
  
  Medium term we plan to have a 1.0 release. The pysam
@@ -190,6 +232,7 @@ contains a series of bugfixes.
  * [#473] A new FastxRecord class that can be instantiated from class and
    modified in-place. Replaces PersistentFastqProxy.
  * [#521] In AligmentFile, Simplify file detection logic and allow remote index files
+
    * Removed attempts to guess data and index file names; this is magic left
      to htslib.
    * Removed file existence check prior to opening files with htslib
@@ -200,6 +243,7 @@ contains a series of bugfixes.
    * Allow remote indices (tested using S3 signed URLs).
    * Document filepath_index and make it an alias for index_filename.
    * Added a require_index parameter to AlignmentFile
+
  * [#526] handle unset ref when creating new records
  * [#513] fix bcf_translate to skip deleted FORMAT fields to avoid
    segfaults
@@ -225,7 +269,7 @@ are created will need to change as the constructor requires a header::
      header = pysam.AlignmentHeader(
          reference_names=["chr1", "chr2"],
          reference_lengths=[1000, 1000])
-        
+
      read = pysam.AlignedSegment(header)
  
  This will affect all code that instantiates AlignedSegment objects
@@ -252,7 +296,7 @@ Release 0.11.2
  ==============
  
  This release wraps htslib/samtools/bcfools versions 1.4.1 in response
-to a security fix in these libraries. Additionaly the following
+to a security fix in these libraries. Additionally the following
  issues have been fixed:
  
  * [#452] add GFF3 support for tabix parsers
@@ -373,7 +417,7 @@ Overview
  --------
  
  The 0.9.0 release upgrades htslib to htslib 1.3 and numerous other
-enchancements and bugfixes. See below for a detailed list.
+enhancements and bugfixes. See below for a detailed list.
  
  `Htslib 1.3 <https://github.com/samtools/htslib/releases/tag/1.3>`_
  comes with additional capabilities for remote file access which depend
@@ -416,7 +460,7 @@ Detailed release notes
       and code bloat.
     * run configure for the builtin htslib library in order to detect
       optional libraries such as libcurl. Configure behaviour can be
-     controlled by setting the environmet variable
+     controlled by setting the environment variable
       HTSLIB_CONFIGURE_OPTIONS.
  * get_reference_sequence() now returns the reference sequence and not
    something looking like it. This bug had effects on
@@ -440,15 +484,17 @@ Potential isses when upgrading from v0.8.3:
  
  * renamed several methods for pep8 compatibility, old names still retained for 
    backwards compatibility, but should be considered deprecated.
+
     * gettid() is now get_tid()
     * getrname() is now get_reference_name()
     * parseRegion() is now parse_region()
  
  * some methods have changed for pep8 compatibility without the old
    names being present:
+
     * fromQualityString() is now qualitystring_to_array()
     * toQualityString() is now qualities_to_qualitystring()
-   
+
  * faidx now returns strings and not binary strings in py3.
  
  * The cython components have been broken up into smaller files with
@@ -557,7 +603,7 @@ Release 0.8.2
    with reading and writing capability. However, the interface is still
    incomplete and preliminary and lacks capability to mutate the
    resulting data.
-  
+
  Release 0.8.1
  =============
  
@@ -569,7 +615,7 @@ Release 0.8.1
    * issue #19: multiple iterators can now be made to work on the same tabix file
    * issue #24: All strings returned from/passed to the pysam API are now unicode in python 3
    * issue #5:  type guessing for lists of integers fixed    
-    
+
  * API changes for consistency. The old API is still present,
    but deprecated.
    In particular:
@@ -619,7 +665,7 @@ Other changes:
  
  Backwards incompatible changes
  
-* Empty cigarstring now returns None (intstead of '')
+* Empty cigarstring now returns None (instead of '')
  * Empty cigar now returns None (instead of [])
  * When using the extension classes in cython modules, AlignedRead
    needs to be substituted with AlignedSegment. 
@@ -686,18 +732,18 @@ Release 0.7.5
  
  Release 0.7.4
  =============
-       
+
  * further bugfixes to setup.py and package layout
  
  Release 0.7.3
  =============
-       
+
  * further bugfixes to setup.py
  * upgraded distribute_setup.py to 0.6.34
  
  Release 0.7.2
  =============
-  
+
  * bugfix in installer - failed when cython not present
  * changed installation locations of shared libraries
  
diff --git a/doc/usage.rst b/doc/usage.rst

index f4b7498070c1ad48d8553f1e3148392b7f17db07..fc4f2bb97a277590c4daced4899c57881c8e3f4b 100644 (file)
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -269,7 +269,8 @@ simple variant attributes such as :class:`~pysam.VariantRecord.contig`,
         print (rec.pos)
  
  but also to complex attributes such as the contents to the
-:term:`info`, :term:`format` and :term:`genotype` columns. These
+:class:`~pysam.VariantRecord.info`, :class:`~pysam.VariantRecord.format`
+and :term:`genotype` columns. These
  complex attributes are views on the underlying htslib data structures
  and provide dictionary-like access to the data::
  
diff --git a/import/pysam.c b/import/pysam.c

index 56926222a882c97bc9a178415df746c599f8fc72..2a81e4d0eb605fd6860c2b59f1349bec5a3768f8 100644 (file)
--- a/import/pysam.c
+++ b/import/pysam.c
@@ -1,6 +1,7 @@
  #include <ctype.h>
  #include <assert.h>
  #include <unistd.h>
+#include <setjmp.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
@@ -55,6 +56,25 @@ int @pysam@_puts(const char *s)
    return putc('\n', @pysam@_stdout);
  }
  
+
+static jmp_buf @pysam@_jmpbuf;
+static int @pysam@_status = 0;
+
+int @pysam@_dispatch(int argc, char *argv[])
+{
+  if (setjmp(@pysam@_jmpbuf) == 0)
+    return @pysam@_main(argc, argv);
+  else
+    return @pysam@_status;
+}
+
+void @pysam@_exit(int status)
+{
+  @pysam@_status = status;
+  longjmp(@pysam@_jmpbuf, 1);
+}
+
+
  void @pysam@_set_optind(int val)
  {
    // setting this in cython via 
diff --git a/import/pysam.h b/import/pysam.h

index 6abb884e7ad0653ec293a25dceeb04603cbc8851..8dbb09eb47bceeff367c96c65ee20a1727a73863 100644 (file)
--- a/import/pysam.h
+++ b/import/pysam.h
@@ -3,6 +3,17 @@
  
  #include <stdio.h>
  
+#ifndef __has_attribute
+#define __has_attribute(attribute) 0
+#endif
+#ifndef PYSAM_NORETURN
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN
+#endif
+#endif
+
  extern FILE * @pysam@_stderr;
  
  extern FILE * @pysam@_stdout;
@@ -40,6 +51,8 @@ int @pysam@_puts(const char *s);
  
  int @pysam@_dispatch(int argc, char *argv[]);
  
+void PYSAM_NORETURN @pysam@_exit(int status);
+
  void @pysam@_set_optind(int);
  
  extern int @pysam@_main(int argc, char *argv[]);
diff --git a/pysam.py b/pysam.py

deleted file mode 100644 (file)

index 0823abd..0000000
--- a/pysam.py
+++ /dev/null
@@ -1 +0,0 @@
-raise ImportError('''calling "import pysam" from the source directory is not supported - please import pysam from somewhere else.''')
diff --git a/pysam/__init__.py b/pysam/__init__.py

index 40877da052eea7a88cc7ffede8686f8d8c8d4ef0..a6ff6d755e816817b37606331640e2843cc9c25b 100644 (file)
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -11,8 +11,8 @@ import pysam.libcfaidx as libcfaidx
  from pysam.libcfaidx import *
  import pysam.libctabix as libctabix
  from pysam.libctabix import *
-# import pysam.libctabixproxies as libctabixproxies
-# from pysam.libctabixproxies import *
+import pysam.libctabixproxies as libctabixproxies
+from pysam.libctabixproxies import *
  import pysam.libcsamfile as libcsamfile
  from pysam.libcsamfile import *
  import pysam.libcalignmentfile as libcalignmentfile
diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd

index c964160382f41893cf7569f726255cb458a3ad8c..473c5b158f844c859099dafbd9f832a32eab1a59 100644 (file)
--- a/pysam/libcalignedsegment.pxd
+++ b/pysam/libcalignedsegment.pxd
@@ -64,7 +64,7 @@ cdef class AlignedSegment:
  
  
  cdef class PileupColumn:
-    cdef bam_pileup1_t ** plp
+    cdef const bam_pileup1_t ** plp
      cdef int tid
      cdef int pos
      cdef int n_pu
@@ -89,7 +89,7 @@ cdef AlignedSegment makeAlignedSegment(
      AlignmentHeader header)
  
  cdef PileupColumn makePileupColumn(
-     bam_pileup1_t ** plp,
+    const bam_pileup1_t ** plp,
      int tid,
      int pos,
      int n_pu,
@@ -97,7 +97,7 @@ cdef PileupColumn makePileupColumn(
      char * reference_sequence,
      AlignmentHeader header)
  
-cdef PileupRead makePileupRead(bam_pileup1_t * src,
+cdef PileupRead makePileupRead(const bam_pileup1_t * src,
                                AlignmentHeader header)
  
  cdef uint32_t get_alignment_length(bam1_t * src)
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx

index 5674b494ffda48688bf0c09eba6b60a88d9c310e..da7274cfe872bfb9000c4ae278fb3fe89bff3d98 100644 (file)
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -134,7 +134,7 @@ cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
              return toupper(ch)
  
  
-cdef inline bint pileup_base_qual_skip(bam_pileup1_t * p, uint32_t threshold):
+cdef inline bint pileup_base_qual_skip(const bam_pileup1_t * p, uint32_t threshold):
      cdef uint32_t c
      if p.qpos < p.b.core.l_qseq:
          c = bam_get_qual(p.b)[p.qpos]
@@ -608,7 +608,7 @@ cdef AlignedSegment makeAlignedSegment(bam1_t *src,
  
  
  cdef class PileupColumn
-cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp,
+cdef PileupColumn makePileupColumn(const bam_pileup1_t ** plp,
                        int tid,
                        int pos,
                        int n_pu,
@@ -635,7 +635,7 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp,
  
  
  cdef class PileupRead
-cdef PileupRead makePileupRead(bam_pileup1_t *src,
+cdef PileupRead makePileupRead(const bam_pileup1_t *src,
                                 AlignmentHeader header):
      '''return a PileupRead object construted from a bam_pileup1_t * object.'''
      # note that the following does not call __init__
@@ -784,7 +784,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src):
  
      # Check if MD tag is valid by matching CIGAR length to MD tag defined length
      # Insertions would be in addition to what is described by MD, so we calculate
-    # the number of insertions seperately.
+    # the number of insertions separately.
      cdef int insertions = 0
  
      while s[s_idx] != 0:
@@ -978,13 +978,13 @@ cdef class AlignedSegment:
          # requires a valid header.
          return "\t".join(map(str, (self.query_name,
                                     self.flag,
-                                   self.reference_id,
-                                   self.reference_start,
+                                   "#%d" % self.reference_id if self.reference_id >= 0 else "*",
+                                   self.reference_start + 1,
                                     self.mapping_quality,
                                     self.cigarstring,
-                                   self.next_reference_id,
-                                   self.next_reference_start,
-                                   self.query_alignment_length,
+                                   "#%d" % self.next_reference_id if self.next_reference_id >= 0 else "*",
+                                   self.next_reference_start + 1,
+                                   self.template_length,
                                     self.query_sequence,
                                     self.query_qualities,
                                     self.tags)))
@@ -1169,10 +1169,8 @@ cdef class AlignedSegment:
              if qname is None or len(qname) == 0:
                  return
  
-            # See issue #447
-            # (The threshold is 252 chars, but this includes a \0 byte.
-            if len(qname) > 251:
-                raise ValueError("query length out of range {} > 251".format(
+            if len(qname) > 254:
+                raise ValueError("query length out of range {} > 254".format(
                      len(qname)))
  
              qname = force_bytes(qname)
@@ -1392,9 +1390,9 @@ cdef class AlignedSegment:
             read.query_squence = read.query_sequence[5:10]
             read.query_qualities = q[5:10]
  
-        The sequence is returned as it is stored in the BAM file. Some mappers
-        might have stored a reverse complement of the original read
-        sequence.
+        The sequence is returned as it is stored in the BAM file. (This will
+        be the reverse complement of the original read sequence if the mapper
+        has aligned the read to the reverse strand.)
          """
          def __get__(self):
              if self.cache_query_sequence:
@@ -1570,7 +1568,7 @@ cdef class AlignedSegment:
          def __set__(self, val):
              pysam_update_flag(self._delegate, val, BAM_FUNMAP)
              # setting the unmapped flag requires recalculation of
-            # bin as alignment length is now implicitely 1
+            # bin as alignment length is now implicitly 1
              update_bin(self._delegate)
  
      property mate_is_unmapped:
@@ -1843,8 +1841,9 @@ cdef class AlignedSegment:
      def get_forward_sequence(self):
          """return the original read sequence.
  
-        Reads mapping to the reverse strand will be reverse
-        complemented.
+        Reads mapped to the reverse strand are stored reverse complemented in
+        the BAM file. This method returns such reads reverse complemented back
+        to their original orientation.
  
          Returns None if the record has no query sequence.
          """
@@ -1856,9 +1855,12 @@ cdef class AlignedSegment:
          return s
  
      def get_forward_qualities(self):
-        """return base qualities of the read sequence.
+        """return the original base qualities of the read sequence,
+        in the same format as the :attr:`query_qualities` property.
  
-        Reads mapping to the reverse strand will be reversed.
+        Reads mapped to the reverse strand have their base qualities stored
+        reversed in the BAM file. This method returns such reads' base qualities
+        reversed back to their original orientation.
          """
          if self.is_reverse:
              return self.query_qualities[::-1]
@@ -2242,7 +2244,7 @@ cdef class AlignedSegment:
          *value*.
  
          An existing value of the same *tag* will be overwritten unless
-        *replace* is set to False. This is usually not recommened as a
+        *replace* is set to False. This is usually not recommended as a
          tag may only appear once in the optional alignment section.
  
          If *value* is None, the tag will be deleted.
@@ -2468,7 +2470,7 @@ cdef class AlignedSegment:
              return value
  
      def get_tags(self, with_value_type=False):
-        """the fields in the optional aligment section.
+        """the fields in the optional alignment section.
  
          Returns a list of all fields in the optional
          alignment section. Values are converted to appropriate python
@@ -2841,7 +2843,7 @@ cdef class PileupColumn:
                  raise ValueError("PileupColumn accessed after iterator finished")
  
              cdef int x
-            cdef bam_pileup1_t * p = NULL
+            cdef const bam_pileup1_t * p = NULL
              pileups = []
  
              # warning: there could be problems if self.n and self.buf are
@@ -2893,7 +2895,7 @@ cdef class PileupColumn:
          cdef uint32_t x = 0
          cdef uint32_t c = 0
          cdef uint32_t cnt = 0
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
          if self.plp == NULL or self.plp[0] == NULL:
              raise ValueError("PileupColumn accessed after iterator finished")
  
@@ -2941,7 +2943,7 @@ cdef class PileupColumn:
  
          mark_matches: bool
  
-          If True, output bases matching the reference as "," or "."
+          If True, output bases matching the reference as "." or ","
            for forward and reverse strand, respectively. This mark
            requires the reference sequence. If no reference is
            present, this option is ignored.
@@ -2969,7 +2971,7 @@ cdef class PileupColumn:
          cdef uint8_t cc = 0
          cdef uint8_t rb = 0
          cdef kstring_t * buf = &self.buf
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
  
          if self.plp == NULL or self.plp[0] == NULL:
              raise ValueError("PileupColumn accessed after iterator finished")
@@ -3052,7 +3054,7 @@ cdef class PileupColumn:
          list: a list of quality scores
          """
          cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
          cdef uint32_t c = 0
          result = []
          for x from 0 <= x < self.n_pu:
@@ -3083,7 +3085,7 @@ cdef class PileupColumn:
              raise ValueError("PileupColumn accessed after iterator finished")
  
          cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
          result = []
          for x from 0 <= x < self.n_pu:
              p = &(self.plp[0][x])
@@ -3109,7 +3111,7 @@ cdef class PileupColumn:
              raise ValueError("PileupColumn accessed after iterator finished")
  
          cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
          result = []
          for x from 0 <= x < self.n_pu:
              p = &(self.plp[0][x])
@@ -3135,7 +3137,7 @@ cdef class PileupColumn:
              raise ValueError("PileupColumn accessed after iterator finished")
  
          cdef uint32_t x = 0
-        cdef bam_pileup1_t * p = NULL
+        cdef const bam_pileup1_t * p = NULL
          result = []
          for x from 0 <= x < self.n_pu:
              p = &(self.plp[0][x])
diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd

index 6ee496386a3537828fc247d89539e4b697bc4bd9..2a17fbee827d58539d747c6c83534193afac52a1 100644 (file)
--- a/pysam/libcalignmentfile.pxd
+++ b/pysam/libcalignmentfile.pxd
@@ -58,24 +58,6 @@ cdef class AlignmentFile(HTSFile):
      cpdef int write(self, AlignedSegment read) except -1
  
  
-cdef class PileupColumn:
-    cdef bam_pileup1_t ** plp
-    cdef int tid
-    cdef int pos
-    cdef int n_pu
-
-
-cdef class PileupRead:
-    cdef AlignedSegment _alignment
-    cdef int32_t  _qpos
-    cdef int _indel
-    cdef int _level
-    cdef uint32_t _is_del
-    cdef uint32_t _is_head
-    cdef uint32_t _is_tail
-    cdef uint32_t _is_refskip
-
-
  cdef class IteratorRow:
      cdef int retval
      cdef bam1_t * b
@@ -124,7 +106,7 @@ cdef class IteratorColumn:
      cdef int pos
      cdef int n_plp
      cdef uint32_t min_base_quality
-    cdef bam_pileup1_t * plp
+    cdef const bam_pileup1_t * plp
      cdef bam_mplp_t pileup_iter
      cdef __iterdata iterdata
      cdef AlignmentFile samfile
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx

index b8e42303d40d05cd79fce91b074eded3f1f81bbe..e192ff3d7254da41239142fd0e3d6a182061855c 100644 (file)
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -100,7 +100,7 @@ IndexStats = collections.namedtuple("IndexStats",
  ########################################################
  ## global variables
  # maximum genomic coordinace
-# for some reason, using 'int' causes overlflow
+# for some reason, using 'int' causes overflow
  cdef int MAX_POS = (1 << 31) - 1
  
  # valid types for SAM headers
@@ -175,6 +175,12 @@ cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
  
      return header
  
+def read_failure_reason(code):
+    if code == -2:
+        return 'truncated file'
+    else:
+        return "error {} while reading file".format(code)
+
  
  # the following should be class-method for VariantHeader, but cdef @classmethods
  # are not implemented in cython.
@@ -522,7 +528,10 @@ cdef class AlignmentHeader(object):
          returns -1 if reference is not known.
          """
          reference = force_bytes(reference)
-        return bam_name2id(self.ptr, reference)
+        tid = bam_name2id(self.ptr, reference)
+        if tid < -1:
+            raise ValueError('could not parse header')
+        return tid
  
      def __str__(self):
          '''string with the full contents of the :term:`sam file` header as a
@@ -1029,7 +1038,7 @@ cdef class AlignmentFile(HTSFile):
  
          See :meth:`~pysam.HTSFile.parse_region` for more information
          on how genomic regions can be specified. :term:`reference` and
-        `end` are also accepted for backward compatiblity as synonyms
+        `end` are also accepted for backward compatibility as synonyms
          for :term:`contig` and `stop`, respectively.
  
          Without a `contig` or `region` all mapped reads in the file
@@ -1212,7 +1221,7 @@ cdef class AlignmentFile(HTSFile):
          """perform a :term:`pileup` within a :term:`region`. The region is
          specified by :term:`contig`, `start` and `stop` (using
          0-based indexing).  :term:`reference` and `end` are also accepted for
-        backward compatiblity as synonyms for :term:`contig` and `stop`,
+        backward compatibility as synonyms for :term:`contig` and `stop`,
          respectively.  Alternatively, a samtools 'region' string
          can be supplied.
  
@@ -1239,7 +1248,7 @@ cdef class AlignmentFile(HTSFile):
  
             By default, the samtools pileup engine outputs all reads
             overlapping a region. If truncate is True and a region is
-           given, only columns in the exact region specificied are
+           given, only columns in the exact region specified are
             returned.
  
          max_depth : int
@@ -1288,7 +1297,7 @@ cdef class AlignmentFile(HTSFile):
          min_base_quality: int
  
             Minimum base quality. Bases below the minimum quality will
-           not be output.
+           not be output. The default is 13.
  
          adjust_capq_threshold: int
  
@@ -1354,7 +1363,7 @@ cdef class AlignmentFile(HTSFile):
  
          The region is specified by :term:`contig`, `start` and `stop`.
          :term:`reference` and `end` are also accepted for backward
-        compatiblity as synonyms for :term:`contig` and `stop`,
+        compatibility as synonyms for :term:`contig` and `stop`,
          respectively.  Alternatively, a :term:`samtools` :term:`region`
          string can be supplied.
  
@@ -1458,7 +1467,7 @@ cdef class AlignmentFile(HTSFile):
  
          The region is specified by :term:`contig`, `start` and `stop`.
          :term:`reference` and `end` are also accepted for backward
-        compatiblity as synonyms for :term:`contig` and `stop`,
+        compatibility as synonyms for :term:`contig` and `stop`,
          respectively.  Alternatively, a :term:`samtools` :term:`region`
          string can be supplied.  The coverage is computed per-base [ACGT].
  
@@ -1575,6 +1584,8 @@ cdef class AlignmentFile(HTSFile):
  
              # count
              seq = read.seq
+            if seq is None:
+                continue
              quality = read.query_qualities
  
              for qpos, refpos in read.get_aligned_pairs(True):
@@ -1779,7 +1790,8 @@ cdef class AlignmentFile(HTSFile):
  
      property nocoordinate:
          """int with total number of reads without coordinates according to the
-        statistics recorded in the index. This is a read-only attribute.
+        statistics recorded in the index, i.e., the statistic printed for "*"
+        by the ``samtools idxstats`` command. This is a read-only attribute.
          """
          def __get__(self):
              self.check_index()
@@ -1790,7 +1802,8 @@ cdef class AlignmentFile(HTSFile):
  
      def get_index_statistics(self):
          """return statistics about mapped/unmapped reads per chromosome as
-        they are stored in the index.
+        they are stored in the index, similarly to the statistics printed
+        by the ``samtools idxstats`` command.
  
          Returns:
              list :
@@ -1846,12 +1859,12 @@ cdef class AlignmentFile(HTSFile):
  
      def __next__(self):
          cdef int ret = self.cnext()
-        if (ret >= 0):
+        if ret >= 0:
              return makeAlignedSegment(self.b, self.header)
-        elif ret == -2:
-            raise IOError('truncated file')
-        else:
+        elif ret == -1:
              raise StopIteration
+        else:
+            raise IOError(read_failure_reason(ret))
  
      ###########################################
      # methods/properties referencing the header
@@ -1886,7 +1899,7 @@ cdef class AlignmentFile(HTSFile):
  
      def get_reference_length(self, reference):
          """
-        return :term:`reference` name corresponding to numerical :term:`tid`
+        return :term:`reference` length corresponding to numerical :term:`tid`
          """
          if self.header is None:
              raise ValueError("header not available in closed files")
@@ -2138,10 +2151,10 @@ cdef class IteratorRowHead(IteratorRow):
          if ret >= 0:
              self.current_row += 1
              return makeAlignedSegment(self.b, self.header)
-        elif ret == -2:
-            raise IOError('truncated file')
-        else:
+        elif ret == -1:
              raise StopIteration
+        else:
+            raise IOError(read_failure_reason(ret))
  
  
  cdef class IteratorRowAll(IteratorRow):
@@ -2183,10 +2196,10 @@ cdef class IteratorRowAll(IteratorRow):
          cdef int ret = self.cnext()
          if ret >= 0:
              return makeAlignedSegment(self.b, self.header)
-        elif ret == -2:
-            raise IOError('truncated file')
-        else:
+        elif ret == -1:
              raise StopIteration
+        else:
+            raise IOError(read_failure_reason(ret))
  
  
  cdef class IteratorRowAllRefs(IteratorRow):
@@ -2217,7 +2230,7 @@ cdef class IteratorRowAllRefs(IteratorRow):
          self.rowiter = IteratorRowRegion(self.samfile,
                                           self.tid,
                                           0,
-                                         1<<29)
+                                         MAX_POS)
          # set htsfile and header of the rowiter
          # to the values in this iterator to reflect multiple_iterators
          self.rowiter.htsfile = self.htsfile
@@ -2301,10 +2314,10 @@ cdef class IteratorRowSelection(IteratorRow):
          cdef int ret = self.cnext()
          if ret >= 0:
              return makeAlignedSegment(self.b, self.header)
-        elif ret == -2:
-            raise IOError('truncated file')
-        else:
+        elif ret == -1:
              raise StopIteration
+        else:
+            raise IOError(read_failure_reason(ret))
  
  
  cdef int __advance_nofilter(void *data, bam1_t *b):
@@ -2434,7 +2447,7 @@ cdef class IteratorColumn:
  
      For reasons of efficiency, the iterator points to the current
      pileup buffer. The pileup buffer is updated at every iteration.
-    This might cause some unexpected behavious. For example,
+    This might cause some unexpected behaviour. For example,
      consider the conversion to a list::
  
         f = AlignmentFile("file.bam", "rb")
@@ -2661,7 +2674,7 @@ cdef class IteratorColumn:
          # reset in order to avoid memory leak messages for iterators
          # that have not been fully consumed
          self._free_pileup_iter()
-        self.plp = <bam_pileup1_t*>NULL
+        self.plp = <const bam_pileup1_t*>NULL
  
          if self.iterdata.seq != NULL:
              free(self.iterdata.seq)
@@ -2858,9 +2871,7 @@ cdef class SNPCall:
  
  
  cdef class IndexedReads:
-    """*(AlignmentFile samfile, multiple_iterators=True)
-
-    Index a Sam/BAM-file by query name while keeping the
+    """Index a Sam/BAM-file by query name while keeping the
      original sort order intact.
  
      The index is kept in memory and can be substantial.
diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx

index c9bcbd25b77feddfa762bc223f15a244c38d5220..05a5fe8ade55b882967d9616be880416f1708588 100644 (file)
--- a/pysam/libcbcf.pyx
+++ b/pysam/libcbcf.pyx
@@ -106,6 +106,24 @@ from pysam.utils import unquoted_str
  __all__ = ['VariantFile',
             'VariantHeader',
             'VariantHeaderRecord',
+           'VariantHeaderRecords',
+           'VariantMetadata',
+           'VariantHeaderMetadata',
+           'VariantContig',
+           'VariantHeaderContigs',
+           'VariantHeaderSamples',
+           'VariantRecordFilter',
+           'VariantRecordFormat',
+           'VariantRecordInfo',
+           'VariantRecordSamples',
+           'VariantRecord',
+           'VariantRecordSample',
+           'BaseIndex',
+           'BCFIndex',
+           'TabixIndex',
+           'BaseIterator',
+           'BCFIterator',
+           'TabixIterator',
             'VariantRecord']
  
  ########################################################################
@@ -125,7 +143,7 @@ cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
  ########################################################################
  
  from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport encode_filename, from_string_and_size, decode_bytes
  
  
  ########################################################################
@@ -166,7 +184,7 @@ cdef inline bcf_str_cache_get_charptr(const char* s):
  ########################################################################
  
  cdef int comb(int n, int k) except -1:
-    """Return binomial coeffient: n choose k
+    """Return binomial coefficient: n choose k
  
      >>> comb(5, 1)
      5
@@ -284,7 +302,7 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca
              else:
                  # Otherwise, copy the entire block
                  b = datac[:n]
-            value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
+            value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
      else:
          value = []
          if type == BCF_BT_INT8:
@@ -3141,7 +3159,7 @@ cdef class VariantRecord(object):
          # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
          # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
          if (r.d.m_id == 0):
-            raise ValueError('Error extracing ID')
+            raise ValueError('Error extracting ID')
          return charptr_to_str(r.d.id) if r.d.id != b'.' else None
  
      @id.setter
@@ -3755,7 +3773,7 @@ cdef class BaseIterator(object):
      pass
  
  
-# Interal function to clean up after iteration stop or failure.
+# Internal function to clean up after iteration stop or failure.
  # This would be a nested function if it weren't a cdef function.
  cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
      bcf_destroy1(record)
@@ -3786,7 +3804,7 @@ cdef class BCFIterator(BaseIterator):
          try:
              rid = index.refmap[contig]
          except KeyError:
-            # A query for a non-existant contig yields an empty iterator, does not raise an error
+            # A query for a non-existent contig yields an empty iterator, does not raise an error
              self.iter = NULL
              return
  
@@ -3874,7 +3892,7 @@ cdef class TabixIterator(BaseIterator):
          try:
              rid = index.refmap[contig]
          except KeyError:
-            # A query for a non-existant contig yields an empty iterator, does not raise an error
+            # A query for a non-existent contig yields an empty iterator, does not raise an error
              self.iter = NULL
              return
  
@@ -4346,9 +4364,10 @@ cdef class VariantFile(HTSFile):
          return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
  
      def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
-        """fetch records in a :term:`region` using 0-based indexing. The
-        region is specified by :term:`contig`, *start* and *end*.
-        Alternatively, a samtools :term:`region` string can be supplied.
+        """fetch records in a :term:`region`, specified either by
+        :term:`contig`, *start*, and *end* (which are 0-based, half-open);
+        or alternatively by a samtools :term:`region` string (which is
+        1-based inclusive).
  
          Without *contig* or *region* all mapped records will be fetched.  The
          records will be returned ordered by contig, which will not necessarily
diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd

index 62a6f3d4ad9c1362115de52c5859521a315b2cc6..d57f784d89c03f1106140ecff629185ddc3efee6 100644 (file)
--- a/pysam/libcbcftools.pxd
+++ b/pysam/libcbcftools.pxd
@@ -1,6 +1,6 @@
  cdef extern from "bcftools.pysam.h":
  
-    int bcftools_main(int argc, char *argv[])
+    int bcftools_dispatch(int argc, char *argv[])
      void bcftools_set_stderr(int fd)
      void bcftools_close_stderr()
      void bcftools_set_stdout(int fd)
diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx

index a70d42dfc2167870c61f3804787e2bf5102ded81..e73adf937b4b7a453dcf4261097b54f150c94fc5 100644 (file)
--- a/pysam/libcfaidx.pyx
+++ b/pysam/libcfaidx.pyx
@@ -496,7 +496,7 @@ cdef class FastxRecord:
  
  
  cdef class FastxFile:
-    """Stream access to :term:`fasta` or :term:`fastq` formatted files.
+    r"""Stream access to :term:`fasta` or :term:`fastq` formatted files.
  
      The file is automatically opened.
  
@@ -541,7 +541,7 @@ cdef class FastxFile:
      ...        print(entry.quality)
      >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
      ...    for entry in fin:
-    ...        fout.write(str(entry))
+    ...        fout.write(str(entry) + '\n')
  
      """
      def __cinit__(self, *args, **kwargs):
diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd

index 370e4923bac091838908868f210da9964d75643b..9684ef9de13f8ee7685c3adbf9a513058b77065e 100644 (file)
--- a/pysam/libchtslib.pxd
+++ b/pysam/libchtslib.pxd
@@ -275,7 +275,7 @@ cdef extern from "htslib/bgzf.h" nogil:
      int SEEK_SET
  
      #  Return a virtual file pointer to the current location in the file.
-    #  No interpetation of the value should be made, other than a subsequent
+    #  No interpretation of the value should be made, other than a subsequent
      #  call to bgzf_seek can be used to position the file at the same point.
      #  Return value is non-negative on success.
      int64_t bgzf_tell(BGZF *fp)
@@ -326,7 +326,7 @@ cdef extern from "htslib/bgzf.h" nogil:
      #  Read one line from a BGZF file. It is faster than bgzf_getc()
      #
      #  @param fp     BGZF file handler
-    #  @param delim  delimitor
+    #  @param delim  delimiter
      #  @param str    string to write to; must be initialized
      #  @return       length of the string; 0 on end-of-file; negative on error
      int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
@@ -796,7 +796,7 @@ cdef extern from "htslib/hts.h" nogil:
  
      ctypedef struct hts_md5_context
  
-    # /*! @abstract   Intialises an MD5 context.
+    # /*! @abstract   Initialises an MD5 context.
      #  *  @discussion
      #  *    The expected use is to allocate an hts_md5_context using
      #  *    hts_md5_init().  This pointer is then passed into one or more calls
@@ -1353,10 +1353,10 @@ cdef extern from "htslib/tbx.h" nogil:
  
      # tbx.h definitions
      int8_t TBX_MAX_SHIFT
-    int8_t TBX_GENERIC
-    int8_t TBX_SAM
-    int8_t TBX_VCF
-    int8_t TBX_UCSC
+    int32_t TBX_GENERIC
+    int32_t TBX_SAM
+    int32_t TBX_VCF
+    int32_t TBX_UCSC
  
      ctypedef struct tbx_conf_t:
          int32_t preset
@@ -1418,7 +1418,7 @@ cdef extern from "htslib/vcf.h" nogil:
  
      # === Dictionary ===
      #
-    # The header keeps three dictonaries. The first keeps IDs in the
+    # The header keeps three dictionaries. The first keeps IDs in the
      # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
      # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
      # is the actual hash table, which is opaque to the end users. In the hash
@@ -2112,8 +2112,7 @@ cdef extern from "htslib/vcfutils.h" nogil:
      # be determined.
      #
      # The value of @which determines if existing INFO/AC,AN can be
-    # used (BCF_UN_INFO) and and if indv fields can be splitted
-    # (BCF_UN_FMT).
+    # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT).
      int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
  
      # bcf_gt_type() - determines type of the genotype
@@ -2261,7 +2260,7 @@ cdef extern from "htslib/cram.h" nogil:
      # the container, meaning multiple compression headers to manipulate.
      # Changing RG may change the size of the compression header and
      # therefore the length field in the container.  Hence we rewrite all
-    # blocks just incase and also emit the adjusted container.
+    # blocks just in case and also emit the adjusted container.
      #
      # The current implementation can only cope with renumbering a single
      # RG (and only then if it is using HUFFMAN or BETA codecs).  In
@@ -2511,7 +2510,7 @@ cdef extern from "htslib/cram.h" nogil:
      #         2 if the file is a stream and thus unseekable
      #         1 if the file contains an EOF block
      #         0 if the file does not contain an EOF block
-    #        -1 if an error occured whilst reading the file or we could not seek back to where we were
+    #        -1 if an error occurred whilst reading the file or we could not seek back to where we were
      #
      #
      int cram_check_EOF(cram_fd *fd)
diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx

index 92d4e8f4a20af69daa2508aa454394f1bad7273d..778fc23cba15fac13b6fdda9e2f7c64d444fb9ff 100644 (file)
--- a/pysam/libchtslib.pyx
+++ b/pysam/libchtslib.pyx
@@ -72,7 +72,7 @@ cdef class HFile(object):
      cdef hFILE *fp
      cdef readonly object name, mode
  
-    def __init__(self, name, mode='r', closedf=True):
+    def __init__(self, name, mode='r', closefd=True):
          self._open(name, mode, closefd=True)
  
      def __dealloc__(self):
@@ -585,7 +585,7 @@ cdef class HTSFile(object):
                  rval = hts_opt_apply(self.htsfile, opts)
                  if rval != 0:
                      hts_opt_free(opts)
-                    raise RuntimeError('An error occured while applying the requested format options')
+                    raise RuntimeError('An error occurred while applying the requested format options')
                  hts_opt_free(opts)
  
      def parse_region(self, contig=None, start=None, stop=None,
@@ -595,7 +595,7 @@ cdef class HTSFile(object):
          either be specified by :term:`contig`, `start` and
          `stop`. `start` and `stop` denote 0-based, half-open
          intervals. :term:`reference` and `end` are also accepted for
-        backward compatiblity as synonyms for :term:`contig` and
+        backward compatibility as synonyms for :term:`contig` and
          `stop`, respectively.
  
          Alternatively, a samtools :term:`region` string can be
diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd

index 70fda608fd4fb85b279029fb4de390523136fbfe..3c3947604f9106079c15770093faf1afc5cceb43 100644 (file)
--- a/pysam/libcsamtools.pxd
+++ b/pysam/libcsamtools.pxd
@@ -1,6 +1,6 @@
  cdef extern from "samtools.pysam.h":
  
-    int samtools_main(int argc, char *argv[])
+    int samtools_dispatch(int argc, char *argv[])
      void samtools_set_stderr(int fd)
      void samtools_close_stderr()
      void samtools_set_stdout(int fd)
diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx

index e581b617eb7eef694fb31e54d15cb9b876a9e041..44364208eac198ab680b946e25fd5f9e46a0100b 100644 (file)
--- a/pysam/libctabix.pyx
+++ b/pysam/libctabix.pyx
@@ -53,7 +53,6 @@
  # DEALINGS IN THE SOFTWARE.
  #
  ###############################################################################
-import binascii
  import os
  import sys
  
@@ -75,8 +74,8 @@ from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
      tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
      tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
      tbx_destroy, hisremote, region_list, hts_getline, \
-    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, htsExactFormat, bcf, \
-    bcf_index_build2
+    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
+    no_compression, bcf, bcf_index_build2
  
  from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
  from pysam.libcutils cimport encode_filename, from_string_and_size
@@ -302,7 +301,7 @@ cdef class TabixFile:
  
      index : string
          The filename of the index. If not set, the default is to
-        assume that the index is called ``filename.tbi`
+        assume that the index is called ``filename.tbi``
  
      mode : char
          The file opening mode. Currently, only ``r`` is permitted.
@@ -581,7 +580,7 @@ cdef class TabixFile:
      property contigs:
          '''list of chromosome names'''
          def __get__(self):
-            cdef char ** sequences
+            cdef const char ** sequences
              cdef int nsequences
              
              with nogil:
@@ -880,13 +879,6 @@ def tabix_compress(filename_in,
              raise IOError("error %i when closing file %s" % (r, filename_in))
  
  
-def is_gzip_file(filename):
-    gzip_magic_hex = b'1f8b'
-    fd = os.open(filename, os.O_RDONLY)
-    header = os.read(fd, 2)
-    return header == binascii.a2b_hex(gzip_magic_hex)
-
-
  def tabix_index(filename,
                  force=False,
                  seq_col=None,
@@ -928,16 +920,13 @@ def tabix_index(filename,
      compressed. The original file will be removed and only the compressed
      file will be retained.
  
-    *min-shift* sets the minimal interval size to 1<<INT; 0 for the
-    old tabix index. The default of -1 is changed inside htslib to 
-    the old tabix default of 0.
+    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
+    is greater than zero and/or *csi* is True, creates a CSI index with a
+    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).
  
      *index* controls the filename which should be used for creating the index.
      If not set, the default is to append ``.tbi`` to *filename*.
  
-    If *csi* is set, create a CSI index, the default is to create a
-    TBI index.
-
      When automatically compressing files, if *keep_original* is set the
      uncompressed file will not be deleted.
  
@@ -945,27 +934,29 @@ def tabix_index(filename,
  
      '''
      
-    if not os.path.exists(filename):
-        raise IOError("No such file '%s'" % filename)
-
      if preset is None and \
         (seq_col is None or start_col is None or end_col is None):
          raise ValueError(
              "neither preset nor seq_col,start_col and end_col given")
  
-    if not is_gzip_file(filename):
-        tabix_compress(filename, filename + ".gz", force=force)
-        if not keep_original:
-            os.unlink(filename)
-        filename += ".gz"
-
      fn = encode_filename(filename)
      cdef char *cfn = fn
  
      cdef htsFile *fp = hts_open(cfn, "r")
-    cdef htsExactFormat fmt = fp.format.format
+    if fp == NULL:
+        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
+
+    cdef htsFormat fmt = hts_get_format(fp)[0]
      hts_close(fp)
-    
+
+    if fmt.compression == no_compression:
+        tabix_compress(filename, filename + ".gz", force=force)
+        if not keep_original:
+            os.unlink(filename)
+        filename += ".gz"
+        fn = encode_filename(filename)
+        cfn = fn
+
      # columns (1-based):
      #   preset-code, contig, start, end, metachar for
      #     comments, lines to ignore at beginning
@@ -979,10 +970,8 @@ def tabix_index(filename,
          }
      
      conf_data = None
-    if preset == "bcf" or fmt == bcf:
+    if preset == "bcf" or fmt.format == bcf:
          csi = True
-        if min_shift == -1:
-            min_shift = 14
      elif preset:
          try:
              conf_data = preset2conf[preset]
@@ -1010,10 +999,13 @@ def tabix_index(filename,
      if conf_data:
          conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
  
-    if csi:
+    if csi or min_shift > 0:
          suffix = ".csi"
+        if min_shift <= 0: min_shift = 14
      else:
          suffix = ".tbi"
+        min_shift = 0
+
      index = index or filename + suffix    
      fn_index = encode_filename(index)
  
@@ -1024,7 +1016,7 @@ def tabix_index(filename,
      cdef char *fnidx = fn_index
      cdef int retval = 0
  
-    if csi and fmt == bcf:
+    if csi and fmt.format == bcf:
          with nogil:
              retval = bcf_index_build2(cfn, fnidx, min_shift)
      else:
diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd

index 9e1cce1a8e64ad5cb7f847e640f85020d132baab..d78b70608fecfd0729dbc0fa825737e166dcaab3 100644 (file)
--- a/pysam/libcutils.pxd
+++ b/pysam/libcutils.pxd
@@ -14,15 +14,21 @@ cpdef array_to_qualitystring(c_array.array arr, int offset=*)
  cpdef qualities_to_qualitystring(qualities, int offset=*)
  
  ########################################################################
+## String encoding configuration facilities
  ########################################################################
+
+cpdef get_encoding_error_handler()
+cpdef set_encoding_error_handler(name)
+
  ########################################################################
  ## Python 3 compatibility functions
  ########################################################################
-cdef charptr_to_str(const char *s, encoding=*)
-cdef bytes charptr_to_bytes(const char *s, encoding=*)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
-cdef force_str(object s, encoding=*)
-cdef bytes force_bytes(object s, encoding=*)
+cdef charptr_to_str(const char *s, encoding=*, errors=*)
+cdef bytes charptr_to_bytes(const char *s, encoding=*, errors=*)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*, errors=*)
+cdef force_str(object s, encoding=*, errors=*)
+cdef bytes force_bytes(object s, encoding=*, errors=*)
+cdef decode_bytes(bytes s, encoding=*, errors=*)
  cdef bytes encode_filename(object filename)
  cdef from_string_and_size(const char *s, size_t length)
  
diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx

index fe61bb8f9db2a534a16369b7a2e48fe6b8a4d0b0..adc9cec7a19ea08ce6eaeb0c0777c05fa7a4f0db 100644 (file)
--- a/pysam/libcutils.pyx
+++ b/pysam/libcutils.pyx
@@ -6,6 +6,7 @@ import tempfile
  import os
  import io
  from contextlib import contextmanager
+from codecs import register_error
  
  from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
  from cpython cimport PyBytes_Check, PyUnicode_Check
@@ -17,10 +18,10 @@ from libc.stdio cimport fprintf, stderr, fflush
  from libc.stdio cimport stdout as c_stdout
  from posix.fcntl cimport open as c_open, O_WRONLY
  
-from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
+from libcsamtools cimport samtools_dispatch, samtools_set_stdout, samtools_set_stderr, \
      samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind
  
-from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
+from libcbcftools cimport bcftools_dispatch, bcftools_set_stdout, bcftools_set_stderr, \
      bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind
  
  #####################################################################
@@ -82,7 +83,27 @@ cpdef qualities_to_qualitystring(qualities, int offset=33):
  
  
  ########################################################################
+## String encoding configuration facilities
  ########################################################################
+
+# Codec error handler that interprets every undecodable byte as ISO-8859-1.
+def latin1_replace(exception):
+    """Decoding error handler mapping each offending byte to its Latin-1 character.
+
+    `exception` is the UnicodeDecodeError raised by the codec.  Returns the
+    (replacement text, resume position) pair expected from a codecs error
+    handler.
+    """
+    # The codec may flag several bytes at once (e.g. a truncated multi-byte
+    # UTF-8 sequence).  Translate the whole start:end range rather than only
+    # its first byte, otherwise the remaining flagged bytes would be dropped
+    # because decoding resumes at exception.end.
+    bad_bytes = exception.object[exception.start:exception.end]
+    return (bad_bytes.decode('latin-1'), exception.end)
+
+register_error('pysam.latin1replace', latin1_replace)
+
+
+cdef str ERROR_HANDLER = 'strict'
+
+cpdef get_encoding_error_handler():  # returns the module-level ERROR_HANDLER name currently in effect
+    return ERROR_HANDLER
+
+cpdef set_encoding_error_handler(name):  # makes `name` the module-level ERROR_HANDLER; returns the old one
+    global ERROR_HANDLER
+    previous = ERROR_HANDLER  # old value, handed back to the caller
+    ERROR_HANDLER = name  # no validation here that `name` is a registered codecs error handler
+    return previous
+
  ########################################################################
  ## Python 3 compatibility functions
  ########################################################################
@@ -91,7 +112,7 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
  
  cdef from_string_and_size(const char* s, size_t length):
      if IS_PYTHON3:
-        return s[:length].decode("utf8")
+        return s[:length].decode('utf-8', ERROR_HANDLER)
      else:
          return s[:length]
  
@@ -115,7 +136,7 @@ cdef bytes encode_filename(object filename):
          raise TypeError("Argument must be string or unicode.")
  
  
-cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
+cdef bytes force_bytes(object s, encoding=None, errors=None):
      """convert string or unicode object to bytes, assuming
      utf8 encoding.
      """
@@ -124,37 +145,37 @@ cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
      elif PyBytes_Check(s):
          return s
      elif PyUnicode_Check(s):
-        return s.encode(encoding)
+        return s.encode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
      else:
          raise TypeError("Argument must be string, bytes or unicode.")
  
  
-cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING):
+cdef charptr_to_str(const char* s, encoding=None, errors=None):
      if s == NULL:
          return None
      if PY_MAJOR_VERSION < 3:
          return s
      else:
-        return s.decode(encoding)
+        return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  
  
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING):
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None):
      if s == NULL:
          return None
      if PY_MAJOR_VERSION < 3:
          return s[:n]
      else:
-        return s[:n].decode(encoding)
+        return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
  
  
-cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING):
+cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None):
      if s == NULL:
          return None
      else:
          return s
  
  
-cdef force_str(object s, encoding=TEXT_ENCODING):
+cdef force_str(object s, encoding=None, errors=None):
      """Return s converted to str type of current Python
      (bytes in Py2, unicode in Py3)"""
      if s is None:
@@ -162,12 +183,21 @@ cdef force_str(object s, encoding=TEXT_ENCODING):
      if PY_MAJOR_VERSION < 3:
          return s
      elif PyBytes_Check(s):
-        return s.decode(encoding)
+        return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
      else:
          # assume unicode
          return s
  
  
+cdef decode_bytes(bytes s, encoding=None, errors=None):
+    """Decode the bytes object `s` into text, on Python 2 as well as Python 3.
+
+    None is passed through unchanged.  A missing (or falsy) `encoding` or
+    `errors` falls back to TEXT_ENCODING and ERROR_HANDLER respectively."""
+    if s is not None:
+        return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
+    return None
+
+
  cpdef parse_region(contig=None,
                     start=None,
                     stop=None,
@@ -179,7 +209,7 @@ cpdef parse_region(contig=None,
      `end`. `start` and `end` denote 0-based, half-open intervals.
      
      :term:`reference` and `end` are also accepted for backward
-    compatiblity as synonyms for :term:`contig` and `stop`,
+    compatibility as synonyms for :term:`contig` and `stop`,
      respectively.
  
      Alternatively, a samtools :term:`region` string can be supplied.
@@ -386,13 +416,13 @@ def _pysam_dispatch(collection,
      if collection == b"samtools":
          samtools_set_stdout(stdout_h)
          samtools_set_stderr(stderr_h)
-        retval = samtools_main(n + 2, cargs)
+        retval = samtools_dispatch(n + 2, cargs)
          samtools_close_stdout()
          samtools_close_stderr()
      elif collection == b"bcftools":
          bcftools_set_stdout(stdout_h)
          bcftools_set_stderr(stderr_h)
-        retval = bcftools_main(n + 2, cargs)
+        retval = bcftools_dispatch(n + 2, cargs)
          bcftools_close_stdout()
          bcftools_close_stderr()
  
@@ -425,6 +455,10 @@ def _pysam_dispatch(collection,
      return retval, out_stderr, out_stdout
  
  
-__all__ = ["qualitystring_to_array",
-           "array_to_qualitystring",
-           "qualities_to_qualitystring"]
+__all__ = [
+    "qualitystring_to_array",
+    "array_to_qualitystring",
+    "qualities_to_qualitystring",
+    "get_encoding_error_handler",
+    "set_encoding_error_handler",
+]
diff --git a/pysam/samtools.py b/pysam/samtools.py

index 58cc2eea55b7e38ef7ea3cbb9869d37560823df2..9042cc1768497d607e466ec6133f8e44e9880294 100644 (file)
--- a/pysam/samtools.py
+++ b/pysam/samtools.py
@@ -37,6 +37,10 @@ SAMTOOLS_DISPATCH = {
      "quickcheck": ("quickcheck", None),
      "split": ("split", None),
      "flags": ("flags", None),
+    "ampliconclip": ("ampliconclip", None),
+    "ampliconstats": ("ampliconstats", None),
+    "version": ("version", None),
+    "fqimport": ("import", None),
  }
  
  # instantiate samtools commands as python functions
diff --git a/pysam/version.h b/pysam/version.h

index 7c4ea995fa98c480e1d067dd540b2366f98e488f..33676ea6cedf96120099cf424852405b2af4fb54 100644 (file)
--- a/pysam/version.h
+++ b/pysam/version.h
@@ -1,5 +1,5 @@
  // Version information used while compiling samtools, bcftools, and htslib
  
-#define SAMTOOLS_VERSION "1.10 (pysam)"
-#define BCFTOOLS_VERSION "1.10.2 (pysam)"
-#define HTS_VERSION_TEXT "1.10.2 (pysam)"
+#define SAMTOOLS_VERSION "1.13 (pysam)"
+#define BCFTOOLS_VERSION "1.13 (pysam)"
+#define HTS_VERSION_TEXT "1.13 (pysam)"
diff --git a/pysam/version.py b/pysam/version.py

index 3ad71c7d5e70201fe99ec4ad89aed003b81279e6..8c871baec97286935a633b58f439804a1aea943f 100644 (file)
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,6 +1,6 @@
  # pysam versioning information
-__version__ = "0.16.0.1"
+__version__ = "0.17.0"
  
-__samtools_version__ = "1.10"
-__bcftools_version__ = "1.10.2"
-__htslib_version__ = "1.10.2"
+__samtools_version__ = "1.13"
+__bcftools_version__ = "1.13"
+__htslib_version__ = "1.13"
diff --git a/samtools/LICENSE b/samtools/LICENSE

index 3c56f4841d8985bf5a85faa2544564125e46830b..cd102b88d2e6e68b8444a8acbb894395c40e2754 100644 (file)
--- a/samtools/LICENSE
+++ b/samtools/LICENSE
@@ -1,6 +1,6 @@
  The MIT/Expat License
  
-Copyright (C) 2008-2019 Genome Research Ltd.
+Copyright (C) 2008-2021 Genome Research Ltd.
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
diff --git a/samtools/README b/samtools/README

index bb7af6c66b27a40f279d8d418bbff77c2ac55f7b..dd276706833a7d4f0d6258388a2da6d3422f3fd7 100644 (file)
--- a/samtools/README
+++ b/samtools/README
@@ -9,7 +9,7 @@ Building samtools
  The typical simple case of building Samtools using the HTSlib bundled within
  this Samtools release tarball is done as follows:
  
-    cd .../samtools-1.10 # Within the unpacked release directory
+    cd .../samtools-1.13 # Within the unpacked release directory
      ./configure
      make
  
@@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing.  Building for
  installation using the HTSlib bundled within this Samtools release tarball,
  and building the various HTSlib utilities such as bgzip is done as follows:
  
-    cd .../samtools-1.10 # Within the unpacked release directory
+    cd .../samtools-1.13 # Within the unpacked release directory
      ./configure --prefix=/path/to/location
      make all all-htslib
      make install install-htslib
@@ -48,7 +48,7 @@ There are two advantages to this:
  To build with plug-ins, you need to use the --enable-plugins configure option
  as follows:
  
-    cd .../samtools-1.10 # Within the unpacked release directory
+    cd .../samtools-1.13 # Within the unpacked release directory
      ./configure --enable-plugins --prefix=/path/to/location
      make all all-htslib
      make install install-htslib
@@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from
  the source distribution instead of installing the package.  In that case
  you can use:
  
-    cd .../samtools-1.10 # Within the unpacked release directory
-    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.10
+    cd .../samtools-1.13 # Within the unpacked release directory
+    ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13
      make all all-htslib
  
  It is possible to override the built-in search path using the HTS_PATH
@@ -99,3 +99,28 @@ Benchmarks comparing the various zlibs are available at:
  
  It is recommended that you perform your own rigorous tests for an entire
  pipeline if you wish to switch to one of the optimised zlib implementations.
+
+Citing
+======
+
+Please cite this paper when using SAMtools for your publications:
+
+Twelve years of SAMtools and BCFtools
+Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li
+GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008
+
+@article{10.1093/gigascience/giab008,
+    author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
+    title = "{Twelve years of SAMtools and BCFtools}",
+    journal = {GigaScience},
+    volume = {10},
+    number = {2},
+    year = {2021},
+    month = {02},
+    abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods. The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines. Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed >1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
+    issn = {2047-217X},
+    doi = {10.1093/gigascience/giab008},
+    url = {https://doi.org/10.1093/gigascience/giab008},
+    note = {giab008},
+    eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
+}
diff --git a/samtools/amplicon_stats.c b/samtools/amplicon_stats.c

new file mode 100644 (file)

index 0000000..62bb15c
--- /dev/null
+++ b/samtools/amplicon_stats.c
@@ -0,0 +1,1754 @@
+/*  amplicon_stats.c -- statistics on the distribution of reads across amplicons.
+
+    Copyright (C) 2020-2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+/*
+ * This tool is designed to give "samtools stats" style output, but dedicated
+ * to small amplicon sequencing projects.  It gathers stats on the
+ * distribution of reads across amplicons.
+ */
+
+/*
+ * TODO:
+ * - Cope with multiple references.  What do we do here?  Just request one?
+ * - Permit regions rather than consuming whole file (maybe solves above).
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <math.h>
+
+#include <htslib/sam.h>
+#include <htslib/khash.h>
+
+#include "samtools.h"
+#include "sam_opts.h"
+#include "bam_ampliconclip.h"
+
+KHASH_MAP_INIT_INT64(tcoord, int64_t)
+KHASH_MAP_INIT_STR(qname, int64_t)
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>=0?(a):-(a))
+#endif
+
+#define TCOORD_MIN_COUNT   10
+#define MAX_AMP 1000       // Default maximum number of amplicons
+#define MAX_AMP_LEN 1000   // Default maximum length of any single amplicon
+#define MAX_PRIMER_PER_AMPLICON 4  // Max primers per LEFT/RIGHT
+#define MAX_DEPTH 5        // Number of different depths permitted
+
+typedef struct { // Settings shared by the ampliconstats routines
+    sam_global_args ga;
+    uint32_t flag_require; // Reads must have all of these FLAG bits set
+    uint32_t flag_filter;  // Reads with any of these FLAG bits are filtered
+    int max_delta;   // Used for matching read to amplicon primer loc
+    int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
+    int use_sample_name;
+    int max_amp;     // Upper limit on the number of amplicons (-a option)
+    int max_amp_len; // Maximum length of an individual amplicon
+    double depth_bin;// aggregate depth within this fraction
+    int tlen_adj;    // Adjust tlen by this amount, due to clip but no fixmate
+    FILE *out_fp;    // Stream the report is written to
+    char *argv;
+    int tcoord_min_count;
+    int tcoord_bin;
+    int multi_ref;   // Non-zero: AMPLICON lines also carry the reference name
+} astats_args_t;
+
+typedef struct { // Statistics accumulators; built by stats_alloc(), released by stats_free()
+    int nseq;       // total sequence count
+    int nfiltered;  // sequence filtered
+    int nfailprimer;// count of sequences not matching the primer locations
+
+    // Sizes of memory allocated below, to permit reset
+    int max_amp, max_amp_len, max_len;
+
+    // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
+    int64_t *nreads, *nreads2;          // [max_amp]
+    double  *nfull_reads;               // [max_amp]; 0.5/read if paired.
+    double  *nrperc, *nrperc2;          // [max_amp]
+    int64_t *nbases, *nbases2;          // [max_amp]
+    int64_t *coverage;                  // [max_amp][max_amp_len]
+    double  (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
+    double  (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
+    khash_t(tcoord) **tcoord;           // [max_amp+1]
+
+    // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
+    int     (*amp_dist)[3];             // [max_amp][3];
+
+    int *depth_valid; // [max_len]
+    int *depth_all;   // [max_len]
+    khash_t(qname) *qend;  // queryname end, for overlap removal; keys are strdup()ed names
+} astats_t;
+
+// We can have multiple primers for LEFT / RIGHT, so this
+// permits detection by any compatible combination.
+// One reference:
+typedef struct { // Filled in by bed2amplicon()
+    int64_t left[MAX_PRIMER_PER_AMPLICON];  // BED end coordinate of each LEFT primer
+    int nleft;                              // number of entries used in left[]
+    int64_t right[MAX_PRIMER_PER_AMPLICON]; // BED start coordinate of each RIGHT primer
+    int nright;                             // number of entries used in right[]
+    int64_t max_left, min_right; // inner dimensions
+    int64_t min_left, max_right; // outer dimensions
+} amplicon_t;
+
+// Multiple references, we have an array of amplicons_t - one per used ref.
+// We have per reference local and global stats here, as some of the stats
+// are coordinate based.  However we report them combined together as a single
+// list across all references.
+// "namp" is the number of amplicons in this reference, but they're
+// numbered first_amp to first_amp+namp-1 inclusively.
+typedef struct { // One entry per reference; accumulate_stats() indexes the array by BAM tid
+    int tid, namp;
+    int64_t len;               // coordinate range covered; sizes the pos2start/pos2end tables
+    bed_entry_list_t *sites;   // BED primer entries; entries with NULL sites are skipped on reset
+    amplicon_t *amp;           // amplicons of this reference
+    astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
+    const char *ref;           // ref name (pointer to the bed hash table key)
+    int first_amp;             // first amplicon number for this ref
+} amplicons_t;
+
+// Reinitialised for each new reference/chromosome.
+// Counts from 1 to namp, -1 for no match and 0 for ?.
+static int *pos2start = NULL;
+static int *pos2end = NULL;
+static int pos2size = 0; // allocated size of pos2start/end
+
+// Lookup table to go from position to amplicon based on
+// read start / end.
+//
+// (Re)builds the global pos2start[] / pos2end[] tables for reference "ref".
+// Every 1-based position within args->max_delta of a LEFT primer coordinate
+// is mapped (at index position-1) to that amplicon's index in pos2start[],
+// and likewise for RIGHT primer coordinates in pos2end[].  All other
+// positions hold -1.  Where windows of different amplicons overlap, the
+// later amplicon wins.
+//
+// Returns 0 on success,
+//        -1 on memory allocation failure.
+static int initialise_amp_pos_lookup(astats_args_t *args,
+                                     amplicons_t *amps,
+                                     int ref) {
+    int64_t i, j;
+    amplicon_t *amp = amps[ref].amp;
+    int64_t max_len = amps[ref].len;
+    int namp = amps[ref].namp;
+
+    if (max_len+1 > pos2size) {
+        // Grow through a temporary so the existing table is neither leaked
+        // nor lost should realloc fail.
+        int *tmp;
+        if (!(tmp = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
+            return -1;
+        pos2start = tmp;
+        if (!(tmp = realloc(pos2end,   (max_len+1)*sizeof(*pos2end))))
+            return -1;
+        pos2end = tmp;
+        // Record the size actually allocated.  Storing max_len here made an
+        // identically sized follow-up request reallocate needlessly.
+        pos2size = max_len+1;
+    }
+    // Clear every allocated slot for this reference, including the spare
+    // final one, so no element is ever left uninitialised.
+    for (i = 0; i <= max_len; i++)
+        pos2start[i] = pos2end[i] = -1;
+
+    for (i = 0; i < namp; i++) {
+        for (j = 0; j < amp[i].nleft; j++) {
+            int64_t p;
+            for (p = amp[i].left[j] - args->max_delta;
+                 p <= amp[i].left[j] + args->max_delta; p++) {
+                // Ignore the part of the window that falls off the reference
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2start[p-1] = i;
+            }
+        }
+        for (j = 0; j < amp[i].nright; j++) {
+            int64_t p;
+            for (p = amp[i].right[j] - args->max_delta;
+                 p <= amp[i].right[j] + args->max_delta; p++) {
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2end[p-1] = i;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Returns the number of amplicons described by a list of BED primer sites.
+// Assumption: the BED file alternates between LEFT (forward) and RIGHT
+// (reverse) primers per amplicon, so every reverse -> forward switch in
+// orientation marks the start of another amplicon.
+static int count_amplicon(bed_entry_list_t *sites) {
+    int idx, prev_rev = 0;
+    int total = 1;  // the first amplicon is not preceded by a switch
+
+    for (idx = 0; idx < sites->length; idx++) {
+        if (prev_rev && sites->bp[idx].rev == 0)
+            total++;
+        prev_rev = sites->bp[idx].rev;
+    }
+
+    return total;
+}
+
+// We're only interested in the internal part of the amplicon.
+// Our bed file has LEFT start/end followed by RIGHT start/end,
+// so collapse these to LEFT end / RIGHT start.
+//
+// Fills in amp[] (one entry per amplicon) from the primer list in "sites",
+// stores the amplicon count in *namp, and writes the "AMPLICON" location
+// lines to args->out_fp, preceded by a title block when do_title is
+// non-zero.  "ref" and "first_amp" are only used in the multi-reference
+// output format.
+//
+// NOTE(review): nleft / nright are never reset here, so amp[] is presumably
+// zero-initialised by the caller -- confirm.
+//
+// Returns right most amplicon position on success,
+//         < 0 on error
+static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
+                            amplicon_t *amp, int *namp, int do_title,
+                            const char *ref, int first_amp) {
+    int i, j;
+    int64_t max_right = 0;
+    FILE *ofp = args->out_fp;
+
+    *namp = 0;
+
+    // Assume all primers for the same amplicon are adjacent in BED
+    // with all + followed by all -.  Thus - to + signifies next primer set.
+    int last_rev = 0;
+    amp[0].max_left = 0;
+    amp[0].min_right = INT64_MAX;
+    amp[0].min_left = INT64_MAX;
+    amp[0].max_right = 0;
+    if (do_title) {
+        fprintf(ofp, "# Amplicon locations from BED file.\n");
+        fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
+                "comma-separated for alt-primers.\n");
+        if (args->multi_ref)
+            fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
+        else
+            fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
+    }
+    for (i = j = 0; i < sites->length; i++) {
+        if (i == 0 && sites->bp[i].rev != 0) {
+            fprintf(stderr, "[ampliconstats] error: BED file should start"
+                    " with the + strand primer\n");
+            return -1;
+        }
+        if (sites->bp[i].rev == 0 && last_rev) {
+            // Reverse -> forward switch: start the next amplicon.
+            j++;
+            if (j >= args->max_amp) {
+                fprintf(stderr, "[ampliconstats] error: too many amplicons"
+                        " (%d). Use -a option to raise this.\n", j);
+                return -1;
+            }
+            amp[j].max_left = 0;
+            amp[j].min_right = INT64_MAX;
+            amp[j].min_left = INT64_MAX;
+            amp[j].max_right = 0;
+        }
+        if (sites->bp[i].rev == 0) {
+            if (i == 0 || last_rev) {
+                // First LEFT primer of an amplicon: open a new output line.
+                if (j>0) fprintf(ofp, "\n");
+                if (args->multi_ref)
+                    fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
+                else
+                    fprintf(ofp, "AMPLICON\t%d", j+1);
+            }
+            if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
+                // Not an errno failure, so use print_error() rather than
+                // print_error_errno(), which would append a stale strerror.
+                print_error("ampliconstats",
+                            "too many primers per amplicon (%d)",
+                            MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].left[amp[j].nleft++] = sites->bp[i].right;
+            if (amp[j].max_left < sites->bp[i].right+1)
+                amp[j].max_left = sites->bp[i].right+1;
+            if (amp[j].min_left > sites->bp[i].right+1)
+                amp[j].min_left = sites->bp[i].right+1;
+            // BED file, so left+1 as zero based. right(+1-1) as
+            // BED goes one beyond end (and we want inclusive range).
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        } else {
+            if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
+                print_error("ampliconstats",
+                            "too many primers per amplicon (%d)",
+                            MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].right[amp[j].nright++] = sites->bp[i].left;
+            if (amp[j].min_right > sites->bp[i].left-1)
+                amp[j].min_right = sites->bp[i].left-1;
+            if (amp[j].max_right < sites->bp[i].left-1) {
+                amp[j].max_right = sites->bp[i].left-1;
+                if (amp[j].max_right - amp[j].min_left + 1 >=
+                    args->max_amp_len) {
+                    fprintf(stderr, "[ampliconstats] error: amplicon "
+                            "longer (%d) than max_amp_len option (%d)\n",
+                            (int)(amp[j].max_right - amp[j].min_left + 2),
+                            args->max_amp_len);
+                    return -1;
+                }
+                if (max_right < amp[j].max_right)
+                    max_right = amp[j].max_right;
+            }
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        }
+        last_rev = sites->bp[i].rev;
+    }
+    if (last_rev != 1) {
+        fprintf(ofp, "\n"); // useful if going to stdout
+        fprintf(stderr, "[ampliconstats] error: bed file does not end on"
+                " a reverse strand primer.\n");
+        return -1;
+    }
+    *namp = ++j;
+    if (j) fprintf(ofp, "\n");
+
+    if (j >= args->max_amp) {
+        fprintf(stderr, "[ampliconstats] error: "
+                "too many amplicons (%d). Use -a option to raise this.\n", j);
+        return -1;
+    }
+
+//    for (i = 0; i < *namp; i++) {
+//      printf("%d\t%ld", i, amp[i].length);
+//      for (j = 0; j < amp[i].nleft; j++)
+//          printf("%c%ld", "\t,"[j>0], amp[i].left[j]);
+//      for (j = 0; j < amp[i].nright; j++)
+//          printf("%c%ld", "\t,"[j>0], amp[i].right[j]);
+//      printf("\n");
+//    }
+
+    return max_right;
+}
+
+// Releases a statistics block and everything it owns.
+// Accepts NULL, and also a partially constructed block as handed over by
+// the error path of stats_alloc(): members that were never allocated are
+// still NULL from the initial calloc.
+void stats_free(astats_t *st) {
+    if (!st)
+        return;
+
+    free(st->nreads);
+    free(st->nreads2);
+    free(st->nfull_reads);
+    free(st->nrperc);
+    free(st->nrperc2);
+    free(st->nbases);
+    free(st->nbases2);
+    free(st->coverage);
+    free(st->covered_perc);
+    free(st->covered_perc2);
+    free(st->amp_dist);
+
+    free(st->depth_valid);
+    free(st->depth_all);
+
+    if (st->tcoord) {
+        int i;
+        for (i = 0; i <= st->max_amp; i++) {
+            if (st->tcoord[i])
+                kh_destroy(tcoord, st->tcoord[i]);
+        }
+        free(st->tcoord);
+    }
+
+    // qend may still be NULL if stats_alloc() bailed out early, and
+    // kh_end() dereferences the hash pointer, so guard the walk.
+    if (st->qend) {
+        khiter_t k;
+        // Keys are strdup()ed read names owned by the hash.
+        for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+            if (kh_exist(st->qend, k))
+                free((void *)kh_key(st->qend, k));
+        kh_destroy(qname, st->qend);
+    }
+
+    free(st);
+}
+
+// Allocates a zeroed statistics block sized for max_amp amplicons of up to
+// max_amp_len bases each, plus per-position depth arrays of max_len entries.
+//
+// Returns the new block on success (release it with stats_free()),
+//         NULL on memory allocation failure.
+astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
+    astats_t *st = calloc(1, sizeof(*st));
+    if (!st)
+        return NULL;
+
+    // Create the read-name hash before anything that can "goto err", so
+    // that stats_free() on the error path always has a valid hash to walk.
+    if (!(st->qend = kh_init(qname))) {
+        free(st);
+        return NULL;
+    }
+
+    st->max_amp = max_amp;
+    st->max_amp_len = max_amp_len;
+    st->max_len = max_len;
+
+    if (!(st->nreads  = calloc(max_amp, sizeof(*st->nreads))))  goto err;
+    if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
+    if (!(st->nrperc  = calloc(max_amp, sizeof(*st->nrperc))))  goto err;
+    if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
+    if (!(st->nbases  = calloc(max_amp, sizeof(*st->nbases))))  goto err;
+    if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
+
+    if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
+        goto err;
+
+    // Widen before multiplying: both factors are int, and their product
+    // can exceed INT_MAX when the user raises the amplicon limits.
+    if (!(st->coverage = calloc((size_t)max_amp * max_amp_len,
+                                sizeof(*st->coverage))))
+        goto err;
+
+    if (!(st->covered_perc  = calloc(max_amp, sizeof(*st->covered_perc))))
+        goto err;
+    if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
+        goto err;
+
+    // One hash per amplicon plus one extra slot, hence max_amp+1.
+    if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
+    int i;
+    for (i = 0; i <= st->max_amp; i++)
+        if (!(st->tcoord[i] = kh_init(tcoord)))
+            goto err;
+
+    if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
+        goto err;
+    if (!(st->depth_all   = calloc(max_len, sizeof(*st->depth_all))))
+        goto err;
+
+    if (!(st->amp_dist  = calloc(max_amp, sizeof(*st->amp_dist))))  goto err;
+
+    return st;
+
+ err:
+    stats_free(st);
+    return NULL;
+}
+
+// Zeroes every counter in "st" so the block can be reused, while keeping
+// its allocations (sized by the max_amp / max_amp_len / max_len members).
+static void stats_reset(astats_t *st) {
+    st->nseq = 0;
+    st->nfiltered = 0;
+    st->nfailprimer = 0;
+
+    memset(st->nreads,  0, st->max_amp * sizeof(*st->nreads));
+    memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
+    memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
+
+    memset(st->nrperc,  0, st->max_amp * sizeof(*st->nrperc));
+    memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
+
+    memset(st->nbases,  0, st->max_amp * sizeof(*st->nbases));
+    memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
+
+    // Widen before multiplying: max_amp and max_amp_len are both int, and
+    // their product can exceed INT_MAX for large limits.
+    memset(st->coverage, 0, (size_t)st->max_amp * st->max_amp_len
+           * sizeof(*st->coverage));
+    memset(st->covered_perc,  0, st->max_amp * sizeof(*st->covered_perc));
+    memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
+
+    // Keep the allocated entries as it's likely all files will share
+    // the same keys.  Instead we reset counters to zero for common ones
+    // and delete rare ones.
+    int i;
+    for (i = 0; i <= st->max_amp; i++) {
+        khiter_t k;
+        for (k = kh_begin(st->tcoord[i]);
+             k != kh_end(st->tcoord[i]); k++)
+            if (kh_exist(st->tcoord[i], k)) {
+                if (kh_value(st->tcoord[i], k) < 5)
+                    kh_del(tcoord, st->tcoord[i], k);
+                else
+                    kh_value(st->tcoord[i], k) = 0;
+            }
+    }
+
+    // The read-name keys are strdup()ed copies, so free them before
+    // emptying the hash.
+    khiter_t k;
+    for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+        if (kh_exist(st->qend, k))
+            free((void *)kh_key(st->qend, k));
+    kh_clear(qname, st->qend);
+
+    memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
+    memset(st->depth_all,   0, st->max_len * sizeof(*st->depth_all));
+    memset(st->amp_dist,  0, st->max_amp * sizeof(*st->amp_dist));
+}
+
+// Resets the local statistics (lstats) of each of the nref references in
+// amps[] that has BED sites attached; references without sites are skipped.
+static void amp_stats_reset(amplicons_t *amps, int nref) {
+    int r = 0;
+    while (r < nref) {
+        if (amps[r].sites)
+            stats_reset(amps[r].lstats);
+        r++;
+    }
+}
+
+/*
+ * Accumulates one alignment record into the per-file (local) statistics
+ * of the reference it is aligned to, i.e. amps[b->core.tid].lstats.
+ *
+ * Depending on the record this updates nseq, nfiltered, nfailprimer, the
+ * per-amplicon nreads / nbases / coverage / nfull_reads / amp_dist
+ * counters, the per-base depth_all / depth_valid arrays and the template
+ * coordinate hashes (tcoord).
+ *
+ * Returns 0 on success (this includes records that are filtered out, or
+ * whose reference has no stats), and -1 if strdup() or the tcoord hash
+ * insertion fails.
+ *
+ * NOTE(review): b->core.tid indexes amps[] without a range check;
+ * presumably the caller guarantees 0 <= tid < nref -- confirm.
+ * NOTE(review): pos2start / pos2end are defined outside this function;
+ * they appear to map a reference position to an amplicon index with -1
+ * meaning "none" -- confirm against their definition.
+ */
+static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
+                            bam1_t *b) {
+    int ref = b->core.tid;
+    amplicon_t *amp = amps[ref].amp;
+    astats_t *stats = amps[ref].lstats;
+    int len = amps[ref].len;
+
+    if (!stats)
+        return 0;
+
+    // Every record seen counts towards nseq, even if filtered below.
+    stats->nseq++;
+    if ((b->core.flag & args->flag_require) != args->flag_require ||
+        (b->core.flag & args->flag_filter)  != 0) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    int64_t start = b->core.pos, mstart = start; // modified start
+    int64_t end = bam_endpos(b), i;
+
+    // Compute all-template-depth and valid-template-depth.
+    // We track current end location per read name so we can remove overlaps.
+    // Potentially we could use this data for a better amplicon-depth
+    // count too, but for now it's purely for the per-base plots.
+    int ret;
+    khiter_t k;
+    int prev_start = 0, prev_end = 0;
+    if ((b->core.flag & BAM_FPAIRED)
+        && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
+        // qend maps read name -> (start in low 32 bits, end in high 32 bits).
+        // NOTE(review): only ret == 0 is tested; a failing kh_put
+        // (ret < 0) would take the else branch -- confirm this is handled.
+        k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
+        if (ret == 0) {
+            // Name already present: this is the second read of the pair.
+            // Start counting depth after the first read's end and drop
+            // the hash entry.
+            prev_start = kh_value(stats->qend, k) & 0xffffffff;
+            prev_end = kh_value(stats->qend, k)>>32;
+            mstart = MAX(mstart, prev_end);
+            // Ideally we'd reuse strings so we don't thrash free/malloc.
+            // However let's see if the official way of doing that (malloc
+            // itself) is fast enough first.
+            free((void *)kh_key(stats->qend, k));
+            kh_del(qname, stats->qend, k);
+            //fprintf(stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
+        } else {
+            // First read of the pair: the hash must own its key, so
+            // replace the borrowed qname pointer with a private copy.
+            if (!(kh_key(stats->qend, k) = strdup(bam_get_qname(b))))
+                return -1;
+
+            kh_value(stats->qend, k) = start | (end << 32);
+        }
+    }
+    // Depth for all reads, limited to the reference length.
+    for (i = mstart; i < end && i < len; i++)
+        stats->depth_all[i]++;
+    if (i < end) {
+        print_error("ampliconstats", "record %s overhangs end of reference",
+                    bam_get_qname(b));
+        // But keep going, as it's harmless.
+    }
+
+    // On single ended runs, eg ONT or PacBio, we just use the start/end
+    // of the template to assign.
+    // Reverse-strand or unpaired reads are assigned by their last aligned
+    // base (pos2end), forward paired reads by their first (pos2start).
+    int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
+        ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
+        : (start >= 0 && start < len ? pos2start[start] : -1);
+
+    // ivar sometimes soft-clips 100% of the bases.
+    // This is essentially unmapped
+    if (end == start && (args->flag_filter & BAM_FUNMAP)) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    if (anum == -1)
+        stats->nfailprimer++;
+
+    if (anum >= 0) {
+        // c is the number of reference bases shared between the read and
+        // the amplicon interior [max_left, min_right].
+        int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
+        if (c > 0) {
+            stats->nreads[anum]++;
+            // NB: ref bases rather than read bases
+            stats->nbases[anum] += c;
+
+            int64_t i;
+            // Note start/end are clamped to the reference only on this path.
+            if (start < 0) start = 0;
+            if (end > len) end = len;
+
+            // coverage is a flattened [amplicon][offset] array with row
+            // stride max_amp_len; offset 0 corresponds to min_left-1.
+            int64_t ostart = MAX(start, amp[anum].min_left-1);
+            int64_t oend = MIN(end, amp[anum].max_right);
+            int64_t offset = amp[anum].min_left-1;
+            for (i = ostart; i < oend; i++)
+                stats->coverage[anum*stats->max_amp_len + i-offset]++;
+        } else {
+            stats->nfailprimer++;
+        }
+    }
+
+    // Template length in terms of amplicon number to amplicon number.
+    // We expect left to right of same amplicon (len 0), but it may go
+    // to next amplicon (len 1) or prev (len -1), etc.
+    int64_t t_end;
+    int oth_anum = -1;
+
+    if (b->core.flag & BAM_FPAIRED) {
+        t_end = (b->core.flag & BAM_FREVERSE ? end : start)
+            + b->core.isize;
+
+        // If we've clipped the primers but not followed up with a fixmates
+        // then our start+TLEN will take us to a location which is
+        // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
+        //
+        // The correct solution is to run samtools fixmate so TLEN is correct.
+        // The hacky solution is to fudge the expected tlen by double the
+        // average primer length (e.g. 50).
+        t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
+
+        if (t_end > 0 && t_end < len && b->core.isize != 0)
+            oth_anum = (b->core.flag & BAM_FREVERSE)
+                ? pos2start[t_end]
+                : pos2end[t_end];
+    } else {
+        // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
+        // NOTE(review): unlike the anum lookup above, start is not
+        // range-checked against len before indexing pos2start here.
+        oth_anum = pos2start[start];
+        t_end = end;
+    }
+
+    // We don't want to count our pairs twice.
+    // If both left/right are known, count it on left only.
+    // If only one is known, we'll only get to this code once
+    // so we can also count it.
+    // astatus: 0 = both ends in the same amplicon, 1 = different
+    // amplicons, 2 = other end not assigned to an amplicon.
+    int astatus = 2;
+    if (anum != -1 && oth_anum != -1) {
+        astatus = oth_anum == anum ? 0 : 1;
+        if (start <= t_end)
+            stats->amp_dist[anum][astatus]++;
+    } else if (anum >= 0) {
+        stats->amp_dist[anum][astatus = 2]++;
+    }
+
+    if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+        if (prev_end && mstart > prev_end) {
+            // 2nd read with gap to 1st; undo previous increment.
+            for (i = prev_start; i < prev_end; i++)
+                stats->depth_valid[i]--;
+            stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        } else {
+            // 1st read, or 2nd read that overlaps 1st
+            // NOTE(review): end is only clamped to len inside the c > 0
+            // branch above, and this loop has no "i < len" bound like the
+            // depth_all loop does -- confirm depth_valid cannot be overrun.
+            for (i = mstart; i < end; i++)
+                stats->depth_valid[i]++;
+            stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        }
+    }
+
+    // Track template start,end frequencies, so we can give stats on
+    // amplicon primer usage.
+    if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
+        // left to right only, so we don't double count template positions.
+        return 0;
+
+    start = b->core.pos;
+    t_end = b->core.flag & BAM_FPAIRED
+        ? start + b->core.isize-1
+        : end;
+    // Hash key: 1-based start in the low 32 bits, t_end+1 in the high 32
+    // bits, each capped at UINT32_MAX.
+    uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
+    // tcoord[] is indexed by anum+1, so slot 0 collects anum == -1.
+    k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
+    if (ret < 0)
+        return -1;
+    if (ret == 0)
+        kh_value(stats->tcoord[anum+1], k)++;
+    else
+        kh_value(stats->tcoord[anum+1], k)=1;
+    // Hash value: count in the low 32 bits, astatus OR-ed into the high 32.
+    kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
+
+    return 0;
+}
+
+/*
+ * Folds one file's local statistics (lstats) into the running global
+ * totals (gstats) of the same reference.
+ *
+ * lstats    per-file statistics to add
+ * gstats    global statistics that receive the sums
+ * namp      number of amplicons on this reference
+ * all_nseq  denominator for the per-amplicon read percentage; when it is
+ *           zero the percentage is taken as 0
+ *
+ * Returns 0 on success, -1 if inserting into a global tcoord hash fails.
+ */
+int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
+    int slot, pos, d;
+
+    gstats->nseq        += lstats->nseq;
+    gstats->nfiltered   += lstats->nfiltered;
+    gstats->nfailprimer += lstats->nfailprimer;
+
+    // tcoord[] is indexed by amplicon number + 1: slot 0 holds amplicon
+    // index -1 and has a hash only, no per-amplicon counters.
+    for (slot = 0; slot <= namp; slot++) {
+        khiter_t li, gi;
+
+        // Merge the local hash for this slot into the global one.
+        for (li = kh_begin(lstats->tcoord[slot]);
+             li != kh_end(lstats->tcoord[slot]); li++) {
+            int absent;
+
+            if (!kh_exist(lstats->tcoord[slot], li))
+                continue;
+            if (kh_value(lstats->tcoord[slot], li) == 0)
+                continue;
+
+            gi = kh_put(tcoord, gstats->tcoord[slot],
+                        kh_key(lstats->tcoord[slot], li), &absent);
+            if (absent < 0)
+                return -1;
+
+            if (absent == 0) {
+                // Key already known globally: keep only the low 32 bits
+                // of the global value and add the full local value.
+                kh_value(gstats->tcoord[slot], gi) =
+                    (kh_value(gstats->tcoord[slot], gi) & 0xFFFFFFFF)
+                    + kh_value(lstats->tcoord[slot], li);
+            } else {
+                // New key: start from the local value.
+                kh_value(gstats->tcoord[slot], gi) =
+                    kh_value(lstats->tcoord[slot], li);
+            }
+        }
+
+        if (slot == 0)
+            continue;
+
+        const int a = slot - 1;
+
+        gstats->nreads[a]      += lstats->nreads[a];
+        gstats->nreads2[a]     += lstats->nreads[a] * lstats->nreads[a];
+        gstats->nfull_reads[a] += lstats->nfull_reads[a];
+
+        // Mean & sd of the amplicon read percentage need the division
+        // done per sample, because nseq differs between samples.
+        double perc = 0;
+        if (all_nseq)
+            perc = 100.0 * lstats->nreads[a] / all_nseq;
+        gstats->nrperc[a]  += perc;
+        gstats->nrperc2[a] += perc*perc;
+
+        gstats->nbases[a]  += lstats->nbases[a];
+        gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
+
+        // Sum and sum of squares of the coverage percentage per depth.
+        for (d = 0; d < MAX_DEPTH; d++) {
+            gstats->covered_perc[a][d]  += lstats->covered_perc[a][d];
+            gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
+                                         * lstats->covered_perc[a][d];
+        }
+
+        // The three amplicon status classes.
+        gstats->amp_dist[a][0] += lstats->amp_dist[a][0];
+        gstats->amp_dist[a][1] += lstats->amp_dist[a][1];
+        gstats->amp_dist[a][2] += lstats->amp_dist[a][2];
+    }
+
+    // Per-base depths.
+    for (pos = 0; pos < lstats->max_len; pos++) {
+        gstats->depth_valid[pos] += lstats->depth_valid[pos];
+        gstats->depth_all[pos]   += lstats->depth_all[pos];
+    }
+
+    return 0;
+}
+
+/*
+ * Appends the local statistics of every reference that has amplicon
+ * sites to that reference's global statistics.
+ *
+ * The number of matching sequences (nseq - nfiltered - nfailprimer) is
+ * first summed over all such references and then handed to
+ * append_lstats() for each of them.
+ *
+ * Returns 0 on success, -1 as soon as append_lstats() fails.
+ */
+int append_stats(amplicons_t *amps, int nref) {
+    int ref, total_matching = 0;
+
+    // Pass 1: total matching sequences across all references.
+    for (ref = 0; ref < nref; ref++) {
+        if (amps[ref].sites) {
+            astats_t *ls = amps[ref].lstats;
+            total_matching += ls->nseq - ls->nfiltered - ls->nfailprimer;
+        }
+    }
+
+    // Pass 2: merge local into global, reference by reference.
+    for (ref = 0; ref < nref; ref++) {
+        if (!amps[ref].sites)
+            continue;
+
+        int rc = append_lstats(amps[ref].lstats, amps[ref].gstats,
+                               amps[ref].namp, total_matching);
+        if (rc < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+// One template coordinate record: start and end position, how often the
+// pair was seen (freq) and its amplicon status code.
+typedef struct {
+    int32_t start, end;
+    uint32_t freq;
+    uint32_t status;
+} tcoord_t;
+
+// qsort() comparator for tcoord_t: descending frequency, then ascending
+// start, then ascending end.
+//
+// Explicit comparisons are used instead of returning a difference.
+// Subtracting the unsigned 32-bit frequencies yields the wrong sign once
+// they differ by more than INT_MAX, and subtracting the signed
+// coordinates can overflow.
+//
+// Returns a negative value if *vp1 sorts first, positive if *vp2 sorts
+// first and 0 if all three fields are equal.
+static int tcoord_freq_sort(const void *vp1, const void *vp2) {
+    const tcoord_t *t1 = (const tcoord_t *)vp1;
+    const tcoord_t *t2 = (const tcoord_t *)vp2;
+
+    // Higher frequency first.
+    if (t1->freq != t2->freq)
+        return t1->freq < t2->freq ? 1 : -1;
+
+    if (t1->start != t2->start)
+        return t1->start < t2->start ? -1 : 1;
+
+    if (t1->end != t2->end)
+        return t1->end < t2->end ? -1 : 1;
+
+    return 0;
+}
+
+
+/*
+ * Merges tcoord start,end,freq,status tuples if their coordinates are
+ * close together.  We aim to keep the start,end for the most frequent
+ * value and assume that is the correct coordinate and all others are
+ * minor fluctuations due to errors or variants.
+ *
+ * We sort by frequency first and then merge later items in the list into
+ * the earlier more frequent ones.  It's O(N^2), but sufficient for now
+ * given current scale of projects.
+ *
+ * If we ever need to resolve that then consider sorting by start
+ * coordinate and scanning the list to find all items within X, find
+ * the most frequent of those, and then cluster that way.  (I'd have
+ * done that had I thought of it at the time!)
+ *
+ * args  supplies tcoord_bin, the clustering distance
+ * tpos  array of tuples; sorted and compacted in place
+ * np    in: number of entries in tpos; out: number of entries kept
+ */
+static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
+    size_t n = *np, j, j2, j3, k;
+
+    // Sort by frequency and cluster infrequent coords into frequent
+    // ones provided they're close by.
+    // This is O(N^2), but we've already binned by tcoord_bin/2 so
+    // the list isn't intended to be vast at this point.
+    qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
+
+    // For frequency ties, find mid start coord, and then find mid end
+    // coord of those matching start.
+    // We make that the first item so we merge into that mid point.
+    for (j = 0; j < n; j++) {
+        // Extend j2 over the run of entries with the same frequency as
+        // tpos[j] whose start is less than tcoord_bin above tpos[j].start.
+        for (j2 = j+1; j2 < n; j2++) {
+            if (tpos[j].freq != tpos[j2].freq)
+                break;
+            if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
+                break;
+        }
+
+        // j to j2 all within bin of a common start,
+        // m is the mid start.
+        if (j2-1 > j) {
+            size_t m = (j2-1 + j)/2;
+
+            // Find mid end for this same start
+            // NOTE(review): the guard is "m > 1", not "m > j", so m can
+            // step back past j into earlier entries that share the same
+            // start -- confirm whether that is intended.
+            while (m > 1 && tpos[m].start == tpos[m-1].start)
+                m--;
+            // NOTE(review): after the sort, ends ascend within equal
+            // freq/start, so tpos[m].end - tpos[j3].end looks like it is
+            // never positive here -- confirm the operand order.
+            for (j3 = m+1; j3 < j2; j3++) {
+                if (tpos[m].start != tpos[j3].start)
+                    break;
+                if (tpos[m].end - tpos[j3].end >= args->tcoord_bin)
+                    break;
+            }
+            if (j3-1 > m)
+                m = (j3-1 + m)/2;
+
+            // Swap with first item.
+            tcoord_t tmp = tpos[j];
+            tpos[j] = tpos[m];
+            tpos[m] = tmp;
+            // Skip the rest of this run; the loop's j++ moves on to j2.
+            j = j2-1;
+        }
+    }
+
+    // Now merge in coordinates.
+    // This bit is O(N^2), so consider binning first to reduce the
+    // size of the list if we have excessive positional variation.
+    // k is the write index for kept entries, j the read index.
+    for (k = j = 0; j < n; j++) {
+        // freq == 0 marks an entry already merged into an earlier one.
+        if (!tpos[j].freq)
+            continue;
+
+        if (k < j)
+            tpos[k] = tpos[j];
+
+        // Absorb every later entry with the same status whose start and
+        // end are both within tcoord_bin/2 of this one.
+        for (j2 = j+1; j2 < n; j2++) {
+            if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
+                ABS(tpos[j].end  -tpos[j2].end)  < args->tcoord_bin/2 &&
+                tpos[j].status == tpos[j2].status) {
+                tpos[k].freq += tpos[j2].freq;
+                tpos[j2].freq = 0;
+            }
+        }
+        k++;
+    }
+
+    *np = k;
+}
+
+/*
+ * Writes the statistics report to args->out_fp.
+ *
+ * args   options: output stream, min_depth[] thresholds, depth_bin,
+ *        tcoord_bin, tcoord_min_count and multi_ref
+ * type   single character prefixed to every record tag (e.g. "%cSS");
+ *        'F' and 'C' additionally select the per-file coverage block and
+ *        the combined MEAN / STDDEV blocks respectively
+ * name   name printed in the second column of each record
+ * nfile  number of files, used as the divisor for MEAN / STDDEV
+ * amps   per-reference amplicon data, nref entries; references without
+ *        sites are skipped
+ * local  non-zero to report amps[r].lstats, zero for amps[r].gstats
+ *
+ * For type 'F' the computed coverage percentages are also stored into
+ * stats->covered_perc.
+ *
+ * Returns 0 on success, -1 on allocation failure or when an amplicon is
+ * longer than stats->max_amp_len.
+ */
+int dump_stats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref, int local) {
+    int i, r;
+    FILE *ofp = args->out_fp;
+    tcoord_t *tpos = NULL;
+    size_t ntcoord = 0;
+
+    // summary stats for this sample (or for all samples)
+    fprintf(ofp, "# Summary stats.\n");
+    fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
+        // Room for "name", a tab, "ref" and the terminating NUL.
+        char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
+        if (!name_ref)
+            return -1;
+        if (args->multi_ref)
+            sprintf(name_ref, "%s\t%s", name, amps[r].ref);
+        else
+            sprintf(name_ref, "%s", name);
+        fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
+                type, name_ref, stats->nseq);
+        fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
+                type, name_ref, stats->nfiltered);
+        fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
+                type, name_ref, stats->nfailprimer);
+        fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
+                type, name_ref, nmatch);
+
+        int d = 0;
+        do {
+            // From first to last amplicon only, so not entire consensus.
+            // If contig length is known, maybe we want to add the missing
+            // count to < DEPTH figures?
+            int64_t start = 0, covered = 0, total = 0;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t j, offset = amp[i].min_left-1;
+                if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                    // Report the limit that was actually exceeded
+                    // (max_amp_len, not the amplicon count max_amp).
+                    fprintf(stderr, "[ampliconstats] error: "
+                            "Maximum amplicon length (%d) exceeded for '%s'\n",
+                            (int)stats->max_amp_len, name);
+                    // Don't leak the label on the error path.
+                    free(name_ref);
+                    return -1;
+                }
+                // "start" tracks the end of the previous amplicon so that
+                // overlapping bases are only counted once.
+                for (j = MAX(start, amp[i].max_left-1);
+                     j < MAX(start, amp[i].min_right); j++) {
+                    if (stats->coverage[i*stats->max_amp_len + j-offset]
+                        >= args->min_depth[d])
+                        covered++;
+                    total++;
+                }
+                start = MAX(start, amp[i].min_right);
+            }
+            fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
+                    PRId64"\t%"PRId64"\n", type, name_ref,
+                    args->min_depth[d], args->min_depth[d],
+                    total-covered, covered);
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+        free(name_ref);
+    }
+
+    // Read count
+    fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cREADS\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    // Valid depth is the number of full length reads (already divided
+    // by the number we expect to cover), so +0.5 per read in pair.
+    // A.k.a "usable depth" in the plots.
+    fprintf(ofp, "%cVDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++)
+            fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CREADS\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CREADS\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                double n1 = stats->nreads[i];
+                fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
+                        ? sqrt(stats->nreads2[i]/(double)nfile
+                               - (n1/nfile)*(n1/nfile))
+                        : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
+    fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cRPERC\t%s", type, name);
+    // Matching sequences summed over all references with sites.
+    int all_nseq = 0;
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
+    }
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            if (type == 'C') {
+                fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
+            } else {
+                fprintf(ofp, "\t%.3f",
+                        all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
+            }
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we compute mean and standard deviation too
+        fprintf(ofp, "CRPERC\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CRPERC\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                // variance = SUM(X^2)/N - (SUM(X)/N)^2
+                double n1 = stats->nrperc[i];
+                double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Base depth
+    fprintf(ofp, "# Read depth per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        amplicon_t *amp = amps[r].amp;
+        for (i = 0; i < amps[r].namp; i++) {
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            int64_t alen = amp[i].min_right - amp[i].max_left+1;
+            fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CDEPTH\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CDEPTH\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                double alen = amp[i].min_right - amp[i].max_left+1;
+                double n1 = stats->nbases[i] / alen;
+                double v = stats->nbases2[i] / (alen*alen) /nfile
+                    - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Percent Coverage
+    if (type == 'F') {
+        fprintf(ofp, "# Percentage coverage per amplicon\n");
+        fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
+        int d = 0;
+        do {
+            fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
+
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                amplicon_t *amp = amps[r].amp;
+                for (i = 0; i < amps[r].namp; i++) {
+                    int covered = 0;
+                    if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                        // Report the limit that was actually exceeded
+                        // (max_amp_len, not the amplicon count max_amp).
+                        fprintf(stderr, "[ampliconstats] error: "
+                                "Maximum amplicon length (%d) exceeded for '%s'\n",
+                                (int)stats->max_amp_len, name);
+                        return -1;
+                    }
+                    int64_t j, offset = amp[i].min_left-1;
+                    for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
+                        int apos = i*stats->max_amp_len + j-offset;
+                        if (stats->coverage[apos] >= args->min_depth[d])
+                            covered++;
+                    }
+                    int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                    // Stored as well as printed.
+                    stats->covered_perc[i][d] = 100.0 * covered / alen;
+                    fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+    } else if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        int d = 0;
+        do {
+            fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
+                }
+            }
+            fprintf(ofp, "\n");
+
+            fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    double n1 = stats->covered_perc[i][d] / nfile;
+                    double v = stats->covered_perc2[i][d] / nfile - n1*n1;
+                    fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+    }
+
+    // Plus base depth for all reads, irrespective of amplicon.
+    // This is post overlap removal, if reads in the read-pair overlap.
+    fprintf(ofp, "# Depth per reference base for ALL data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
+            type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_ALL\t%s", type, name);
+
+        for (i = 0; i < amps[r].len; i++) {
+            // Basic run-length encoding provided all values are within
+            // +- depth_bin fraction of the mid-point.
+            int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_all[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            // Emit "mid-point depth,run length" and continue after the run.
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // And depth for only reads matching to a single amplicon for full
+    // length.  This is post read overlap removal.
+    fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
+            "part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_VALID\t%s", type, name);
+
+        // Same run-length encoding as for DP_ALL above.
+        for (i = 0; i < amps[r].len; i++) {
+            int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_valid[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // TCOORD (start to end) distribution
+    fprintf(ofp, "# Distribution of aligned template coordinates.\n");
+    fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        // With a single reference the loop starts at -1, so tcoord slot 0
+        // is reported as well.
+        for (i = 0 - (nref==1); i < amps[r].namp; i++) {
+            // Grow the scratch array to the largest hash seen so far.
+            if (ntcoord < kh_size(stats->tcoord[i+1])) {
+                ntcoord = kh_size(stats->tcoord[i+1]);
+                tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
+                if (!tmp) {
+                    free(tpos);
+                    return -1;
+                }
+                tpos = tmp;
+            }
+
+            khiter_t k;
+            size_t n = 0, j;
+            for (k = kh_begin(stats->tcoord[i+1]);
+                 k != kh_end(stats->tcoord[i+1]); k++) {
+                if (!kh_exist(stats->tcoord[i+1], k) ||
+                    (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
+                    continue;
+                // Key is start,end in 32-bit quantities.
+                // Yes this limits us to 4Gb references, but just how
+                // many primers are we planning on making?  Not that many
+                // I hope.
+                tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
+
+                // Value is frequency (bottom 32-bits) and status (top 32).
+                tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
+                n++;
+            }
+
+            if (args->tcoord_bin > 1)
+                aggregate_tcoord(args, tpos, &n);
+
+            fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
+                    i+1+amps[r].first_amp); // per amplicon
+            for (j = 0; j < n; j++) {
+                if (tpos[j].freq < args->tcoord_min_count)
+                    continue;
+                fprintf(ofp, "\t%d,%d,%u,%u",
+                        tpos[j].start,
+                        tpos[j].end,
+                        tpos[j].freq,
+                        tpos[j].status);
+            }
+            fprintf(ofp, "\n");
+        }
+    }
+
+
+    // AMP length distribution.
+    // 0 = both ends in this amplicon
+    // 1 = ends in different amplicons
+    // 2 = other end matching an unknown amplicon site
+    //     (see tcoord for further analysis of where)
+    fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
+    fprintf(ofp, "# number with both primers from this amplicon, number with\n");
+    fprintf(ofp, "# primers from different amplicon, and number with a position\n");
+    fprintf(ofp, "# not matching any valid amplicon primer site\n");
+    fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
+
+    fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
+    int amp_dist[3] = {0};
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
+            amp_dist[0] += stats->amp_dist[i][0];
+            amp_dist[1] += stats->amp_dist[i][1];
+            amp_dist[2] += stats->amp_dist[i][2];
+        }
+    }
+    fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            // per amplicon
+            fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
+            fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
+                    stats->amp_dist[i][1], stats->amp_dist[i][2]);
+        }
+    }
+
+    free(tpos);
+    return 0;
+}
+
+// Writes the report for the per-file ("local") statistics: forwards every
+// argument unchanged to dump_stats() with its final selector set to 1, which
+// makes dump_stats() read amps[].lstats rather than amps[].gstats.
+// Returns whatever dump_stats() returns.
+int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref) {
+    return dump_stats(args, type, name, nfile, amps, nref, 1);
+}
+
+// Writes the report for the combined ("global") statistics: forwards every
+// argument unchanged to dump_stats() with its final selector set to 0, which
+// makes dump_stats() read amps[].gstats rather than amps[].lstats.
+// Returns whatever dump_stats() returns.
+int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref) {
+    return dump_stats(args, type, name, nfile, amps, nref, 0);
+}
+
+// Fetches the SM (sample) tag of an @RG header line.  When RG is non-NULL
+// the line is selected by its ID; when RG is NULL no ID key is passed to the
+// lookup.  Returns the buffer filled in by sam_hdr_find_tag_id(), which is
+// still NULL if the lookup wrote nothing.  The caller frees the result.
+char const *get_sample_name(sam_hdr_t *header, char *RG) {
+    kstring_t sample = {0};
+    const char *id_key = NULL;
+
+    if (RG)
+        id_key = "ID";
+
+    // The status code is not inspected; a NULL sample.s is the only
+    // failure signal handed back to the caller.
+    sam_hdr_find_tag_id(header, "RG", id_key, RG, "SM", &sample);
+
+    return sample.s;
+}
+
+// With SQ set: length of the reference named SQ, or -1 when the header has
+// no reference of that name.
+// With SQ NULL: the largest reference length found in the header (0 when
+// the header lists no references).
+int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
+    if (!SQ) {
+        int64_t longest = 0;
+        int total = sam_hdr_nref(header), t;
+
+        for (t = 0; t < total; t++) {
+            int64_t cur = sam_hdr_tid2len(header, t);
+            longest = (longest < cur) ? cur : longest;
+        }
+        return longest;
+    }
+
+    int tid = sam_hdr_name2tid(header, SQ);
+    if (tid < 0)
+        return -1;
+
+    return sam_hdr_tid2len(header, tid);
+}
+
+// Runs ampliconstats over filec input files.
+//
+// The first file's header is read once up front to size the per-reference
+// amps[] array, to match BED references against the header's references and
+// to print the "SS" summary lines.  Each file is then read in turn: its
+// records are passed to accumulate_stats(), the per-file ('F') report is
+// written with dump_lstats() and append_stats() is called (presumably to
+// fold the per-file numbers into the global ones).  Finally the combined
+// ('C') report is written with dump_gstats().
+//
+// Returns 0 on success and -1 on failure.  All resources acquired here,
+// plus the pos2start/pos2end lookup tables, are released before returning.
+static int amplicon_stats(astats_args_t *args,
+                          khash_t(bed_list_hash) *bed_hash,
+                          char **filev, int filec) {
+    int i, ref = -1, ref_tid = -1, ret = -1, nref = 0;
+    samFile *fp = NULL;
+    sam_hdr_t *header = NULL;
+    bam1_t *b = bam_init1();
+    FILE *ofp = args->out_fp;
+    char sname_[8192], *sname = NULL;
+    amplicons_t *amps = NULL;
+
+    // Report initial SS header.  We gather data from the bed_hash entries
+    // as well as from the first SAM header (with the requirement that all
+    // headers should be compatible).
+    if (filec) {
+        if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[0]);
+            goto err;
+        }
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (!amps) {
+            amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps));
+            if (!amps)
+                goto err;
+            fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
+            fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
+            fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
+            fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
+
+            // Note: order of hash entries will be different to order of
+            // BED file which may also differ to order of SQ headers.
+            // SQ header is canonical ordering (pos sorted file).
+            khiter_t k;
+            int bam_nref = sam_hdr_nref(header);
+            for (i = 0; i < bam_nref; i++) {
+                k = kh_get(bed_list_hash, bed_hash,
+                           sam_hdr_tid2name(header, i));
+                // kh_get() reports a missing key as kh_end(); that slot is
+                // outside the table, so it must not be given to kh_exist().
+                if (k == kh_end(bed_hash))
+                    continue;
+
+                bed_entry_list_t *sites = &kh_value(bed_hash, k);
+
+                ref = i;
+                amps[ref].ref = kh_key(bed_hash, k);
+                amps[ref].sites = sites;
+                amps[ref].namp = count_amplicon(sites);
+                amps[ref].amp  = calloc(sites->length,
+                                        sizeof(*amps[ref].amp));
+                if (!amps[ref].amp)
+                    goto err;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
+                            kh_key(bed_hash, k), amps[ref].namp);
+                else
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
+                            amps[ref].namp);
+
+                amps[ref].tid = ref;
+                if (ref_tid == -1)
+                    ref_tid = ref;
+
+                int64_t len = get_ref_len(header, kh_key(bed_hash, k));
+                amps[ref].len = len;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
+                            kh_key(bed_hash, k), len);
+                else
+                    fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
+                            len);
+
+                amps[ref].lstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                amps[ref].gstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                if (!amps[ref].lstats || !amps[ref].gstats)
+                    goto err;
+            }
+        }
+
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+    }
+    fprintf(ofp, "SS\tEnd of summary\n");
+
+    // Extract the bits of amplicon data we need from bed hash and turn
+    // it into a position-to-amplicon lookup table.
+    int offset = 0;
+    for (i = 0; i < nref; i++) {
+        if (!amps[i].sites)
+            continue;
+
+        amps[i].first_amp = offset;
+        if (bed2amplicon(args, amps[i].sites, amps[i].amp,
+                         &amps[i].namp, i==0, amps[i].ref, offset) < 0)
+            goto err;
+
+        offset += amps[i].namp; // cumulative amplicon number across refs
+    }
+
+    // Now iterate over file contents, one at a time.
+    for (i = 0; i < filec; i++) {
+        char *nstart = filev[i];
+
+        fp = sam_open_format(filev[i], "r", &args->ga.in);
+        if (!fp) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[i]);
+            goto err;
+        }
+
+        if (args->ga.nthreads > 0)
+            hts_set_threads(fp, args->ga.nthreads);
+
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (nref != sam_hdr_nref(header)) {
+            print_error_errno("ampliconstats",
+                              "SAM headers are not consistent across input files");
+            goto err;
+        }
+        int r;
+        for (r = 0; r < nref; r++) {
+            // NOTE(review): a reference that is absent from the BED file has
+            // amps[r].ref == NULL and is therefore rejected here too;
+            // confirm that this is intended.
+            if (!amps[r].ref ||
+                strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
+                amps[r].len != sam_hdr_tid2len(header, r)) {
+                print_error_errno("ampliconstats",
+                                  "SAM headers are not consistent across "
+                                  "input files");
+                goto err;
+            }
+        }
+
+        if (args->use_sample_name)
+            sname = (char *)get_sample_name(header, NULL);
+
+        if (!sname) {
+            // Fall back to the file's base name with any directory part
+            // and a trailing .bam/.sam/.cram removed, truncated to fit.
+            sname = sname_;
+            char *nend = filev[i] + strlen(filev[i]), *cp;
+            if ((cp = strrchr(filev[i], '/')))
+                nstart = cp+1;
+            if ((cp = strrchr(nstart, '.')) &&
+                (strcmp(cp, ".bam") == 0 ||
+                 strcmp(cp, ".sam") == 0 ||
+                 strcmp(cp, ".cram") == 0))
+                nend = cp;
+            if (nend - nstart >= 8192) nend = nstart+8191;
+            memcpy(sname, nstart, nend-nstart);
+            sname[nend-nstart] = 0;
+        }
+
+        // Stats local to this sample only
+        amp_stats_reset(amps, nref);
+
+        int last_ref = -9;
+        while ((r = sam_read1(fp, header, b)) >= 0) {
+            // Other filter options useful here?
+            if (b->core.tid < 0)
+                continue;
+
+            // Rebuild the position lookup whenever the reference changes.
+            if (last_ref != b->core.tid) {
+                last_ref  = b->core.tid;
+                if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
+                    goto err;
+            }
+
+            if (accumulate_stats(args, amps, b) < 0)
+                goto err;
+        }
+
+        // -1 is the normal end-of-file result; anything lower is an error.
+        if (r < -1) {
+            print_error_errno("ampliconstats", "Fail reading record");
+            goto err;
+        }
+
+        // Clear header immediately so that a failing sam_close() below
+        // cannot make the error path destroy the same header twice.
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+
+        if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
+            goto err;
+
+        if (append_stats(amps, nref) < 0)
+            goto err;
+
+        if (sname && sname != sname_)
+            free(sname);
+        sname = NULL;
+    }
+
+    if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    bam_destroy1(b);
+    if (ret) {
+        if (header)
+            sam_hdr_destroy(header);
+        if (fp)
+            sam_close(fp);
+    }
+    // amps is NULL if its allocation failed even though nref is already set.
+    if (amps) {
+        for (i = 0; i < nref; i++) {
+            stats_free(amps[i].lstats);
+            stats_free(amps[i].gstats);
+            free(amps[i].amp);
+        }
+    }
+    free(amps);
+    free(pos2start);
+    free(pos2end);
+    if (ret) {
+        if (sname && sname != sname_)
+            free(sname);
+    }
+
+    return ret;
+}
+
+// Prints the ampliconstats help text to fp and returns exit_status
+// unchanged, so callers can write "return usage(...)".  Bracketed defaults
+// come either from args or from the compile-time constants.
+static int usage(astats_args_t *args, FILE *fp, int exit_status) {
+    fputs("\n"
+          "Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
+          "\n"
+          "Options:\n",
+          fp);
+
+    // Options whose default value is interpolated into the text.
+    fprintf(fp,
+            "  -f, --required-flag STR|INT\n"
+            "               Only include reads with all of the FLAGs present [0x%X]\n",
+            args->flag_require);
+    fprintf(fp,
+            "  -F, --filter-flag STR|INT\n"
+            "               Only include reads with none of the FLAGs present [0x%X]\n",
+            args->flag_filter & 0xffff);
+    fprintf(fp,
+            "  -a, --max-amplicons INT\n"
+            "               Change the maximum number of amplicons permitted [%d]\n",
+            MAX_AMP);
+    fprintf(fp,
+            "  -l, --max-amplicon-length INT\n"
+            "               Change the maximum length of an individual amplicon [%d]\n",
+            MAX_AMP_LEN);
+    fprintf(fp,
+            "  -d, --min-depth INT[,INT]...\n"
+            "               Minimum base depth(s) to consider position covered [%d]\n",
+            args->min_depth[0]);
+    fprintf(fp,
+            "  -m, --pos-margin INT\n"
+            "               Margin of error for matching primer positions [%d]\n",
+            args->max_delta);
+
+    // Fixed text, no interpolation.
+    fputs("  -o, --output FILE\n"
+          "               Specify output file [stdout if unset]\n"
+          "  -s, --use-sample-name\n"
+          "               Use the sample name from the first @RG header line\n"
+          "  -t, --tlen-adjust INT\n"
+          "               Add/subtract from TLEN; use when clipping but no fixmate step\n"
+          "  -b, --tcoord-bin INT\n"
+          "               Bin template start,end positions into multiples of INT[1]\n",
+          fp);
+
+    fprintf(fp,
+            "  -c, --tcoord-min-count INT\n"
+            "               Minimum template start,end frequency for recording [%d]\n",
+            TCOORD_MIN_COUNT);
+
+    fputs("  -D, --depth-bin FRACTION\n"
+          "               Merge FDP values within +/- FRACTION together\n"
+          "  -S, --single-ref\n"
+          "               Force single-ref (<=1.12) output format\n",
+          fp);
+
+    sam_global_opt_help(fp, "I.--.@");
+
+    return exit_status;
+}
+
+// Entry point for "samtools ampliconstats".
+//
+// Parses the command line into an astats_args_t, loads the primer BED file
+// (first positional argument) into a hash keyed by reference name, checks
+// that single-ref mode is only used with a single-reference BED file, and
+// then calls amplicon_stats() on the remaining arguments, or on "-" when no
+// input file is named.
+//
+// Returns 0 on success and non-zero on failure; the help paths return the
+// status handed to usage().
+int main_ampliconstats(int argc, char **argv) {
+    astats_args_t args = {
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .flag_require = 0,
+        .flag_filter = 0x10B04,
+        //.sites = BED_LIST_INIT,
+        .max_delta = 30, // large enough to cope with alt primers
+        .min_depth = {1},
+        .use_sample_name = 0,
+        .max_amp = MAX_AMP,
+        .max_amp_len = MAX_AMP_LEN,
+        .tlen_adj = 0,
+        .out_fp = stdout,
+        .tcoord_min_count = TCOORD_MIN_COUNT,
+        .tcoord_bin = 1,
+        .depth_bin = 0.01,
+        .multi_ref = 1
+    }, oargs = args;  // oargs keeps the pristine defaults for usage()
+
+    static const struct option loptions[] =
+    {
+        SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
+        {"help", no_argument, NULL, 'h'},
+        {"flag-require", required_argument, NULL, 'f'},
+        {"flag-filter", required_argument, NULL, 'F'},
+        // Spellings printed by usage(); accepted alongside the two above.
+        {"required-flag", required_argument, NULL, 'f'},
+        {"filter-flag", required_argument, NULL, 'F'},
+        {"min-depth", required_argument, NULL, 'd'},
+        {"output", required_argument, NULL, 'o'},
+        {"pos-margin", required_argument, NULL, 'm'},
+        {"use-sample-name", no_argument, NULL, 's'},
+        {"max-amplicons", required_argument, NULL, 'a'},
+        {"max-amplicon-length", required_argument, NULL, 'l'},
+        {"tlen-adjust", required_argument, NULL, 't'},
+        {"tcoord-min-count", required_argument, NULL, 'c'},
+        {"tcoord-bin", required_argument, NULL, 'b'},
+        {"depth-bin", required_argument, NULL, 'D'},
+        {"single-ref", no_argument, NULL, 'S'},
+        {NULL, 0, NULL, 0}
+    };
+    int opt;
+
+    while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
+        switch (opt) {
+        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'F':
+            // Bit 0x10000 only marks "still the built-in default".
+            if (args.flag_filter & 0x10000)
+                args.flag_filter = 0; // strip default on first -F usage
+            args.flag_filter |= bam_str2flag(optarg); break;
+
+        case 'm': args.max_delta = atoi(optarg); break; // margin
+        case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
+        case 'd': {
+            // Comma separated list, at most MAX_DEPTH values are stored.
+            int d = 0;
+            char *cp = optarg, *ep;
+            do {
+                long n = strtol(cp, &ep, 10);
+                args.min_depth[d++] = n;
+                if (*ep != ',')
+                    break;
+                cp = ep+1;
+            } while (d < MAX_DEPTH);
+            break;
+        }
+
+        case 'a': args.max_amp = atoi(optarg)+1;break;
+        case 'l': args.max_amp_len = atoi(optarg)+1;break;
+
+        case 'c': args.tcoord_min_count = atoi(optarg);break;
+        case 'b':
+            args.tcoord_bin = atoi(optarg);
+            if (args.tcoord_bin < 1)
+                args.tcoord_bin = 1;
+            break;
+
+        case 't': args.tlen_adj = atoi(optarg);break;
+
+        case 's': args.use_sample_name = 1;break;
+
+        case 'o':
+            if (!(args.out_fp = fopen(optarg, "w"))) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        case 'S':
+            args.multi_ref = 0;
+            break;
+
+        case '?': return usage(&oargs, stderr, EXIT_FAILURE);
+        case 'h': return usage(&oargs, stdout, EXIT_SUCCESS);
+
+        default:
+            // Stop on a bad global option rather than printing the help
+            // text and then carrying on with the run.
+            if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
+                return usage(&oargs,stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (argc <= optind)
+        return usage(&oargs, stdout, EXIT_SUCCESS);
+    if (argc <= optind+1 && isatty(STDIN_FILENO))
+        return usage(&oargs, stderr, EXIT_FAILURE);
+
+    int ret = 1;
+    khiter_t k, ref_count = 0;
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    if (!bed_hash) {
+        print_error_errno("ampliconstats", "Could not allocate memory");
+        goto close_out;
+    }
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+        print_error_errno("ampliconstats",
+                          "Could not read file \"%s\"", argv[optind]);
+        // NOTE(review): bed_hash is left alone here, as before; the state
+        // the loader leaves behind on failure is not visible from this file.
+        goto close_out;
+    }
+
+    // Count the references that the BED file mentions.
+    for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
+        if (!kh_exist(bed_hash, k))
+            continue;
+        ref_count++;
+    }
+    if (ref_count == 0)
+        goto free_hash;
+    if (ref_count > 1 && args.multi_ref == 0) {
+        print_error("ampliconstats",
+                    "Single-ref mode is not permitted for BED files\n"
+                    "containing more than one reference.");
+        goto free_hash;
+    }
+
+    args.argv = stringify_argv(argc, argv);
+    if (!args.argv) {
+        print_error_errno("ampliconstats", "Could not allocate memory");
+        goto free_hash;
+    }
+    if (argc == ++optind) {
+        // No input file named: read from standard input.
+        char *av = "-";
+        ret = amplicon_stats(&args, bed_hash, &av, 1);
+    } else {
+        ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
+    }
+
+    free(args.argv);
+
+ free_hash:
+    destroy_bed_hash(bed_hash);
+
+ close_out:
+    // Flush and close a -o file so that write errors are reported and the
+    // handle does not outlive this call.
+    if (args.out_fp != stdout && fclose(args.out_fp) != 0) {
+        print_error_errno("ampliconstats", "Error closing output file");
+        if (ret == 0)
+            ret = 1;
+    }
+
+    return ret;
+}
diff --git a/samtools/amplicon_stats.c.pysam.c b/samtools/amplicon_stats.c.pysam.c

new file mode 100644 (file)

index 0000000..aa09459
--- /dev/null
+++ b/samtools/amplicon_stats.c.pysam.c
@@ -0,0 +1,1756 @@
+#include "samtools.pysam.h"
+
+/*  amplicon_stats.c -- "samtools stats" style statistics for amplicon sequencing.
+
+    Copyright (C) 2020-2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+/*
+ * This tool is designed to give "samtools stats" style output, but dedicated
+ * to small amplicon sequencing projects.  It gathers stats on the
+ * distribution of reads across amplicons.
+ */
+
+/*
+ * TODO:
+ * - Cope with multiple references.  What do we do here?  Just request one?
+ * - Permit regions rather than consuming whole file (maybe solves above).
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <math.h>
+
+#include <htslib/sam.h>
+#include <htslib/khash.h>
+
+#include "samtools.h"
+#include "sam_opts.h"
+#include "bam_ampliconclip.h"
+
+KHASH_MAP_INIT_INT64(tcoord, int64_t)
+KHASH_MAP_INIT_STR(qname, int64_t)
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>=0?(a):-(a))
+#endif
+
+#define TCOORD_MIN_COUNT   10
+#define MAX_AMP 1000       // Default maximum number of amplicons
+#define MAX_AMP_LEN 1000   // Default maximum length of any single amplicon
+#define MAX_PRIMER_PER_AMPLICON 4  // Max primers per LEFT/RIGHT
+#define MAX_DEPTH 5        // Number of different depths permitted
+
+// Run-time configuration for ampliconstats.
+typedef struct {
+    sam_global_args ga;    // shared samtools global options (input format, threads)
+    uint32_t flag_require; // -f: only include reads with all of these FLAGs
+    uint32_t flag_filter;  // -F: only include reads with none of these FLAGs
+    int max_delta;   // Used for matching read to amplicon primer loc
+    int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep
+    int use_sample_name;   // -s: non-zero to label output with the @RG sample name
+    int max_amp;     // Total number of amplicons
+    int max_amp_len; // Maximum length of an individual amplicon
+    double depth_bin;// aggregate depth within this fraction
+    int tlen_adj;    // Adjust tlen by this amount, due to clip but no fixmate
+    FILE *out_fp;          // destination for all report output
+    char *argv;            // command line as one string, echoed in the SS header
+    int tcoord_min_count;  // -c: minimum template start,end frequency for recording
+    int tcoord_bin;        // -b: bin template start,end positions into multiples of this
+    int multi_ref;         // non-zero: multi-reference output format; 0: single-ref
+} astats_args_t;
+
+// Accumulated statistics for one reference.  The same layout is used for
+// the per-file and the all-files copies.  Arrays are allocated by
+// stats_alloc() with the sizes noted in brackets.
+typedef struct {
+    int nseq;       // total sequence count
+    int nfiltered;  // sequence filtered
+    int nfailprimer;// count of sequences not matching the primer locations
+
+    // Sizes of memory allocated below, to permit reset
+    int max_amp, max_amp_len, max_len;
+
+    // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc
+    int64_t *nreads, *nreads2;          // [max_amp]
+    double  *nfull_reads;               // [max_amp]; 0.5/read if paired.
+    double  *nrperc, *nrperc2;          // [max_amp]
+    int64_t *nbases, *nbases2;          // [max_amp]
+    int64_t *coverage;                  // [max_amp][max_amp_len]
+    double  (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH]
+    double  (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH];
+    khash_t(tcoord) **tcoord;           // [max_amp+1]
+
+    // 0 is correct pair, 1 is incorrect pair, 2 is unidentified
+    int     (*amp_dist)[3];             // [max_amp][3]
+
+    int *depth_valid; // [max_len]
+    int *depth_all;   // [max_len]
+    khash_t(qname) *qend;  // queryname end, for overlap removal
+} astats_t;
+
+// We can have multiple primers for LEFT / RIGHT, so this
+// permits detection by any compatible combination.
+// One reference:
+// One amplicon, described by up to MAX_PRIMER_PER_AMPLICON primers per side.
+typedef struct {
+    int64_t left[MAX_PRIMER_PER_AMPLICON];  // one entry per + strand BED primer
+    int nleft;                              // number of entries used in left[]
+    int64_t right[MAX_PRIMER_PER_AMPLICON]; // one entry per - strand BED primer
+    int nright;                             // number of entries used in right[]
+    int64_t max_left, min_right; // inner dimensions
+    int64_t min_left, max_right; // outer dimensions
+} amplicon_t;
+
+// Multiple references, we have an array of amplicons_t - one per used ref.
+// We have per reference local and global stats here, as some of the stats
+// are coordinate based.  However we report them combined together as a single
+// list across all references.
+// "namp" is the number of amplicons in this reference, but they're
+// numbered first_amp to first_amp+namp-1 inclusively.
+// Per-reference amplicon data and statistics.  Slots for references without
+// BED entries stay zeroed; code tests "sites" to skip them.
+typedef struct {
+    int tid, namp;             // reference index in the header; amplicon count
+    int64_t len;               // reference length
+    bed_entry_list_t *sites;   // primer BED entries for this reference
+    amplicon_t *amp;           // amplicon array, filled in by bed2amplicon()
+    astats_t *lstats, *gstats; // local (1 file) and global (all file) stats
+    const char *ref;           // ref name (pointer to the bed hash table key)
+    int first_amp;             // first amplicon number for this ref
+} amplicons_t;
+
+// Reinitialised for each new reference/chromosome.
+// Entries hold a 0-based amplicon index (0 to namp-1), or -1 for no match.
+static int *pos2start = NULL;
+static int *pos2end = NULL;
+static int pos2size = 0; // allocated size of pos2start/end
+
+// Builds the position -> amplicon lookup tables for reference "ref".
+//
+// pos2start[p-1] / pos2end[p-1] are set to the index of the amplicon that
+// has a left / right primer coordinate within args->max_delta of 1-based
+// position p, and to -1 where there is none.  Where windows of several
+// amplicons overlap, the amplicon with the highest index wins because it is
+// written last.
+//
+// The tables are shared file-level buffers that only ever grow; they are
+// refilled on every call.
+//
+// Returns 0 on success, -1 if memory could not be allocated.
+static int initialise_amp_pos_lookup(astats_args_t *args,
+                                     amplicons_t *amps,
+                                     int ref) {
+    int64_t i, j;
+    amplicon_t *amp = amps[ref].amp;
+    int64_t max_len = amps[ref].len;
+    int namp = amps[ref].namp;
+
+    if (max_len+1 > pos2size) {
+        // Grow through a temporary so the old buffer stays reachable (and
+        // can still be freed by the caller) when realloc() fails.
+        int *grown;
+        if (!(grown = realloc(pos2start, (max_len+1)*sizeof(*pos2start))))
+            return -1;
+        pos2start = grown;
+        if (!(grown = realloc(pos2end,   (max_len+1)*sizeof(*pos2end))))
+            return -1;
+        pos2end = grown;
+        // Record the real capacity, so that a later reference of the same
+        // length does not trigger another reallocation.
+        pos2size = max_len+1;
+    }
+    for (i = 0; i < max_len; i++)
+        pos2start[i] = pos2end[i] = -1;
+
+    for (i = 0; i < namp; i++) {
+        for (j = 0; j < amp[i].nleft; j++) {
+            int64_t p;
+            for (p = amp[i].left[j] - args->max_delta;
+                 p <= amp[i].left[j] + args->max_delta; p++) {
+                // Clip the tolerance window to the reference.
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2start[p-1] = i;
+            }
+        }
+        for (j = 0; j < amp[i].nright; j++) {
+            int64_t p;
+            for (p = amp[i].right[j] - args->max_delta;
+                 p <= amp[i].right[j] + args->max_delta; p++) {
+                if (p < 1 || p > max_len)
+                    continue;
+                pos2end[p-1] = i;
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Returns the number of amplicons described by a primer list.
+// The BED entries are assumed to alternate between forward (LEFT) and
+// reverse (RIGHT) primers per amplicon, so each reverse -> forward switch
+// starts a new amplicon.  The first amplicon has no such switch before it,
+// hence the count starts at one (also for an empty list).
+static int count_amplicon(bed_entry_list_t *sites) {
+    int total = 1;
+    int prev_rev = 0;
+    int idx = 0;
+
+    while (idx < sites->length) {
+        if (prev_rev && sites->bp[idx].rev == 0)
+            total++;
+        prev_rev = sites->bp[idx].rev;
+        idx++;
+    }
+
+    return total;
+}
+
+// We're only interested in the internal part of the amplicon.
+// Our bed file has LEFT start/end followed by RIGHT start/end,
+// so collapse these to LEFT end / RIGHT start.
+//
+// Returns right most amplicon position on success,
+//         < 0 on error
+// Converts the primer list of one reference into amplicon records and
+// prints one "AMPLICON" line per amplicon to args->out_fp.
+//
+//   sites     primer BED entries; all + strand primers of an amplicon are
+//             expected to come before its - strand primers
+//   amp       output array; its nleft/nright counters must be zero on entry
+//   namp      receives the number of amplicons found
+//   do_title  non-zero to print the explanatory comment header first
+//   ref       reference name, printed in multi-ref mode
+//   first_amp offset added to the printed amplicon numbers in multi-ref mode
+//
+// Fails when the list does not start with a + strand primer or end with a
+// - strand primer, when an amplicon has too many primers or is too long for
+// args->max_amp_len, or when there are too many amplicons for args->max_amp.
+static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites,
+                            amplicon_t *amp, int *namp, int do_title,
+                            const char *ref, int first_amp) {
+    int i, j;
+    int64_t max_right = 0;
+    FILE *ofp = args->out_fp;
+
+    *namp = 0;
+
+    // Assume all primers for the same amplicon are adjacent in BED
+    // with all + followed by all -.  Thus - to + signifies next primer set.
+    int last_rev = 0;
+    amp[0].max_left = 0;
+    amp[0].min_right = INT64_MAX;
+    amp[0].min_left = INT64_MAX;
+    amp[0].max_right = 0;
+    if (do_title) {
+        fprintf(ofp, "# Amplicon locations from BED file.\n");
+        fprintf(ofp, "# LEFT/RIGHT are <start>-<end> format and "
+                "comma-separated for alt-primers.\n");
+        if (args->multi_ref)
+            fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n");
+        else
+            fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n");
+    }
+    for (i = j = 0; i < sites->length; i++) {
+        if (i == 0 && sites->bp[i].rev != 0) {
+            fprintf(samtools_stderr, "[ampliconstats] error: BED file should start"
+                    " with the + strand primer\n");
+            return -1;
+        }
+        if (sites->bp[i].rev == 0 && last_rev) {
+            // Reverse -> forward switch: start the next amplicon.
+            j++;
+            if (j >= args->max_amp) {
+                fprintf(samtools_stderr, "[ampliconstats] error: too many amplicons"
+                        " (%d). Use -a option to raise this.\n", j);
+                return -1;
+            }
+            amp[j].max_left = 0;
+            amp[j].min_right = INT64_MAX;
+            amp[j].min_left = INT64_MAX;
+            amp[j].max_right = 0;
+        }
+        if (sites->bp[i].rev == 0) {
+            // First + primer of an amplicon opens its output line.
+            if (i == 0 || last_rev) {
+                if (j>0) fprintf(ofp, "\n");
+                if (args->multi_ref)
+                    fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp);
+                else
+                    fprintf(ofp, "AMPLICON\t%d", j+1);
+            }
+            if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) {
+                print_error_errno("ampliconstats",
+                                  "too many primers per amplicon (%d).\n",
+                                  MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].left[amp[j].nleft++] = sites->bp[i].right;
+            if (amp[j].max_left < sites->bp[i].right+1)
+                amp[j].max_left = sites->bp[i].right+1;
+            if (amp[j].min_left > sites->bp[i].right+1)
+                amp[j].min_left = sites->bp[i].right+1;
+            // BED file, so left+1 as zero based. right(+1-1) as
+            // BED goes one beyond end (and we want inclusive range).
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nleft > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        } else {
+            if (amp[j].nright >= MAX_PRIMER_PER_AMPLICON) {
+                print_error_errno("ampliconstats",
+                                  "too many primers per amplicon (%d)",
+                                  MAX_PRIMER_PER_AMPLICON);
+                return -1;
+            }
+            amp[j].right[amp[j].nright++] = sites->bp[i].left;
+            if (amp[j].min_right > sites->bp[i].left-1)
+                amp[j].min_right = sites->bp[i].left-1;
+            if (amp[j].max_right < sites->bp[i].left-1) {
+                amp[j].max_right = sites->bp[i].left-1;
+                if (amp[j].max_right - amp[j].min_left + 1 >=
+                    args->max_amp_len) {
+                    fprintf(samtools_stderr, "[ampliconstats] error: amplicon "
+                            "longer (%d) than max_amp_len option (%d)\n",
+                            (int)(amp[j].max_right - amp[j].min_left + 2),
+                            args->max_amp_len);
+                    return -1;
+                }
+                if (max_right < amp[j].max_right)
+                    max_right = amp[j].max_right;
+            }
+            fprintf(ofp, "%c%"PRId64"-%"PRId64, "\t,"[amp[j].nright > 1],
+                    sites->bp[i].left+1, sites->bp[i].right);
+        }
+        last_rev = sites->bp[i].rev;
+    }
+    if (last_rev != 1) {
+        fprintf(ofp, "\n"); // useful if going to samtools_stdout
+        fprintf(samtools_stderr, "[ampliconstats] error: bed file does not end on"
+                " a reverse strand primer.\n");
+        return -1;
+    }
+    *namp = ++j;
+    if (j) fprintf(ofp, "\n");
+
+    if (j >= args->max_amp) {
+        // Newline-terminated like the identical message inside the loop.
+        fprintf(samtools_stderr, "[ampliconstats] error: "
+                "too many amplicons (%d). Use -a option to raise this.\n", j);
+        return -1;
+    }
+
+//    for (i = 0; i < *namp; i++) {
+//      fprintf(samtools_stdout, "%d\t%ld", i, amp[i].length);
+//      for (j = 0; j < amp[i].nleft; j++)
+//          fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].left[j]);
+//      for (j = 0; j < amp[i].nright; j++)
+//          fprintf(samtools_stdout, "%c%ld", "\t,"[j>0], amp[i].right[j]);
+//      fprintf(samtools_stdout, "\n");
+//    }
+
+    return max_right;
+}
+
+// Releases an astats_t and everything it owns.  Accepts NULL, and also a
+// partially built object as handed over by stats_alloc()'s error path,
+// where any member may still be NULL.
+void stats_free(astats_t *st) {
+    if (!st)
+        return;
+
+    free(st->nreads);
+    free(st->nreads2);
+    free(st->nfull_reads);
+    free(st->nrperc);
+    free(st->nrperc2);
+    free(st->nbases);
+    free(st->nbases2);
+    free(st->coverage);
+    free(st->covered_perc);
+    free(st->covered_perc2);
+    free(st->amp_dist);
+
+    free(st->depth_valid);
+    free(st->depth_all);
+
+    if (st->tcoord) {
+        int i;
+        for (i = 0; i <= st->max_amp; i++) {
+            if (st->tcoord[i])
+                kh_destroy(tcoord, st->tcoord[i]);
+        }
+        free(st->tcoord);
+    }
+
+    // qend is still NULL when stats_alloc() failed before creating it;
+    // kh_end() would dereference it.
+    if (st->qend) {
+        khiter_t k;
+        // The hash keys are freed here before the table itself goes.
+        for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+            if (kh_exist(st->qend, k))
+                free((void *)kh_key(st->qend, k));
+        kh_destroy(qname, st->qend);
+    }
+
+    free(st);
+}
+
+// Allocates a zero-initialised astats_t.
+//
+//   max_len      number of entries in the per-position depth arrays
+//   max_amp      number of entries in the per-amplicon arrays
+//   max_amp_len  per-amplicon row length of the coverage array
+//
+// Returns the new object, or NULL on allocation failure, in which case
+// everything allocated so far is released again through stats_free().
+astats_t *stats_alloc(int64_t max_len, int max_amp, int max_amp_len) {
+    astats_t *st = calloc(1, sizeof(*st));
+    if (!st)
+        return NULL;
+
+    st->max_amp = max_amp;
+    st->max_amp_len = max_amp_len;
+    st->max_len = max_len;
+
+    if (!(st->nreads  = calloc(max_amp, sizeof(*st->nreads))))  goto err;
+    if (!(st->nreads2 = calloc(max_amp, sizeof(*st->nreads2)))) goto err;
+    if (!(st->nrperc  = calloc(max_amp, sizeof(*st->nrperc))))  goto err;
+    if (!(st->nrperc2 = calloc(max_amp, sizeof(*st->nrperc2)))) goto err;
+    if (!(st->nbases  = calloc(max_amp, sizeof(*st->nbases))))  goto err;
+    if (!(st->nbases2 = calloc(max_amp, sizeof(*st->nbases2)))) goto err;
+
+    if (!(st->nfull_reads = calloc(max_amp, sizeof(*st->nfull_reads))))
+        goto err;
+
+    // max_amp rows of max_amp_len counters.  The row size is computed in
+    // size_t so that large -a / -l values cannot overflow an int product.
+    if (!(st->coverage = calloc(max_amp,
+                                (size_t)max_amp_len * sizeof(*st->coverage))))
+        goto err;
+
+    if (!(st->covered_perc  = calloc(max_amp, sizeof(*st->covered_perc))))
+        goto err;
+    if (!(st->covered_perc2 = calloc(max_amp, sizeof(*st->covered_perc2))))
+        goto err;
+
+    // One hash per amplicon plus one extra, hence max_amp+1 entries.
+    if (!(st->tcoord = calloc(max_amp+1, sizeof(*st->tcoord)))) goto err;
+    int i;
+    for (i = 0; i <= st->max_amp; i++)
+        if (!(st->tcoord[i] = kh_init(tcoord)))
+            goto err;
+
+    if (!(st->qend = kh_init(qname)))
+        goto err;
+
+    if (!(st->depth_valid = calloc(max_len, sizeof(*st->depth_valid))))
+        goto err;
+    if (!(st->depth_all   = calloc(max_len, sizeof(*st->depth_all))))
+        goto err;
+
+    if (!(st->amp_dist  = calloc(max_amp, sizeof(*st->amp_dist))))  goto err;
+
+    return st;
+
+ err:
+    stats_free(st);
+    return NULL;
+}
+
+// Clears an astats_t for reuse without releasing its memory.  Counters and
+// arrays are zeroed using the sizes recorded at allocation time, the tcoord
+// hashes are pruned, and the qend hash is emptied with its keys freed.
+static void stats_reset(astats_t *st) {
+    st->nseq = 0;
+    st->nfiltered = 0;
+    st->nfailprimer = 0;
+
+    memset(st->nreads,  0, st->max_amp * sizeof(*st->nreads));
+    memset(st->nreads2, 0, st->max_amp * sizeof(*st->nreads2));
+    memset(st->nfull_reads, 0, st->max_amp * sizeof(*st->nfull_reads));
+
+    memset(st->nrperc,  0, st->max_amp * sizeof(*st->nrperc));
+    memset(st->nrperc2, 0, st->max_amp * sizeof(*st->nrperc2));
+
+    memset(st->nbases,  0, st->max_amp * sizeof(*st->nbases));
+    memset(st->nbases2, 0, st->max_amp * sizeof(*st->nbases2));
+
+    // Widen before multiplying: max_amp * max_amp_len can exceed INT_MAX.
+    memset(st->coverage, 0, (size_t)st->max_amp * st->max_amp_len
+           * sizeof(*st->coverage));
+    memset(st->covered_perc,  0, st->max_amp * sizeof(*st->covered_perc));
+    memset(st->covered_perc2, 0, st->max_amp * sizeof(*st->covered_perc2));
+
+    // Keep the allocated entries as it's likely all files will share
+    // the same keys.  Instead we reset counters to zero for common ones
+    // and delete rare ones.
+    int i;
+    for (i = 0; i <= st->max_amp; i++) {
+        khiter_t k;
+        for (k = kh_begin(st->tcoord[i]);
+             k != kh_end(st->tcoord[i]); k++)
+            if (kh_exist(st->tcoord[i], k)) {
+                // "Rare" means a count below 5.
+                if (kh_value(st->tcoord[i], k) < 5)
+                    kh_del(tcoord, st->tcoord[i], k);
+                else
+                    kh_value(st->tcoord[i], k) = 0;
+            }
+    }
+
+    // The qend keys are freed here before the table is cleared.
+    khiter_t k;
+    for (k = kh_begin(st->qend); k != kh_end(st->qend); k++)
+        if (kh_exist(st->qend, k))
+            free((void *)kh_key(st->qend, k));
+    kh_clear(qname, st->qend);
+
+    memset(st->depth_valid, 0, st->max_len * sizeof(*st->depth_valid));
+    memset(st->depth_all,   0, st->max_len * sizeof(*st->depth_all));
+    memset(st->amp_dist,  0, st->max_amp * sizeof(*st->amp_dist));
+}
+
+// Resets the local (per-file) stats of every reference that has
+// amplicon sites defined; references without sites are left untouched.
+static void amp_stats_reset(amplicons_t *amps, int nref) {
+    int r = 0;
+    while (r < nref) {
+        if (amps[r].sites)
+            stats_reset(amps[r].lstats);
+        r++;
+    }
+}
+
+/*
+ * Accumulates one alignment record into the local (per-file) stats of
+ * the reference it is aligned to: sequence counters, per-base depth,
+ * per-amplicon read/base/coverage counts, amplicon pairing status and
+ * template coordinate frequencies.
+ *
+ * Returns 0 on success, including when the reference has no stats or
+ *           the record is filtered out;
+ *        -1 on memory allocation failure.
+ */
+static int accumulate_stats(astats_args_t *args, amplicons_t *amps,
+                            bam1_t *b) {
+    int ref = b->core.tid;
+    amplicon_t *amp = amps[ref].amp;
+    astats_t *stats = amps[ref].lstats;
+    int len = amps[ref].len;
+
+    if (!stats)
+        return 0;
+
+    stats->nseq++;
+    if ((b->core.flag & args->flag_require) != args->flag_require ||
+        (b->core.flag & args->flag_filter)  != 0) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    int64_t start = b->core.pos, mstart = start; // modified start
+    int64_t end = bam_endpos(b), i;
+
+    // Compute all-template-depth and valid-template-depth.
+    // We track current end location per read name so we can remove overlaps.
+    // Potentially we could use this data for a better amplicon-depth
+    // count too, but for now it's purely for the per-base plots.
+    int ret;
+    khiter_t k;
+    int prev_start = 0, prev_end = 0;
+    if ((b->core.flag & BAM_FPAIRED)
+        && !(b->core.flag & (BAM_FSUPPLEMENTARY | BAM_FSECONDARY))) {
+        k = kh_put(qname, stats->qend, bam_get_qname(b), &ret);
+        if (ret < 0)
+            // The hash could not grow, so k is not a usable slot.
+            return -1;
+
+        if (ret == 0) {
+            // Name already present: this is the mate of an earlier read.
+            // The stored value is start in the low 32 bits, end above.
+            prev_start = kh_value(stats->qend, k) & 0xffffffff;
+            prev_end = kh_value(stats->qend, k)>>32;
+            mstart = MAX(mstart, prev_end);
+            // Ideally we'd reuse strings so we don't thrash free/malloc.
+            // However let's see if the official way of doing that (malloc
+            // itself) is fast enough first.
+            free((void *)kh_key(stats->qend, k));
+            kh_del(qname, stats->qend, k);
+            //fprintf(samtools_stderr, "remove overlap %d to %d\n", (int)start, (int)mstart);
+        } else {
+            // New name: the key currently points into the BAM record, so
+            // it must be replaced by a private copy owned by the hash.
+            char *name_copy = strdup(bam_get_qname(b));
+            if (!name_copy) {
+                // Don't leave a key we do not own in the hash; it would
+                // later be passed to free().
+                kh_del(qname, stats->qend, k);
+                return -1;
+            }
+            kh_key(stats->qend, k) = name_copy;
+
+            kh_value(stats->qend, k) = start | (end << 32);
+        }
+    }
+    for (i = mstart; i < end && i < len; i++)
+        stats->depth_all[i]++;
+    if (i < end) {
+        print_error("ampliconstats", "record %s overhangs end of reference",
+                    bam_get_qname(b));
+        // But keep going, as it's harmless.
+    }
+
+    // On single ended runs, eg ONT or PacBio, we just use the start/end
+    // of the template to assign.
+    int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED)
+        ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1)
+        : (start >= 0 && start < len ? pos2start[start] : -1);
+
+    // ivar sometimes soft-clips 100% of the bases.
+    // This is essentially unmapped
+    if (end == start && (args->flag_filter & BAM_FUNMAP)) {
+        stats->nfiltered++;
+        return 0;
+    }
+
+    if (anum == -1)
+        stats->nfailprimer++;
+
+    if (anum >= 0) {
+        int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left);
+        if (c > 0) {
+            stats->nreads[anum]++;
+            // NB: ref bases rather than read bases
+            stats->nbases[anum] += c;
+
+            if (start < 0) start = 0;
+            if (end > len) end = len;
+
+            int64_t ostart = MAX(start, amp[anum].min_left-1);
+            int64_t oend = MIN(end, amp[anum].max_right);
+            int64_t offset = amp[anum].min_left-1;
+            for (i = ostart; i < oend; i++)
+                stats->coverage[anum*stats->max_amp_len + i-offset]++;
+        } else {
+            stats->nfailprimer++;
+        }
+    }
+
+    // Template length in terms of amplicon number to amplicon number.
+    // We expect left to right of same amplicon (len 0), but it may go
+    // to next amplicon (len 1) or prev (len -1), etc.
+    int64_t t_end;
+    int oth_anum = -1;
+
+    if (b->core.flag & BAM_FPAIRED) {
+        t_end = (b->core.flag & BAM_FREVERSE ? end : start)
+            + b->core.isize;
+
+        // If we've clipped the primers but not followed up with a fixmates
+        // then our start+TLEN will take us to a location which is
+        // length(LEFT_PRIMER) + length(RIGHT_PRIMER) too far away.
+        //
+        // The correct solution is to run samtools fixmate so TLEN is correct.
+        // The hacky solution is to fudge the expected tlen by double the
+        // average primer length (e.g. 50).
+        t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj;
+
+        if (t_end > 0 && t_end < len && b->core.isize != 0)
+            oth_anum = (b->core.flag & BAM_FREVERSE)
+                ? pos2start[t_end]
+                : pos2end[t_end];
+    } else {
+        // Not paired (see int anum = (REV || !PAIR) ?en :st expr above)
+        // Guard the lookup the same way the anum lookup is guarded, so a
+        // start outside 0..len-1 cannot index beyond the table.
+        oth_anum = (start >= 0 && start < len) ? pos2start[start] : -1;
+        t_end = end;
+    }
+
+    // We don't want to count our pairs twice.
+    // If both left/right are known, count it on left only.
+    // If only one is known, we'll only get to this code once
+    // so we can also count it.
+    int astatus = 2;
+    if (anum != -1 && oth_anum != -1) {
+        astatus = oth_anum == anum ? 0 : 1;
+        if (start <= t_end)
+            stats->amp_dist[anum][astatus]++;
+    } else if (anum >= 0) {
+        stats->amp_dist[anum][astatus = 2]++;
+    }
+
+    if (astatus == 0 && !(b->core.flag & (BAM_FUNMAP | BAM_FMUNMAP))) {
+        // Both loops are bounded by len, matching the depth_all loop
+        // above, so an overhanging record cannot step past the reference.
+        if (prev_end && mstart > prev_end) {
+            // 2nd read with gap to 1st; undo previous increment.
+            for (i = prev_start; i < prev_end && i < len; i++)
+                stats->depth_valid[i]--;
+            stats->nfull_reads[anum] -= (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        } else {
+            // 1st read, or 2nd read that overlaps 1st
+            for (i = mstart; i < end && i < len; i++)
+                stats->depth_valid[i]++;
+            stats->nfull_reads[anum] += (b->core.flag & BAM_FPAIRED) ? 0.5 : 1;
+        }
+    }
+
+    // Track template start,end frequencies, so we can give stats on
+    // amplicon primer usage.
+    if ((b->core.flag & BAM_FPAIRED) && b->core.isize <= 0)
+        // left to right only, so we don't double count template positions.
+        return 0;
+
+    start = b->core.pos;
+    t_end = b->core.flag & BAM_FPAIRED
+        ? start + b->core.isize-1
+        : end;
+    // Key: 1-based start in the low 32 bits, 1-based end in the high 32.
+    uint64_t tcoord = MIN(start+1, UINT32_MAX) | (MIN(t_end+1, UINT32_MAX)<<32);
+    // Slot anum+1 so that anum == -1 (no amplicon) lands in slot 0.
+    k = kh_put(tcoord, stats->tcoord[anum+1], tcoord, &ret);
+    if (ret < 0)
+        return -1;
+    if (ret == 0)
+        kh_value(stats->tcoord[anum+1], k)++;
+    else
+        kh_value(stats->tcoord[anum+1], k)=1;
+    // Value: count in the low 32 bits, status in the high 32.
+    kh_value(stats->tcoord[anum+1], k) |= ((int64_t)astatus<<32);
+
+    return 0;
+}
+
+// Fold one file's local stats (lstats) into the running global stats
+// (gstats) for the same reference.  namp is the number of amplicons and
+// all_nseq the divisor used for the per-amplicon read percentages.
+// Returns 0 on success, -1 if a global hash could not be extended.
+int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) {
+    int slot, d, pos;
+
+    gstats->nseq        += lstats->nseq;
+    gstats->nfiltered   += lstats->nfiltered;
+    gstats->nfailprimer += lstats->nfailprimer;
+
+    // tcoord[] is indexed by amplicon number + 1, so slot 0 is the
+    // hash for amplicon number -1 and has no per-amplicon arrays.
+    for (slot = 0; slot <= namp; slot++) {
+        int a = slot - 1;
+        khiter_t kl;
+
+        // Add khash local (kl) to khash global (kg)
+        for (kl = kh_begin(lstats->tcoord[slot]);
+             kl != kh_end(lstats->tcoord[slot]); kl++) {
+            if (kh_exist(lstats->tcoord[slot], kl) &&
+                kh_value(lstats->tcoord[slot], kl) != 0) {
+                int ret;
+                khiter_t kg = kh_put(tcoord, gstats->tcoord[slot],
+                                     kh_key(lstats->tcoord[slot], kl),
+                                     &ret);
+                if (ret < 0)
+                    return -1;
+
+                if (ret == 0) {
+                    // Existing key: keep only the global's low 32 bits
+                    // before adding the local value.
+                    kh_value(gstats->tcoord[slot], kg) =
+                        (kh_value(gstats->tcoord[slot], kg) & 0xFFFFFFFF)
+                        + kh_value(lstats->tcoord[slot], kl);
+                } else {
+                    // New key: start from zero.
+                    kh_value(gstats->tcoord[slot], kg) =
+                        0 + kh_value(lstats->tcoord[slot], kl);
+                }
+            }
+        }
+
+        if (a < 0)
+            continue;
+
+        gstats->nreads[a]      += lstats->nreads[a];
+        gstats->nreads2[a]     += lstats->nreads[a] * lstats->nreads[a];
+        gstats->nfull_reads[a] += lstats->nfull_reads[a];
+
+        // The percentage must be formed per file, since the divisor
+        // differs for each sample; we then sum it and its square.
+        double nrperc = 0;
+        if (all_nseq)
+            nrperc = 100.0 * lstats->nreads[a] / all_nseq;
+        gstats->nrperc[a]  += nrperc;
+        gstats->nrperc2[a] += nrperc*nrperc;
+
+        gstats->nbases[a]  += lstats->nbases[a];
+        gstats->nbases2[a] += lstats->nbases[a] * lstats->nbases[a];
+
+        for (d = 0; d < MAX_DEPTH; d++) {
+            gstats->covered_perc[a][d]  += lstats->covered_perc[a][d];
+            gstats->covered_perc2[a][d] += lstats->covered_perc[a][d]
+                                         * lstats->covered_perc[a][d];
+        }
+
+        for (d = 0; d < 3; d++)
+            gstats->amp_dist[a][d] += lstats->amp_dist[a][d];
+    }
+
+    // Per-base depths are simply summed across files.
+    pos = 0;
+    while (pos < lstats->max_len) {
+        gstats->depth_valid[pos] += lstats->depth_valid[pos];
+        gstats->depth_all[pos]   += lstats->depth_all[pos];
+        pos++;
+    }
+
+    return 0;
+}
+
+// Merges the local stats of every reference that has amplicon sites
+// into that reference's global stats.  The count of matching sequences
+// (nseq - nfiltered - nfailprimer) is first summed over all such
+// references and passed to each merge.
+// Returns 0 on success, -1 if any merge fails.
+int append_stats(amplicons_t *amps, int nref) {
+    int ref, all_nseq = 0;
+
+    for (ref = 0; ref < nref; ref++) {
+        if (amps[ref].sites) {
+            astats_t *st = amps[ref].lstats;
+            all_nseq += st->nseq - st->nfiltered - st->nfailprimer;
+        }
+    }
+
+    for (ref = 0; ref < nref; ref++) {
+        if (amps[ref].sites) {
+            int status = append_lstats(amps[ref].lstats, amps[ref].gstats,
+                                       amps[ref].namp, all_nseq);
+            if (status < 0)
+                return -1;
+        }
+    }
+
+    return 0;
+}
+
+// One template coordinate pair unpacked from a tcoord hash entry.
+typedef struct {
+    int32_t  start;   // low 32 bits of the hash key
+    int32_t  end;     // high 32 bits of the hash key
+    uint32_t freq;    // low 32 bits of the hash value
+    uint32_t status;  // high 32 bits of the hash value
+} tcoord_t;
+
+// qsort comparator: sort tcoord by descending frequency and then
+// ascending start and end.
+//
+// Fields are compared rather than subtracted.  freq is unsigned, so the
+// difference of two widely separated counts does not fit in the int
+// return value and could come back with the wrong sign; the signed
+// start/end differences could likewise overflow.
+static int tcoord_freq_sort(const void *vp1, const void *vp2) {
+    const tcoord_t *t1 = (const tcoord_t *)vp1;
+    const tcoord_t *t2 = (const tcoord_t *)vp2;
+
+    if (t1->freq != t2->freq)
+        return t1->freq < t2->freq ? 1 : -1;   // larger freq first
+
+    if (t1->start != t2->start)
+        return t1->start < t2->start ? -1 : 1;
+
+    return (t1->end > t2->end) - (t1->end < t2->end);
+}
+
+
+/*
+ * Merges tcoord start,end,freq,status tuples if their coordinates are
+ * close together.  We aim to keep the start,end for the most frequent
+ * value and assume that is the correct coordinate and all others are
+ * minor fluctuations due to errors or variants.
+ *
+ * We sort by frequency first and then merge later items in the list into
+ * the earlier more frequent ones.  It's O(N^2), but sufficient for now
+ * given current scale of projects.
+ *
+ * If we ever need to resolve that then consider sorting by start
+ * coordinate and scanning the list to find all items within X, find
+ * the most frequent of those, and then cluster that way.  (I'd have
+ * done that had I thought of it at the time!)
+ *
+ * args  supplies tcoord_bin, the clustering distance.
+ * tpos  is the array of tuples; it is reordered and compacted in place.
+ * np    holds the number of tuples on entry and the number remaining
+ *       after merging on return.
+ */
+static void aggregate_tcoord(astats_args_t *args, tcoord_t *tpos, size_t *np){
+    size_t n = *np, j, j2, j3, k;
+
+    // Sort by frequency and cluster infrequent coords into frequent
+    // ones provided they're close by.
+    // This is O(N^2), but we've already binned by tcoord_bin/2 so
+    // the list isn't intended to be vast at this point.
+    qsort(tpos, n, sizeof(*tpos), tcoord_freq_sort);
+
+    // For frequency ties, find mid start coord, and then find mid end
+    // coord of those matching start.
+    // We make that the first item so we merge into that mid point.
+    for (j = 0; j < n; j++) {
+        for (j2 = j+1; j2 < n; j2++) {
+            if (tpos[j].freq != tpos[j2].freq)
+                break;
+            if (tpos[j2].start - tpos[j].start >= args->tcoord_bin)
+                break;
+        }
+
+        // j to j2 all within bin of a common start,
+        // m is the mid start.
+        if (j2-1 > j) {
+            size_t m = (j2-1 + j)/2;
+
+            // Find mid end for this same start.
+            // Walk back to the first item sharing the mid start, but
+            // never past j: items before j belong to an earlier group,
+            // and swapping one of those in would break the ordering.
+            while (m > j && tpos[m].start == tpos[m-1].start)
+                m--;
+            for (j3 = m+1; j3 < j2; j3++) {
+                if (tpos[m].start != tpos[j3].start)
+                    break;
+                // Ends ascend within equal freq and start, so measure
+                // the distance from m forwards to j3.
+                if (tpos[j3].end - tpos[m].end >= args->tcoord_bin)
+                    break;
+            }
+            if (j3-1 > m)
+                m = (j3-1 + m)/2;
+
+            // Swap with first item.
+            tcoord_t tmp = tpos[j];
+            tpos[j] = tpos[m];
+            tpos[m] = tmp;
+            j = j2-1;
+        }
+    }
+
+    // Now merge in coordinates.
+    // This bit is O(N^2), so consider binning first to reduce the
+    // size of the list if we have excessive positional variation.
+    for (k = j = 0; j < n; j++) {
+        // Zero frequency marks an item already merged into an earlier one.
+        if (!tpos[j].freq)
+            continue;
+
+        if (k < j)
+            tpos[k] = tpos[j];
+
+        for (j2 = j+1; j2 < n; j2++) {
+            if (ABS(tpos[j].start-tpos[j2].start) < args->tcoord_bin/2 &&
+                ABS(tpos[j].end  -tpos[j2].end)  < args->tcoord_bin/2 &&
+                tpos[j].status == tpos[j2].status) {
+                tpos[k].freq += tpos[j2].freq;
+                tpos[j2].freq = 0;
+            }
+        }
+        k++;
+    }
+
+    *np = k;
+}
+
+int dump_stats(astats_args_t *args, char type, char *name, int nfile,
+               amplicons_t *amps, int nref, int local) {
+    int i, r;
+    FILE *ofp = args->out_fp;
+    tcoord_t *tpos = NULL;
+    size_t ntcoord = 0;
+
+    // summary stats for this sample (or for all samples)
+    fprintf(ofp, "# Summary stats.\n");
+    fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer;
+        char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2);
+        if (!name_ref)
+            return -1;
+        if (args->multi_ref)
+            sprintf(name_ref, "%s\t%s", name, amps[r].ref);
+        else
+            sprintf(name_ref, "%s", name);
+        fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n",
+                type, name_ref, stats->nseq);
+        fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n",
+                type, name_ref, stats->nfiltered);
+        fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n",
+                type, name_ref, stats->nfailprimer);
+        fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n",
+                type, name_ref, nmatch);
+
+        int d = 0;
+        do {
+            // From first to last amplicon only, so not entire consensus.
+            // If contig length is known, maybe we want to add the missing
+            // count to < DEPTH figures?
+            int64_t start = 0, covered = 0, total = 0;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t j, offset = amp[i].min_left-1;
+                if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                    fprintf(samtools_stderr, "[ampliconstats] error: "
+                            "Maximum amplicon length (%d) exceeded for '%s'\n",
+                            stats->max_amp, name);
+                    return -1;
+                }
+                for (j = MAX(start, amp[i].max_left-1);
+                     j < MAX(start, amp[i].min_right); j++) {
+                    if (stats->coverage[i*stats->max_amp_len + j-offset]
+                        >= args->min_depth[d])
+                        covered++;
+                    total++;
+                }
+                start = MAX(start, amp[i].min_right);
+            }
+            fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%"
+                    PRId64"\t%"PRId64"\n", type, name_ref,
+                    args->min_depth[d], args->min_depth[d],
+                    total-covered, covered);
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+        free(name_ref);
+    }
+
+    // Read count
+    fprintf(ofp, "# Absolute matching read counts per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cREADS\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            fprintf(ofp, "\t%"PRId64, stats->nreads[i]);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    // Valid depth is the number of full length reads (already divided
+    // by the number we expect to cover), so +0.5 per read in pair.
+    // A.k.a "usable depth" in the plots.
+    fprintf(ofp, "%cVDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++)
+            fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]);
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CREADS\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CREADS\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                double n1 = stats->nreads[i];
+                fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0
+                        ? sqrt(stats->nreads2[i]/(double)nfile
+                               - (n1/nfile)*(n1/nfile))
+                        : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    fprintf(ofp, "# Read percentage of distribution between amplicons.\n");
+    fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cRPERC\t%s", type, name);
+    int all_nseq = 0;
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        all_nseq  += stats->nseq - stats->nfiltered - stats->nfailprimer;
+    }
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            if (type == 'C') {
+                fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile);
+            } else {
+                fprintf(ofp, "\t%.3f",
+                        all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0);
+            }
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we compute mean and standard deviation too
+        fprintf(ofp, "CRPERC\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CRPERC\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            for (i = 0; i < amps[r].namp; i++) {
+                // variance = SUM(X^2) - ((SUM(X)^2) / N)
+                double n1 = stats->nrperc[i];
+                double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.3f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Base depth
+    fprintf(ofp, "# Read depth per amplicon.\n");
+    fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type);
+    fprintf(ofp, "%cDEPTH\t%s", type, name);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        amplicon_t *amp = amps[r].amp;
+        for (i = 0; i < amps[r].namp; i++) {
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            int64_t alen = amp[i].min_right - amp[i].max_left+1;
+            fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0);
+        }
+    }
+    fprintf(ofp, "\n");
+
+    if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        fprintf(ofp, "CDEPTH\tMEAN");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer;
+            for (i = 0; i < amps[r].namp; i++) {
+                int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0);
+            }
+        }
+        fprintf(ofp, "\n");
+
+        fprintf(ofp, "CDEPTH\tSTDDEV");
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].sites)
+                continue;
+            astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+            amplicon_t *amp = amps[r].amp;
+            for (i = 0; i < amps[r].namp; i++) {
+                double alen = amp[i].min_right - amp[i].max_left+1;
+                double n1 = stats->nbases[i] / alen;
+                double v = stats->nbases2[i] / (alen*alen) /nfile
+                    - (n1/nfile)*(n1/nfile);
+                fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+            }
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // Percent Coverage
+    if (type == 'F') {
+        fprintf(ofp, "# Percentage coverage per amplicon\n");
+        fprintf(ofp, "# Use 'grep ^%cPCOV | cut -f 2-' to extract this part.\n", type);
+        int d = 0;
+        do {
+            fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name);
+
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                amplicon_t *amp = amps[r].amp;
+                for (i = 0; i < amps[r].namp; i++) {
+                    int covered = 0;
+                    if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) {
+                        fprintf(samtools_stderr, "[ampliconstats] error: "
+                                "Maximum amplicon length (%d) exceeded for '%s'\n",
+                                stats->max_amp, name);
+                        return -1;
+                    }
+                    int64_t j, offset = amp[i].min_left-1;
+                    for (j = amp[i].max_left-1; j < amp[i].min_right; j++) {
+                        int apos = i*stats->max_amp_len + j-offset;
+                        if (stats->coverage[apos] >= args->min_depth[d])
+                            covered++;
+                    }
+                    int64_t alen = amp[i].min_right - amp[i].max_left+1;
+                    stats->covered_perc[i][d] = 100.0 * covered / alen;
+                    fprintf(ofp, "\t%.2f", 100.0 * covered / alen);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+
+    } else if (type == 'C') {
+        // For combined we can compute mean & standard deviation too
+        int d = 0;
+        do {
+            fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile);
+                }
+            }
+            fprintf(ofp, "\n");
+
+            fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]);
+            for (r = 0; r < nref; r++) {
+                if (!amps[r].sites)
+                    continue;
+                astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+                for (i = 0; i < amps[r].namp; i++) {
+                    double n1 = stats->covered_perc[i][d] / nfile;
+                    double v = stats->covered_perc2[i][d] / nfile - n1*n1;
+                    fprintf(ofp, "\t%.1f", v>0?sqrt(v):0);
+                }
+            }
+            fprintf(ofp, "\n");
+        } while (++d < MAX_DEPTH && args->min_depth[d]);
+    }
+
+    // Plus base depth for all reads, irrespective of amplicon.
+    // This is post overlap removal, if reads in the read-pair overlap.
+    fprintf(ofp, "# Depth per reference base for ALL data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n",
+            type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_ALL\t%s", type, name);
+
+        for (i = 0; i < amps[r].len; i++) {
+            // Basic run-length encoding provided all values are within
+            // +- depth_bin fraction of the mid-point.
+            int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_all[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // And depth for only reads matching to a single amplicon for full
+    // length.  This is post read overlap removal.
+    fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n");
+    fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this "
+            "part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        if (args->multi_ref)
+            fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref);
+        else
+            fprintf(ofp, "%cDP_VALID\t%s", type, name);
+
+        for (i = 0; i < amps[r].len; i++) {
+            int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j;
+            double dmid = (dmin + dmax)/2.0;
+            double low  = dmid*(1-args->depth_bin);
+            double high = dmid*(1+args->depth_bin);
+            for (j = i+1; j < amps[r].len; j++) {
+                int d = stats->depth_valid[j];
+                if (d < low || d > high)
+                    break;
+                if (dmin > d) {
+                    dmin = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                } else if (dmax < d) {
+                    dmax = d;
+                    dmid = (dmin + dmax)/2.0;
+                    low  = dmid*(1-args->depth_bin);
+                    high = dmid*(1+args->depth_bin);
+                }
+            }
+            fprintf(ofp, "\t%d,%d", (int)dmid, j-i);
+            i = j-1;
+        }
+        fprintf(ofp, "\n");
+    }
+
+    // TCOORD (start to end) distribution
+    fprintf(ofp, "# Distribution of aligned template coordinates.\n");
+    fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type);
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0 - (nref==1); i < amps[r].namp; i++) {
+            if (ntcoord < kh_size(stats->tcoord[i+1])) {
+                ntcoord = kh_size(stats->tcoord[i+1]);
+                tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp));
+                if (!tmp) {
+                    free(tpos);
+                    return -1;
+                }
+                tpos = tmp;
+            }
+
+            khiter_t k;
+            size_t n = 0, j;
+            for (k = kh_begin(stats->tcoord[i+1]);
+                 k != kh_end(stats->tcoord[i+1]); k++) {
+                if (!kh_exist(stats->tcoord[i+1], k) ||
+                    (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0)
+                    continue;
+                // Key is start,end in 32-bit quantities.
+                // Yes this limits us to 4Gb references, but just how
+                // many primers are we planning on making?  Not that many
+                // I hope.
+                tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].end   = kh_key(stats->tcoord[i+1], k)>>32;
+
+                // Value is frequency (top 32-bits) and status (bottom 32).
+                tpos[n].freq   = kh_value(stats->tcoord[i+1], k)&0xffffffff;
+                tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32;
+                n++;
+            }
+
+            if (args->tcoord_bin > 1)
+                aggregate_tcoord(args, tpos, &n);
+
+            fprintf(ofp, "%cTCOORD\t%s\t%d", type, name,
+                    i+1+amps[r].first_amp); // per amplicon
+            for (j = 0; j < n; j++) {
+                if (tpos[j].freq < args->tcoord_min_count)
+                    continue;
+                fprintf(ofp, "\t%d,%d,%u,%u",
+                        tpos[j].start,
+                        tpos[j].end,
+                        tpos[j].freq,
+                        tpos[j].status);
+            }
+            fprintf(ofp, "\n");
+        }
+    }
+
+
+    // AMP length distribution.
+    // 0 = both ends in this amplicon
+    // 1 = ends in different amplicons
+    // 2 = other end matching an unknown amplicon site
+    //     (see tcoord for further analysis of where)
+    fprintf(ofp, "# Classification of amplicon status.  Columns are\n");
+    fprintf(ofp, "# number with both primers from this amplicon, number with\n");
+    fprintf(ofp, "# primers from different amplicon, and number with a position\n");
+    fprintf(ofp, "# not matching any valid amplicon primer site\n");
+    fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type);
+
+    fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged
+    int amp_dist[3] = {0};
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps
+            amp_dist[0] += stats->amp_dist[i][0];
+            amp_dist[1] += stats->amp_dist[i][1];
+            amp_dist[2] += stats->amp_dist[i][2];
+        }
+    }
+    fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]);
+
+    for (r = 0; r < nref; r++) {
+        if (!amps[r].sites)
+            continue;
+        astats_t *stats = local ? amps[r].lstats : amps[r].gstats;
+        for (i = 0; i < amps[r].namp; i++) {
+            // per amplicon
+            fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp);
+            fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0],
+                    stats->amp_dist[i][1], stats->amp_dist[i][2]);
+        }
+    }
+
+    free(tpos);
+    return 0;
+}
+
+// Dump the per-file ("local") statistics.
+// Forwards every argument to dump_stats() and passes 1 as its final
+// argument (presumably the lstats/gstats selector -- dump_stats()'s
+// signature is not visible here).  Returns whatever dump_stats() returns.
+int dump_lstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int selector = 1;
+    return dump_stats(args, type, name, nfile, amps, nref, selector);
+}
+
+// Dump the accumulated ("global") statistics.
+// Forwards every argument to dump_stats() and passes 0 as its final
+// argument (presumably the lstats/gstats selector -- dump_stats()'s
+// signature is not visible here).  Returns whatever dump_stats() returns.
+int dump_gstats(astats_args_t *args, char type, char *name, int nfile,
+                amplicons_t *amps, int nref) {
+    const int selector = 0;
+    return dump_stats(args, type, name, nfile, amps, nref, selector);
+}
+
+// Look up the SM (sample name) tag of an @RG header line.
+//
+// When RG is non-NULL the line is selected by ID == RG; when RG is NULL
+// no ID key/value is supplied to sam_hdr_find_tag_id().
+//
+// Returns a heap-allocated string that the caller must free(), or NULL
+// when the lookup fails.
+char const *get_sample_name(sam_hdr_t *header, char *RG) {
+    kstring_t ks = {0};
+    if (sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks) < 0) {
+        // Do not hand back a partially filled buffer as a sample name.
+        free(ks.s);
+        return NULL;
+    }
+    return ks.s;
+}
+
+// Reference length lookup.
+//   SQ == NULL: the length of the longest reference in the header
+//               (0 when the header has no references).
+//   SQ != NULL: the length of the reference named SQ, or -1 when
+//               sam_hdr_name2tid() does not return a valid tid.
+int64_t get_ref_len(sam_hdr_t *header, const char *SQ) {
+    if (!SQ) {
+        int64_t longest = 0;
+        int count = sam_hdr_nref(header), t;
+        for (t = 0; t < count; t++) {
+            int64_t cur = sam_hdr_tid2len(header, t);
+            if (cur > longest)
+                longest = cur;
+        }
+        return longest;
+    }
+
+    int tid = sam_hdr_name2tid(header, SQ);
+    if (tid < 0)
+        return -1;
+    return sam_hdr_tid2len(header, tid);
+}
+
+// Gather amplicon statistics for every input file and write the report
+// to args->out_fp.
+//
+//   args      parsed command line options
+//   bed_hash  primer sites from the BED file, looked up by reference name
+//   filev     input file names
+//   filec     number of entries in filev
+//
+// The header of the first file is read up front to print the "SS" summary
+// and to allocate the per-reference state.  Each file is then read in turn:
+// its statistics are written with dump_lstats() (type 'F') and handed to
+// append_stats().  Finally dump_gstats() writes the "COMBINED" (type 'C')
+// section.
+//
+// Returns 0 on success, -1 on failure.
+static int amplicon_stats(astats_args_t *args,
+                          khash_t(bed_list_hash) *bed_hash,
+                          char **filev, int filec) {
+    int i, ref = -1, ret = -1, nref = 0;
+    samFile *fp = NULL;
+    sam_hdr_t *header = NULL;
+    bam1_t *b = bam_init1();
+    FILE *ofp = args->out_fp;
+    char sname_[8192], *sname = NULL;
+    amplicons_t *amps = NULL;
+
+    if (!b) {
+        print_error_errno("ampliconstats", "Could not allocate BAM record");
+        goto err;
+    }
+
+    // Report initial SS header.  We gather data from the bed_hash entries
+    // as well as from the first SAM header (with the requirement that all
+    // headers should be compatible).
+    if (filec) {
+        if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[0]);
+            goto err;
+        }
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (!amps) {
+            int bam_nref = sam_hdr_nref(header);
+            amps = calloc(bam_nref, sizeof(*amps));
+            if (!amps)
+                goto err;
+            // Only publish nref once amps exists; the cleanup code below
+            // walks amps[0..nref-1] unconditionally.
+            nref = bam_nref;
+
+            fprintf(ofp, "# Summary statistics, used for scaling the plots.\n");
+            fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version());
+            fprintf(ofp, "SS\tCommand line: %s\n", args->argv);
+            fprintf(ofp, "SS\tNumber of files:\t%d\n", filec);
+
+            // Note: order of hash entries will be different to order of
+            // BED file which may also differ to order of SQ headers.
+            // SQ header is canonical ordering (pos sorted file).
+            khiter_t k;
+            for (i = 0; i < bam_nref; i++) {
+                k = kh_get(bed_list_hash, bed_hash,
+                           sam_hdr_tid2name(header, i));
+                // kh_get() returns kh_end() for a missing key; kh_exist()
+                // must not be evaluated on that value.
+                if (k == kh_end(bed_hash))
+                    continue;
+
+                bed_entry_list_t *sites = &kh_value(bed_hash, k);
+
+                ref = i;
+                amps[ref].ref = kh_key(bed_hash, k);
+                amps[ref].sites = sites;
+                amps[ref].namp = count_amplicon(sites);
+                amps[ref].amp  = calloc(sites->length,
+                                        sizeof(*amps[ref].amp));
+                if (!amps[ref].amp)
+                    goto err;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n",
+                            kh_key(bed_hash, k), amps[ref].namp);
+                else
+                    fprintf(ofp, "SS\tNumber of amplicons:\t%d\n",
+                            amps[ref].namp);
+
+                amps[ref].tid = ref;
+
+                int64_t len = get_ref_len(header, kh_key(bed_hash, k));
+                amps[ref].len = len;
+                if (args->multi_ref)
+                    fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n",
+                            kh_key(bed_hash, k), len);
+                else
+                    fprintf(ofp, "SS\tReference length:\t%"PRId64"\n",
+                            len);
+
+                amps[ref].lstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                amps[ref].gstats = stats_alloc(len, args->max_amp,
+                                               args->max_amp_len);
+                if (!amps[ref].lstats || !amps[ref].gstats)
+                    goto err;
+            }
+        }
+
+        sam_hdr_destroy(header);
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+    }
+    fprintf(ofp, "SS\tEnd of summary\n");
+
+    // Extract the bits of amplicon data we need from bed hash and turn
+    // it into a position-to-amplicon lookup table.
+    int offset = 0;
+    for (i = 0; i < nref; i++) {
+        if (!amps[i].sites)
+            continue;
+
+        amps[i].first_amp = offset;
+        if (bed2amplicon(args, amps[i].sites, amps[i].amp,
+                         &amps[i].namp, i==0, amps[i].ref, offset) < 0)
+            goto err;
+
+        offset += amps[i].namp; // cumulative amplicon number across refs
+    }
+
+    // Now iterate over file contents, one at a time.
+    for (i = 0; i < filec; i++) {
+        char *nstart = filev[i];
+
+        fp = sam_open_format(filev[i], "r", &args->ga.in);
+        if (!fp) {
+            print_error_errno("ampliconstats",
+                              "Cannot open input file \"%s\"",
+                              filev[i]);
+            goto err;
+        }
+
+        if (args->ga.nthreads > 0)
+            hts_set_threads(fp, args->ga.nthreads);
+
+        if (!(header = sam_hdr_read(fp)))
+            goto err;
+
+        if (nref != sam_hdr_nref(header)) {
+            print_error_errno("ampliconstats",
+                              "SAM headers are not consistent across input files");
+            goto err;
+        }
+        int r;
+        // NOTE(review): references that have no BED entry keep
+        // amps[r].ref == NULL and therefore also take this error path --
+        // confirm that rejecting such inputs is intended.
+        for (r = 0; r < nref; r++) {
+            if (!amps[r].ref ||
+                strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
+                amps[r].len != sam_hdr_tid2len(header, r)) {
+                print_error_errno("ampliconstats",
+                                  "SAM headers are not consistent across "
+                                  "input files");
+                goto err;
+            }
+        }
+
+        if (args->use_sample_name)
+            sname = (char *)get_sample_name(header, NULL);
+
+        if (!sname) {
+            // Fall back to the file name: strip the directory part and a
+            // trailing .bam/.sam/.cram, truncated to fit sname_.
+            sname = sname_;
+            char *nend = filev[i] + strlen(filev[i]), *cp;
+            if ((cp = strrchr(filev[i], '/')))
+                nstart = cp+1;
+            if ((cp = strrchr(nstart, '.')) &&
+                (strcmp(cp, ".bam") == 0 ||
+                 strcmp(cp, ".sam") == 0 ||
+                 strcmp(cp, ".cram") == 0))
+                nend = cp;
+            if (nend - nstart >= 8192) nend = nstart+8191;
+            memcpy(sname, nstart, nend-nstart);
+            sname[nend-nstart] = 0;
+        }
+
+        // Stats local to this sample only
+        amp_stats_reset(amps, nref);
+
+        int last_ref = -9;
+        while ((r = sam_read1(fp, header, b)) >= 0) {
+            // Other filter options useful here?
+            if (b->core.tid < 0)
+                continue;
+
+            if (last_ref != b->core.tid) {
+                last_ref  = b->core.tid;
+                if (initialise_amp_pos_lookup(args, amps, last_ref) < 0)
+                    goto err;
+            }
+
+            if (accumulate_stats(args, amps, b) < 0)
+                goto err;
+        }
+
+        if (r < -1) {
+            print_error_errno("ampliconstats", "Fail reading record");
+            goto err;
+        }
+
+        sam_hdr_destroy(header);
+        // Clear immediately: the error path below destroys any non-NULL
+        // header, so a failing sam_close() must not see a stale pointer.
+        header = NULL;
+        if (sam_close(fp) < 0) {
+            fp = NULL;
+            goto err;
+        }
+        fp = NULL;
+
+        if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0)
+            goto err;
+
+        if (append_stats(amps, nref) < 0)
+            goto err;
+
+        if (sname && sname != sname_)
+            free(sname);
+        sname = NULL;
+    }
+
+    if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    bam_destroy1(b);
+    if (ret) {
+        if (header)
+            sam_hdr_destroy(header);
+        if (fp)
+            sam_close(fp);
+    }
+    // nref is only non-zero once amps has been allocated.
+    for (i = 0; i < nref; i++) {
+        stats_free(amps[i].lstats);
+        stats_free(amps[i].gstats);
+        free(amps[i].amp);
+    }
+    free(amps);
+    free(pos2start);
+    free(pos2end);
+    if (ret) {
+        if (sname && sname != sname_)
+            free(sname);
+    }
+
+    return ret;
+}
+
+// Print the ampliconstats help text to fp.
+//
+//   args         supplies the values shown in brackets for -f, -F, -d and -m
+//   fp           stream the text is written to
+//   exit_status  returned unchanged, so callers can "return usage(...)"
+static int usage(astats_args_t *args, FILE *fp, int exit_status) {
+    fprintf(fp,
+"\n"
+"Usage: samtools ampliconstats [options] primers.bed *.bam > astats.txt\n"
+"\n"
+"Options:\n");
+    fprintf(fp, "  -f, --required-flag STR|INT\n"
+            "               Only include reads with all of the FLAGs present [0x%X]\n",args->flag_require);
+    fprintf(fp, "  -F, --filter-flag STR|INT\n"
+            "               Only include reads with none of the FLAGs present [0x%X]\n",args->flag_filter & 0xffff);
+    fprintf(fp, "  -a, --max-amplicons INT\n"
+            "               Change the maximum number of amplicons permitted [%d]\n", MAX_AMP);
+    fprintf(fp, "  -l, --max-amplicon-length INT\n"
+            "               Change the maximum length of an individual amplicon [%d]\n", MAX_AMP_LEN);
+    fprintf(fp, "  -d, --min-depth INT[,INT]...\n"
+            "               Minimum base depth(s) to consider position covered [%d]\n", args->min_depth[0]);
+    fprintf(fp, "  -m, --pos-margin INT\n"
+            "               Margin of error for matching primer positions [%d]\n", args->max_delta);
+    // The help text names the stream as users know it ("stdout"), not the
+    // internal samtools_stdout symbol.
+    fprintf(fp, "  -o, --output FILE\n"
+            "               Specify output file [stdout if unset]\n");
+    fprintf(fp, "  -s, --use-sample-name\n"
+            "               Use the sample name from the first @RG header line\n");
+    fprintf(fp, "  -t, --tlen-adjust INT\n"
+            "               Add/subtract from TLEN; use when clipping but no fixmate step\n");
+    fprintf(fp, "  -b, --tcoord-bin INT\n"
+            "               Bin template start,end positions into multiples of INT[1]\n");
+    fprintf(fp, "  -c, --tcoord-min-count INT\n"
+            "               Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT);
+    fprintf(fp, "  -D, --depth-bin FRACTION\n"
+            "               Merge FDP values within +/- FRACTION together\n");
+    fprintf(fp, "  -S, --single-ref\n"
+            "               Force single-ref (<=1.12) output format\n");
+    sam_global_opt_help(fp, "I.--.@");
+
+    return exit_status;
+}
+
+// Entry point for "samtools ampliconstats".
+//
+// Parses the command line, loads the primer BED file named by the first
+// non-option argument, and runs amplicon_stats() over the remaining
+// arguments (or over "-" when none are given).
+//
+// Returns the value of usage() for help/option errors, 1 for set-up
+// failures, and otherwise the result of amplicon_stats().
+int main_ampliconstats(int argc, char **argv) {
+    astats_args_t args = {
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .flag_require = 0,
+        .flag_filter = 0x10B04,
+        //.sites = BED_LIST_INIT,
+        .max_delta = 30, // large enough to cope with alt primers
+        .min_depth = {1},
+        .use_sample_name = 0,
+        .max_amp = MAX_AMP,
+        .max_amp_len = MAX_AMP_LEN,
+        .tlen_adj = 0,
+        .out_fp = samtools_stdout,
+        .tcoord_min_count = TCOORD_MIN_COUNT,
+        .tcoord_bin = 1,
+        .depth_bin = 0.01,
+        .multi_ref = 1
+    }, oargs = args;
+
+    static const struct option loptions[] =
+    {
+        SAM_OPT_GLOBAL_OPTIONS('I', 0, '-', '-', 0, '@'),
+        {"help", no_argument, NULL, 'h'},
+        {"flag-require", required_argument, NULL, 'f'},
+        {"flag-filter", required_argument, NULL, 'F'},
+        // The spellings printed by usage(); accepted as aliases so the
+        // documented long options actually work.
+        {"required-flag", required_argument, NULL, 'f'},
+        {"filter-flag", required_argument, NULL, 'F'},
+        {"min-depth", required_argument, NULL, 'd'},
+        {"output", required_argument, NULL, 'o'},
+        {"pos-margin", required_argument, NULL, 'm'},
+        {"use-sample-name", no_argument, NULL, 's'},
+        {"max-amplicons", required_argument, NULL, 'a'},
+        {"max-amplicon-length", required_argument, NULL, 'l'},
+        {"tlen-adjust", required_argument, NULL, 't'},
+        {"tcoord-min-count", required_argument, NULL, 'c'},
+        {"tcoord-bin", required_argument, NULL, 'b'},
+        {"depth-bin", required_argument, NULL, 'D'},
+        {"single-ref", no_argument, NULL, 'S'},
+        {NULL, 0, NULL, 0}
+    };
+    int opt;
+    int ret = 1;
+    khash_t(bed_list_hash) *bed_hash = NULL;
+    khiter_t k, ref_count = 0;
+
+    while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) {
+        switch (opt) {
+        case 'f': args.flag_require = bam_str2flag(optarg); break;
+        case 'F':
+            if (args.flag_filter & 0x10000)
+                args.flag_filter = 0; // strip default on first -F usage
+            args.flag_filter |= bam_str2flag(optarg); break;
+
+        case 'm': args.max_delta = atoi(optarg); break; // margin
+        case 'D': args.depth_bin = atof(optarg); break; // depth bin fraction
+        case 'd': {
+            // Comma separated list; at most MAX_DEPTH values are stored.
+            int d = 0;
+            char *cp = optarg, *ep;
+            do {
+                long n = strtol(cp, &ep, 10);
+                args.min_depth[d++] = n;
+                if (*ep != ',')
+                    break;
+                cp = ep+1;
+            } while (d < MAX_DEPTH);
+            break;
+        }
+
+        case 'a': args.max_amp = atoi(optarg)+1;break;
+        case 'l': args.max_amp_len = atoi(optarg)+1;break;
+
+        case 'c': args.tcoord_min_count = atoi(optarg);break;
+        case 'b':
+            args.tcoord_bin = atoi(optarg);
+            if (args.tcoord_bin < 1)
+                args.tcoord_bin = 1;
+            break;
+
+        case 't': args.tlen_adj = atoi(optarg);break;
+
+        case 's': args.use_sample_name = 1;break;
+
+        case 'o':
+            if (!(args.out_fp = fopen(optarg, "w"))) {
+                perror(optarg);
+                return 1;
+            }
+            break;
+
+        case 'S':
+            args.multi_ref = 0;
+            break;
+
+        case '?': return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+        case 'h': return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+
+        default:
+            // A rejected global option is fatal, like '?' above.
+            if (parse_sam_global_opt(opt, optarg, loptions, &args.ga) != 0)
+                return usage(&oargs,samtools_stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (argc <= optind)
+        return usage(&oargs, samtools_stdout, EXIT_SUCCESS);
+    if (argc <= optind+1 && isatty(STDIN_FILENO))
+        return usage(&oargs, samtools_stderr, EXIT_FAILURE);
+
+    bed_hash = kh_init(bed_list_hash);
+    if (!bed_hash) {
+        print_error_errno("ampliconstats", "Could not allocate BED hash");
+        goto close_out;
+    }
+    if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) {
+        print_error_errno("ampliconstats",
+                          "Could not read file \"%s\"", argv[optind]);
+        // NOTE(review): bed_hash is deliberately not destroyed here, as in
+        // the original code; whether load_bed_file_multi_ref() releases it
+        // on failure is not visible from this file -- confirm.
+        goto close_out;
+    }
+
+    // Count the references that have at least one BED entry.
+    for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) {
+        if (!kh_exist(bed_hash, k))
+            continue;
+        ref_count++;
+    }
+    if (ref_count == 0) {
+        print_error("ampliconstats",
+                    "No entries found in file \"%s\"", argv[optind]);
+        goto free_hash;
+    }
+    if (ref_count > 1 && args.multi_ref == 0) {
+        print_error("ampliconstats",
+                    "Single-ref mode is not permitted for BED files\n"
+                    "containing more than one reference.");
+        goto free_hash;
+    }
+
+    args.argv = stringify_argv(argc, argv);
+    if (!args.argv) {
+        // args.argv is later printed with "%s"; it must not be NULL.
+        print_error_errno("ampliconstats", "Could not record command line");
+        goto free_hash;
+    }
+
+    if (argc == ++optind) {
+        char *av = "-";
+        ret = amplicon_stats(&args, bed_hash, &av, 1);
+    } else {
+        ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind);
+    }
+
+ free_hash:
+    free(args.argv);
+    destroy_bed_hash(bed_hash);
+
+ close_out:
+    // Close a file opened by -o so buffered output is flushed and write
+    // errors are reported; the default stream is left open.
+    if (args.out_fp != samtools_stdout) {
+        if (fclose(args.out_fp) != 0 && ret == 0) {
+            print_error_errno("ampliconstats", "Error closing output file");
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
diff --git a/samtools/bam.c b/samtools/bam.c

index 0c1a06bae99ff688f0d72336e13296b33aff1edb..926062c71dd5cd0c61f53a99c1cd69bde48541cc 100644 (file)
--- a/samtools/bam.c
+++ b/samtools/bam.c
@@ -1,6 +1,6 @@
  /*  bam.c -- BAM format.
  
-    Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -125,21 +125,21 @@ int bam_remove_B(bam1_t *b)
      uint8_t *seq, *qual, *p;
      // test if removal is necessary
      if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
-    cigar = bam1_cigar(b);
+    cigar = bam_get_cigar(b);
      for (k = 0; k < b->core.n_cigar; ++k)
          if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
      if (k == b->core.n_cigar) return 0; // no 'B'
      if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
      // allocate memory for the new CIGAR
-    if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
-        b->m_data = b->data_len + b->core.n_cigar * 4;
+    if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+        b->m_data = b->l_data + b->core.n_cigar * 4;
          kroundup32(b->m_data);
          b->data = (uint8_t*)realloc(b->data, b->m_data);
-        cigar = bam1_cigar(b); // after realloc, cigar may be changed
+        cigar = bam_get_cigar(b); // after realloc, cigar may be changed
      }
      new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
      // the core loop
-    seq = bam1_seq(b); qual = bam1_qual(b);
+    seq = bam_get_seq(b); qual = bam_get_qual(b);
      no_qual = (qual[0] == 0xff); // test whether base quality is available
      i = j = 0; end_j = -1;
      for (k = l = 0; k < b->core.n_cigar; ++k) {
@@ -168,9 +168,9 @@ int bam_remove_B(bam1_t *b)
                  if (i != j) { // no need to copy if i == j
                      int u, c, c0;
                      for (u = 0; u < len; ++u) { // construct the consensus
-                        c = bam1_seqi(seq, i+u);
+                        c = bam_seqi(seq, i+u);
                          if (j + u < end_j) { // in an overlap
-                            c0 = bam1_seqi(seq, j+u);
+                            c0 = bam_seqi(seq, j+u);
                              if (c != c0) { // a mismatch; choose the better base
                                  if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
                                      bam1_seq_seti(seq, j+u, c);
@@ -202,9 +202,9 @@ int bam_remove_B(bam1_t *b)
      p = b->data + b->core.l_qname + l * 4;
      memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
      memmove(p, qual, j); p += j; // set QUAL
-    memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
+    memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
      b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
-    b->data_len = p - b->data; // update record length
+    b->l_data = p - b->data; // update record length
      return 0;
  
  rmB_err:
diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c

index 4c41e23acf1e65c7d271a6093cd3290764e44f02..2f40ca642038e264bac7b1ccf550253f4e55bf4d 100644 (file)
--- a/samtools/bam.c.pysam.c
+++ b/samtools/bam.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam.c -- BAM format.
  
-    Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -127,21 +127,21 @@ int bam_remove_B(bam1_t *b)
      uint8_t *seq, *qual, *p;
      // test if removal is necessary
      if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
-    cigar = bam1_cigar(b);
+    cigar = bam_get_cigar(b);
      for (k = 0; k < b->core.n_cigar; ++k)
          if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
      if (k == b->core.n_cigar) return 0; // no 'B'
      if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
      // allocate memory for the new CIGAR
-    if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
-        b->m_data = b->data_len + b->core.n_cigar * 4;
+    if (b->l_data + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
+        b->m_data = b->l_data + b->core.n_cigar * 4;
          kroundup32(b->m_data);
          b->data = (uint8_t*)realloc(b->data, b->m_data);
-        cigar = bam1_cigar(b); // after realloc, cigar may be changed
+        cigar = bam_get_cigar(b); // after realloc, cigar may be changed
      }
      new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
      // the core loop
-    seq = bam1_seq(b); qual = bam1_qual(b);
+    seq = bam_get_seq(b); qual = bam_get_qual(b);
      no_qual = (qual[0] == 0xff); // test whether base quality is available
      i = j = 0; end_j = -1;
      for (k = l = 0; k < b->core.n_cigar; ++k) {
@@ -170,9 +170,9 @@ int bam_remove_B(bam1_t *b)
                  if (i != j) { // no need to copy if i == j
                      int u, c, c0;
                      for (u = 0; u < len; ++u) { // construct the consensus
-                        c = bam1_seqi(seq, i+u);
+                        c = bam_seqi(seq, i+u);
                          if (j + u < end_j) { // in an overlap
-                            c0 = bam1_seqi(seq, j+u);
+                            c0 = bam_seqi(seq, j+u);
                              if (c != c0) { // a mismatch; choose the better base
                                  if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
                                      bam1_seq_seti(seq, j+u, c);
@@ -204,9 +204,9 @@ int bam_remove_B(bam1_t *b)
      p = b->data + b->core.l_qname + l * 4;
      memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
      memmove(p, qual, j); p += j; // set QUAL
-    memmove(p, bam1_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
+    memmove(p, bam_get_aux(b), bam_get_l_aux(b)); p += bam_get_l_aux(b); // set optional fields
      b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
-    b->data_len = p - b->data; // update record length
+    b->l_data = p - b->data; // update record length
      return 0;
  
  rmB_err:
diff --git a/samtools/bam.h b/samtools/bam.h

index 8c9d33af6683bca117d127e79d8b4748a4a7351f..804d590ec7bb83c7bbaf23863ab67699ebdfe1cf 100644 (file)
--- a/samtools/bam.h
+++ b/samtools/bam.h
@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE.  */
    @copyright Genome Research Ltd.
   */
  
-#define BAM_VERSION "1.10"
+#define BAM_VERSION "1.13"
  
  #include <stdint.h>
  #include <stdlib.h>
@@ -77,7 +77,7 @@ typedef bam_hdr_t bam_header_t;
  #define BAM_OFHEX          1
  #define BAM_OFSTR          2
  
-/*! @abstract defautl mask for pileup */
+/*! @abstract default mask for pileup */
  #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
  
  /*! @typedef
diff --git a/samtools/bam2bcf_indel.c b/samtools/bam2bcf_indel.c

index 104d108074ef2e5e48b3b27ac583700ae84c9176..17dedf008f29b54fa7fc5e4ff01861a02185bd03 100644 (file)
--- a/samtools/bam2bcf_indel.c
+++ b/samtools/bam2bcf_indel.c
@@ -408,6 +408,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf
                  { // do realignment; this is the bottleneck
                      const uint8_t *qual = bam_get_qual(p->b), *bq;
                      uint8_t *qq;
+                    if (qend < qbeg) {
+                        fprintf(stderr, "Impossible data in bcf_call_gap_prep\n");
+                        exit(1);
+                    }
                      qq = calloc(qend - qbeg, 1);
                      bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
                      if (bq) ++bq; // skip type
diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c

index 583f99dd48ed205ba264ee9de8ce781e74d2a399..6706298369b49eb99bdcc1d368aaf3587004f458 100644 (file)
--- a/samtools/bam2bcf_indel.c.pysam.c
+++ b/samtools/bam2bcf_indel.c.pysam.c
@@ -410,6 +410,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf
                  { // do realignment; this is the bottleneck
                      const uint8_t *qual = bam_get_qual(p->b), *bq;
                      uint8_t *qq;
+                    if (qend < qbeg) {
+                        fprintf(samtools_stderr, "Impossible data in bcf_call_gap_prep\n");
+                        samtools_exit(1);
+                    }
                      qq = calloc(qend - qbeg, 1);
                      bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
                      if (bq) ++bq; // skip type
diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c

index 4b537c763912ce8759513aa148dbc5d79d93ec55..5253dfa85a458b355435b3177144128b8d290be7 100644 (file)
--- a/samtools/bam2depth.c
+++ b/samtools/bam2depth.c
@@ -1,9 +1,11 @@
  /*  bam2depth.c -- depth subcommand.
  
      Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd.
+    Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+
+    Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
+    Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
  
-    Author: Heng Li <lh3@sanger.ac.uk>
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -24,7 +26,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.  */
  
  /* This program demonstrates how to generate pileup from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
   * To compile this program separately, you may:
   *
   *   gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
@@ -41,355 +43,913 @@ DEALINGS IN THE SOFTWARE.  */
  #include "samtools.h"
  #include "bedidx.h"
  #include "sam_opts.h"
+#include "htslib/khash.h"
  
-#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1)
+// From bam_plcmd.c
+int read_file_list(const char *file_list, int *n, char **argv[]);
  
-typedef struct {     // auxiliary data structure
-    samFile *fp;     // the file handle
-    sam_hdr_t *hdr;  // the file header
-    hts_itr_t *iter; // NULL if a region not specified
-    int min_mapQ, min_len; // mapQ filter; length filter
-    uint32_t flags;  // read filtering flags
-} aux_t;
+// We accumulate to hist[pos & (size-1)].  This is a ring-buffer.
+// We track how far output has been written and the highest coordinate
+// accumulated so far (both in absolute, unmasked coordinates) in
+// "last_output" and "end_pos" respectively.
+// For each new record we flush anything not yet written between
+// "last_output" and this read's start position, and initialise
+// any newly seen positions between "end_pos" and this read's
+// end position.
+typedef struct {
+    size_t size;        // ring-buffer length per file; 0 until first use, then 2048 and doubling
+    int **hist;         // hist[nfiles][size]
+    hts_pos_t *end_pos; // end_pos[nfiles]
+    hts_pos_t last_output; // next 0-based position still to be written out
+    int last_ref;       // tid of the reference being accumulated; negative before the first read
+    int nfiles;         // number of input files (rows of hist / entries of end_pos)
+    const char *ref;    // name of last_ref, from sam_hdr_tid2name()
+    kstring_t ks;       // reusable output line; holds the "<ref>\t" prefix between rows
+    hts_pos_t beg, end; // limit to region; both -1 when no region is set
+    int tid;            // reference id of the region iterator; 0 when there is none
+} depth_hist;
  
-// This function reads a BAM alignment from one BAM file.
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
-{
-    aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
-    int ret;
-    while (1)
-    {
-        ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
-        if ( ret<0 ) break;
-        if ( b->core.flag & aux->flags) continue;
-        if ( (int)b->core.qual < aux->min_mapQ ) continue;
-        if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
-        break;
+typedef struct {         // Options controlling which reads/bases count and how rows are printed
+    int header;          // non-zero: print the "#CHROM\tPOS\t<file>..." line before any rows
+    int flag;            // reads with any of these FLAG bits set are skipped
+    int min_qual;        // a base only adds depth if its quality is >= this
+    int min_mqual;       // reads with mapping quality below this are skipped
+    int min_len;         // if non-zero, skip reads whose qlen_used() is below this
+    int skip_del;        // non-zero: CIGAR D bases do not add to depth
+    int all_pos;         // non-zero: also print zero-depth positions; >1: also references with no reads (only when reg is NULL)
+    int remove_overlaps; // non-zero: keep a per-file name -> end position hash used to clip overlapping mates
+    FILE *out;           // stream that every output row is written to
+    char *reg;           // NOTE(review): presumably the region string, NULL when none; it is set outside this view - confirm
+    void *bed;           // opaque handle passed to bed_overlap(); NULL disables BED filtering
+} depth_opt;
+
+static void zero_region(depth_opt *opt, depth_hist *dh, // Prints a depth-0 row (one 0 column per file) for each position in [start,end) of "name"
+                        const char *name, hts_pos_t start, hts_pos_t end) { // start/end are 0-based, half-open
+    hts_pos_t i;
+    kstring_t *ks = &dh->ks;
+
+    kputs(name, ks_clear(ks));
+    kputc('\t', ks);
+    size_t cur_l = ks->l; // length of the "<name>\t" prefix; every row is rebuilt after it
+    if (dh->beg >= 0 && start < dh->beg) // clip to the region; negative beg/end mean "no limit"
+        start = dh->beg;
+    if (dh->end >= 0 && end > dh->end)
+        end = dh->end;
+
+    for (i = start; i < end; i++) {
+        // Could be optimised, but needs better API to skip to next
+        // bed region.
+        if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0)
+            continue;
+
+        ks->l = cur_l;
+        kputll(i+1,  ks); // printed coordinates are 1-based
+        int n;
+        for (n = 0; n < dh->nfiles; n++) {
+            kputc_('\t', ks);
+            kputc_('0',  ks);
+        }
+        kputc('\n',  ks);
+        fputs(ks->s, opt->out);
      }
-    return ret;
+    ks->l = cur_l; // leave only the "<name>\t" prefix in the shared buffer
  }
  
-int read_file_list(const char *file_list,int *n,char **argv[]);
-
-static int usage() {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
-    fprintf(stderr, "Options:\n");
-    fprintf(stderr, "   -a                  output all positions (including zero depth)\n");
-    fprintf(stderr, "   -a -a (or -aa)      output absolutely all positions, including unused ref. sequences\n");
-    fprintf(stderr, "   -b <bed>            list of positions or regions\n");
-    fprintf(stderr, "   -X                  use customized index files\n");
-    fprintf(stderr, "   -f <list>           list of input BAM filenames, one per line [null]\n");
-    fprintf(stderr, "   -H                  print a file header\n");
-    fprintf(stderr, "   -l <int>            read length threshold (ignore reads shorter than <int>) [0]\n");
-    fprintf(stderr, "   -d/-m <int>         maximum coverage depth [8000]. If 0, depth is set to the maximum\n"
-                    "                       integer value, effectively removing any depth limit.\n");  // the htslib's default
-    fprintf(stderr, "   -o FILE             where to write output to [stdout]\n");
-    fprintf(stderr, "   -q <int>            base quality threshold [0]\n");
-    fprintf(stderr, "   -Q <int>            mapping quality threshold [0]\n");
-    fprintf(stderr, "   -r <chr:from-to>    region\n");
-    fprintf(stderr, "   -g <flags>          include reads that have any of the specified flags set [0]\n");
-    fprintf(stderr, "   -G <flags>          filter out reads that have any of the specified flags set"
-                    "                       [UNMAP,SECONDARY,QCFAIL,DUP]\n");
-
-    sam_global_opt_help(stderr, "-.--.--.");
-
-    fprintf(stderr, "\n");
-    fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
-    fprintf(stderr, "position, and coverage depth.  Note that positions with zero coverage may be\n");
-    fprintf(stderr, "omitted by default; see the -a option.\n");
-    fprintf(stderr, "\n");
-
-    return EXIT_FAILURE;
+// A variation of bam_cigar2qlen which leaves soft-clips out of the
+// equation.  It returns the number of bases in the query that are aligned
+// in some way to the reference (including insertions, which are considered
+// to be aligned by dint of being anchored either side).
+//
+// b  Alignment record; only its CIGAR and l_qseq are inspected.
+//
+// Returns the aligned query length.  For a CIGAR that is consistent with
+// SEQ the answer is the same whether or not SEQ is stored, including when
+// hard-clips sit outside the soft-clips (e.g. 5H10S50M10S5H gives 50).
+hts_pos_t qlen_used(bam1_t *b) {
+    int n_cigar = b->core.n_cigar;
+    const uint32_t *cigar = bam_get_cigar(b);
+
+    hts_pos_t l;
+
+    if (b->core.l_qseq) {
+        // With SEQ known the answer is l_qseq minus the soft-clips found
+        // at either end.  Full scan not needed, which helps on excessively
+        // long CIGARs.
+        l = b->core.l_qseq;
+        int kl, kr;
+        for (kl = 0; kl < n_cigar; kl++) {
+            int op = bam_cigar_op(cigar[kl]);
+            // Hard-clips may precede the soft-clip.  They are not part of
+            // l_qseq, so step over them and keep looking.
+            if (op == BAM_CHARD_CLIP)
+                continue;
+            if (op != BAM_CSOFT_CLIP)
+                break;
+            l -= bam_cigar_oplen(cigar[kl]);
+        }
+
+        // kl is now the first aligned op (or n_cigar).  Stopping before it
+        // ensures no clip is subtracted twice.
+        for (kr = n_cigar-1; kr > kl; kr--) {
+            int op = bam_cigar_op(cigar[kr]);
+            if (op == BAM_CHARD_CLIP)
+                continue;
+            if (op != BAM_CSOFT_CLIP)
+                break;
+            l -= bam_cigar_oplen(cigar[kr]);
+        }
+    } else {
+        // Unknown SEQ ("*") needs a full scan through the CIGAR string.
+        // Entries are 1 for ops that consume aligned query bases.
+        static const int query[16] = {
+          //M I D N  S H P =  X B ? ?  ? ? ? ?
+            1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0
+        };
+        int k;
+        for (k = l = 0; k < n_cigar; k++)
+            if (query[bam_cigar_op(cigar[k])])
+                l += bam_cigar_oplen(cigar[k]);
+    }
+    return l;
+
  }
  
-int main_depth(int argc, char *argv[])
-{
-    int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0;
-    hts_pos_t beg, end, pos, last_pos = -1;
-    int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
-    const bam_pileup1_t **plp;
-    char *reg = 0; // specified region
-    void *bed = 0; // BED data structure
-    char *file_list = NULL, **fn = NULL;
-    sam_hdr_t *h = NULL; // BAM header of the 1st input
-    aux_t **data;
-    bam_mplp_t mplp;
-    int last_tid = -1, ret;
-    int print_header = 0;
-    char *output_file = NULL;
-    FILE *file_out = stdout;
-    uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
-    int tflags = 0;
+// Adds the depth for a single read to a depth_hist struct.
+// For just one file, this is easy.  We just have a circular buffer
+// where we increment values for positions that overlap existing data
+// and initialise values for coordinates which we're seeing for the first
+// time.  This is tracked by "end_pos" to know where we've got to.
+//
+// As the input is sorted, we can flush output from "last_output" to
+// b->core.pos.
+//
+// With multiple files, we must feed data in sorted order as if all files
+// are merged, but track depth per file.  This also means "end_pos" is per
+// file too, but "last_output" is global as it corresponds to rows printed.
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, // b == NULL: flush pending output and return 0
+                     int overlap_clip, int file) { // overlap_clip: if non-zero, reference positions before it add no depth; file: row of hist/end_pos; returns 0, or -1 on error
+    hts_pos_t i;
+    size_t hmask = dh->size-1; // ring-buffer index mask; recomputed below if the buffer grows
+    int n;
  
-    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
-        { NULL, 0, NULL, 0 }
-    };
+    if (!b || b->core.tid != dh->last_ref) {
+        // New ref, or the final flush when b is NULL
+        if (dh->last_ref >= 0) {
+            // do end: write out everything still buffered for the old ref
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            i = dh->last_output; // redundant: the for-init below assigns it again
+            for (i = dh->last_output; nf; i++) {
+                nf = 0; // counts files that still have data at position i
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
+                }
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
  
-    // parse the command line
-    while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) {
-        switch (n) {
-            case 'l': min_len = atoi(optarg); break; // minimum query length
-            case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
-            case 'b':
-                bed = bed_read(optarg); // BED or position list file can be parsed now
-                if (!bed) {
-                    print_error_errno("depth", "Could not read file \"%s\"", optarg);
-                    return EXIT_FAILURE;
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                  }
-                break;
-            case 'X': has_index_file = 1; break;
-            case 'q': baseQ = atoi(optarg); break;   // base quality threshold
-            case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
-            case 'f': file_list = optarg; break;
-            case 'a': all++; break;
-            case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
-            case 'H': print_header = 1; break;
-            case 'o': output_file = optarg; break;
-            case 'g':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos) {
+                // End of last ref
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, dh->last_ref),
+                            i, sam_hdr_tid2len(h, dh->last_ref));
+            }
+            dh->ks.l = cur_l;
+        }
+
+        if (opt->all_pos > 1 && !opt->reg) {
+            // Any previous unused refs
+            int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1;
+            int rr = b ? b->core.tid : sam_hdr_nref(h), r;
+            for (r = lr; r < rr; r++)
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, r),
+                            0, sam_hdr_tid2len(h, r));
+        }
+
+        if (!b) {
+            // we're just flushing to end of file
+            if (opt->all_pos && opt->reg && dh->last_ref < 0)
+                // -a or -aa without a single read being output yet
+                zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg,
+                            MIN(dh->end, sam_hdr_tid2len(h, dh->tid)));
+
+            return 0;
+        }
+
+        for (n = 0; dh->end_pos && n < dh->nfiles; n++) // end_pos is NULL until the first buffer allocation
+            dh->end_pos[n] = 0;
+        dh->last_output = dh->beg >= 0
+            ? MAX(b->core.pos, dh->beg)
+            : b->core.pos;
+        dh->last_ref = b->core.tid;
+        dh->ref = sam_hdr_tid2name(h, b->core.tid);
+        kputs(dh->ref, ks_clear(&dh->ks)); // start the shared row buffer with "<ref>\t"
+        kputc('\t', &dh->ks);
+
+        if (opt->all_pos)
+            // Start of ref
+            zero_region(opt, dh, dh->ref, 0, b->core.pos);
+    } else {
+        if (dh->last_output < b->core.pos) {
+            // Flush any depth outputs up to start of new read
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            for (i = dh->last_output; i < b->core.pos; i++) {
+                nf = 0; // counts files that still have data at position i
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
                  }
-                flags &= ~tflags;
-                break;
-            case 'G':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
+
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                  }
-                flags |= tflags;
-                break;
-            default:  if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
-                      /* else fall-through */
-            case '?': return usage();
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos && i < b->core.pos)
+                // Hole in middle of ref
+                zero_region(opt, dh, dh->ref, i, b->core.pos);
+
+            dh->ks.l = cur_l;
+            dh->last_output = b->core.pos;
          }
      }
-    if (optind == argc && !file_list)
-        return usage();
-
-    /* output file provided by user */
-    if (output_file != NULL && strcmp(output_file,"-")!=0) {
-        file_out = fopen( output_file, "w" );
-        if (file_out == NULL) {
-            print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file);
-            return EXIT_FAILURE;
-        }
+
+    hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end.
+    //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos);
+
+    if (b->core.tid < dh->last_ref ||
+        (dh->last_ref == b->core.tid && end_pos < dh->last_output)) {
+        print_error_errno("depth", "Data is not position sorted"); // NOTE(review): nothing here sets errno; the other error in this function uses print_error()
+        return -1;
      }
  
+    // If needed, grow the circular buffer.
+    if (end_pos+1 - b->core.pos >= dh->size) {
+        size_t old_size = dh->size;
+        size_t old_hmask = hmask;
+        while (end_pos+1 - b->core.pos >= dh->size)
+            dh->size = dh->size ? 2*dh->size : 2048; // first allocation is 2048 entries, then doubling
+        hmask = dh->size-1;
+        if (!dh->hist) {
+            dh->hist = calloc(dh->nfiles, sizeof(*dh->hist));
+            dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos));
+            if (!dh->hist || !dh->end_pos)
+                return -1;
+        }
+        for (n = 0; n < dh->nfiles; n++) {
+            int *hist = calloc(dh->size, sizeof(*dh->hist[n]));
+            if (!hist)
+                return -1;
  
-    // initialize the auxiliary data structures
-    if (file_list)
-    {
-        if (has_index_file) {
-            print_error("depth", "The -f option cannot be combined with -X");
-            return 1;
+            // Simple approach for now; copy over old histogram verbatim.
+            for (i = dh->last_output; i < dh->last_output + old_size; i++)
+                hist[i & hmask] = dh->hist[n][i & old_hmask];
+            free(dh->hist[n]);
+            dh->hist[n] = hist;
          }
-        if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE;
-        n = nfiles;
-        argv = fn;
-        optind = 0;
      }
-    else if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind) % 2 != 0) {
-            fprintf(stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
-        }
-        n = (argc - optind) / 2;
+
+    // Accumulate depth, based on CIGAR
+    uint32_t *cig = bam_get_cigar(b);
+    int ncig = b->core.n_cigar, j, k, spos = 0; // spos: index into this read's quality array
+
+    // Zero new (previously unseen) coordinates so increment works later.
+    hts_pos_t end = MAX(dh->end_pos[file], b->core.pos); // first coordinate of this read not yet initialised for this file
+    if (end_pos > end && (end & hmask) < (end_pos & hmask)) { // span does not wrap around the buffer: one memset
+        memset(&dh->hist[file][end & hmask], 0,
+               sizeof(**dh->hist) * (end_pos - end));
      } else {
-        n = argc - optind;
+        for (i = end; i < end_pos; i++)
+            dh->hist[file][i & hmask] = 0;
      }
-    data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
-    reg_tid = 0; beg = 0; end = HTS_POS_MAX;  // set the default region
-
-    for (i = 0; i < n; ++i) {
-        int rf;
-        data[i] = calloc(1, sizeof(aux_t));
-        data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
-        if (data[i]->fp == NULL) {
-            print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
-        if (baseQ) rf |= SAM_QUAL;
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        data[i]->min_mapQ = mapQ;                    // set the mapQ filter
-        data[i]->min_len  = min_len;                 // set the qlen filter
-        data[i]->hdr = sam_hdr_read(data[i]->fp);    // read the BAM header
-        if (data[i]->hdr == NULL) {
-            print_error_errno("depth", "Couldn't read header for \"%s\"",
-                              argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (reg) { // if a region is specified
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
-            if (has_index_file) {
-                idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]);  // load the index
+
+    i = b->core.pos;
+    uint8_t *qual = bam_get_qual(b);
+    int min_qual = opt->min_qual;
+    for (j = 0; j < ncig; j++) {
+        int op    = bam_cigar_op(cig[j]);
+        int oplen = bam_cigar_oplen(cig[j]);
+
+        switch (op) {
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            if (op != BAM_CDEL || opt->skip_del) {
+                // advance along the reference without adding any depth
+                if (i + oplen >= dh->end_pos[file]) {
+                    for (k = 0; k < oplen; k++, i++) {
+                        if (i >= dh->end_pos[file])
+                            // redundant due to zero new elements above?
+                            dh->hist[file][i & hmask] = 0;
+                    }
+                } else {
+                    i += oplen;
+                }
+            } else { // op == BAM_CDEL and we count them (-J option),
+                // We don't incr spos here, but we still use qual.
+                // This doesn't make much sense, but it's for compatibility
+                // with the old code.  Arguably DEL shouldn't have a min
+                // qual and should always pass (as we've explicitly asked to
+                // include them).
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) { // op lies wholly before the clip point: adds nothing
+                        i += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        k = overlap_clip - i; // skip the part of the op before the clip point
+                        i = overlap_clip;
+                    }
+                }
+
+                // Question: should we even check quality values for DEL?
+                // We've explicitly asked to include them, and the quality
+                // is wrong anyway (it's the neighbouring base).  We do this
+                // for now for compatibility with the old depth command.
+
+                if (spos < b->core.l_qseq)
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]+=qual[spos]>=min_qual;
+                else
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]++;
+            }
+            break;
+
+        case BAM_CMATCH:
+        case BAM_CEQUAL:
+        case BAM_CDIFF:
+            if ((i & hmask) < ((i+oplen) & hmask)) {
+                // Optimisation when not wrapping around
+
+                // Unrolling doesn't help clang, but helps gcc,
+                // especially when not using -O3.
+                int *hist = &dh->hist[file][i & hmask];
+                if (min_qual || overlap_clip) {
+                    k = 0;
+                    if (overlap_clip) {
+                        if (i+oplen < overlap_clip) { // op lies wholly before the clip point: adds nothing
+                            i += oplen;
+                            spos += oplen;
+                            break;
+                        } else if (i < overlap_clip) {
+                            oplen -= overlap_clip - i;
+                            spos += overlap_clip - i;
+                            hist += overlap_clip - i;
+                            i = overlap_clip;
+                        }
+                    }
+
+                    // approx 50% of this func cpu time in this loop
+                    for (; k < (oplen & ~7); k+=8) {
+                        hist[k+0]+=qual[spos+0]>=min_qual;
+                        hist[k+1]+=qual[spos+1]>=min_qual;
+                        hist[k+2]+=qual[spos+2]>=min_qual;
+                        hist[k+3]+=qual[spos+3]>=min_qual;
+                        hist[k+4]+=qual[spos+4]>=min_qual;
+                        hist[k+5]+=qual[spos+5]>=min_qual;
+                        hist[k+6]+=qual[spos+6]>=min_qual;
+                        hist[k+7]+=qual[spos+7]>=min_qual;
+                        spos += 8;
+                    }
+                } else {
+                    // easier to vectorize when no min_qual
+                    for (k = 0; k < (oplen & ~7); k+=8) {
+                        hist[k+0]++;
+                        hist[k+1]++;
+                        hist[k+2]++;
+                        hist[k+3]++;
+                        hist[k+4]++;
+                        hist[k+5]++;
+                        hist[k+6]++;
+                        hist[k+7]++;
+                    }
+                    spos += k;
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, spos++) // leftover (< 8) bases that still have a quality value
+                    hist[k]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, spos++) // bases past l_qseq are counted unconditionally
+                    hist[k]++;
+                i += oplen;
              } else {
-                idx = sam_index_load(data[i]->fp, argv[optind+i]);
+                // Simple to understand case, but slower.
+                // We use this only for reads with wrap-around.
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen; // NOTE(review): unlike the non-wrapping branch above, spos is not advanced here - confirm
+                        break;
+                    } else if (i < overlap_clip) {
+                        oplen -= overlap_clip - i;
+                        spos += overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
+                    hist[i & hmask]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, i++, spos++)
+                    hist[i & hmask]++;
              }
-            if (idx == NULL) {
-                print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
-                status = EXIT_FAILURE;
-                goto depth_end;
+            break;
+
+        case BAM_CINS:
+        case BAM_CSOFT_CLIP:
+            spos += oplen; // consumes query bases only
+            break;
+
+        case BAM_CPAD:
+        case BAM_CHARD_CLIP:
+            // ignore
+            break;
+
+        default:
+            print_error("depth", "Unsupported cigar op '%d'", op);
+            return -1;
+        }
+    }
+
+    if (dh->end >= 0 && end_pos > dh->end) // never record data beyond the requested region
+        end_pos = dh->end;
+    if (dh->end_pos[file] < end_pos)
+        dh->end_pos[file] = end_pos;
+
+    return 0;
+}
+
+// Hash on name -> alignment end pos. This permits a naive overlap removal.
+// Note it cannot analyse the overlapping sequence and qualities, so the
+// interaction of basecalls/qualities and the -Q parameter cannot be
+// applied here (unlike the full mpileup algorithm).
+KHASH_MAP_INIT_STR(olap_hash, hts_pos_t) // string key (read name) -> hts_pos_t value (end position)
+typedef khash_t(olap_hash) olap_hash_t; // shorthand for the instantiated hash table type
+
+static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn,
+                          samFile **fp, hts_itr_t **itr, sam_hdr_t **h) {
+    int ret = -1, err = 1, i;
+    olap_hash_t **overlaps = NULL;
+    depth_hist dh = {0};
+
+    // An array of bam structs, one per input file, to hold the next entry
+    bam1_t **b = calloc(nfiles, sizeof(*b));
+    int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles;
+    if (!b || !finished)
+        goto err;
+
+    for (i = 0; i < nfiles; i++)
+        if (!(b[i] = bam_init1()))
+            goto err;
+
+    // Do we need one overlap hash per file? Or shared?
+    if (opt->remove_overlaps) {
+        if (!(overlaps = calloc(nfiles, sizeof(*overlaps))))
+            return -1;
+        for (i = 0; i < nfiles; i++) {
+            if (!(overlaps[i] = kh_init(olap_hash)))
+                return -1;
+        }
+    }
+
+    // Create the initial histogram
+    dh.nfiles = nfiles;
+    dh.size = 0;
+    dh.hist = NULL;
+    dh.last_ref = -99;
+    dh.end_pos = NULL;
+    dh.last_output = itr && itr[0] ? itr[0]->beg : 0;
+    ks_initialize(&dh.ks);
+
+    // Clip results to region if specified
+    dh.beg = -1;
+    dh.end = -1;
+    dh.tid = 0;
+    if (itr && itr[0]) {
+        dh.tid = itr[0]->tid;
+        dh.beg = itr[0]->beg;
+        dh.end = itr[0]->end;
+    }
+
+    if (opt->header) {
+        fprintf(opt->out, "#CHROM\tPOS");
+        for (i = 0; i < nfiles; i++)
+            fprintf(opt->out, "\t%s", fn[i]);
+        fputc('\n', opt->out);
+    }
+
+    // Populate first record per file
+    for (i = 0; i < nfiles; i++) {
+        for(;;) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1)
+                goto err;
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
              }
-            data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
-            hts_idx_destroy(idx); // the index is not needed any more; free the memory
-            if (data[i]->iter == NULL) {
-                print_error("depth", "can't parse region \"%s\"", reg);
-                status = EXIT_FAILURE;
-                goto depth_end;
+
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            // Original samtools depth used the total sequence (l_qseq)
+            // including soft-clips.  This doesn't feel like a useful metric
+            // to be filtering on.  We now only count sequence bases that
+            // form the used part of the alignment.
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
              }
+
+            break;
          }
-        data[i]->flags = flags;
      }
-    if (print_header) {
-        fputs("#CHROM\tPOS", file_out);
-        for (i = 0; i < n; ++i) {
-            fputc('\t', file_out);
-            fputs(argv[optind+i], file_out);
+
+    // Loop through input files, merging in order so we're
+    // always adding the next record in sequence
+    while (to_go) {
+        // Find next record in file list
+        int best_tid = INT_MAX, best_file = 0;
+        hts_pos_t best_pos = HTS_POS_MAX;
+
+        for (i = 0; i < nfiles; i++) {
+            if (finished[i])
+                continue;
+            if (best_tid > b[i]->core.tid) {
+                best_tid = b[i]->core.tid;
+                best_pos = b[i]->core.pos;
+                best_file = i;
+            } else if (best_tid == b[i]->core.tid &&
+                       best_pos > b[i]->core.pos) {
+                best_pos = b[i]->core.pos;
+                best_file = i;
              }
-        fputc('\n', file_out);
          }
-    h = data[0]->hdr; // easy access to the header of the 1st BAM
-    if (reg) {
-        beg = data[0]->iter->beg; // and to the parsed region coordinates
-        end = data[0]->iter->end;
-        reg_tid = data[0]->iter->tid;
-    }
+        i = best_file;
  
-    // the core multi-pileup loop
-    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
-    if (0 < max_depth)
-        bam_mplp_set_maxcnt(mplp,max_depth);  // set maximum coverage depth
-    else if (!max_depth)
-        bam_mplp_set_maxcnt(mplp,INT_MAX);
-    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
-    plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
-    while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
-        if (pos < beg || pos >= end) continue; // out of range; skip
-        if (tid >= sam_hdr_nref(h)) continue;     // diff number of @SQ lines per file?
-        if (all) {
-            while (tid > last_tid) {
-                if (last_tid >= 0 && !reg) {
-                    // Deal with remainder or entirety of last tid.
-                    while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                        // Horribly inefficient, but the bed API is an obfuscated black box.
-                        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                            continue;
-                        fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                        fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                        for (i = 0; i < n; i++)
-                            fputc('\t', file_out), fputc('0', file_out);
-                        fputc('\n', file_out);
-                    }
+        hts_pos_t clip = 0;
+        if (overlaps && (b[i]->core.flag & BAM_FPAIRED) &&
+            !(b[i]->core.flag & BAM_FMUNMAP)) {
+            khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i]));
+            if (k == kh_end(overlaps[i])) {
+                // not seen before
+                hts_pos_t endpos = bam_endpos(b[i]);
+
+                // Don't add if mate location is known and can't overlap.
+                if (b[i]->core.mpos == -1 ||
+                    (b[i]->core.tid == b[i]->core.mtid &&
+                     b[i]->core.mpos <= endpos)) {
+                    k = kh_put(olap_hash, overlaps[i], bam_get_qname(b[i]),
+                               &ret);
+                    if (ret < 0)
+                        return -1;
+                    kh_key(overlaps[i], k) = strdup(bam_get_qname(b[i]));
+                    kh_value(overlaps[i], k) = endpos;
                  }
-                last_tid++;
-                last_pos = -1;
-                if (all < 2)
-                    break;
+            } else {
+                // seen before
+                clip = kh_value(overlaps[i], k);
+                free((char *)kh_key(overlaps[i], k));
+                kh_del(olap_hash, overlaps[i], k);
              }
+        }
  
-            // Deal with missing portion of current tid
-            while (++last_pos < pos) {
-                if (last_pos < beg) continue; // out of range; skip
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        // Add the next merged BAM record to the depth plot
+        if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) {
+            ret = -1;
+            goto err;
+        }
+
+        // Populate next record from this file
+        for(;!finished[i];) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1) {
+                ret = -1;
+                goto err;
+            }
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
              }
  
-            last_tid = tid;
-            last_pos = pos;
-        }
-        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue;
-        fputs(sam_hdr_tid2name(h, tid), file_out);
-        fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster
-        for (i = 0; i < n; ++i) { // base level filters have to go here
-            int j, m = 0;
-            for (j = 0; j < n_plp[i]; ++j) {
-                const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
-                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                         bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
              }
-            fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            break;
          }
-        fputc('\n', file_out);
      }
-    if (ret < 0) status = EXIT_FAILURE;
-    free(n_plp); free(plp);
-    bam_mplp_destroy(mplp);
-
-    if (all) {
-        // Handle terminating region
-        if (last_tid < 0 && reg) {
-            last_tid = reg_tid;
-            last_pos = beg-1;
+
+    // Tidy up end.
+    ret = add_depth(opt, &dh, h[0], NULL, 0, 0);
+    err = 0;
+
+ err:
+    if (ret == 0 && err)
+        ret = -1;
+
+    for (i = 0; i < nfiles; i++) {
+        if (b[i])
+            bam_destroy1(b[i]);
+        if (dh.hist && dh.hist[i])
+            free(dh.hist[i]);
+    }
+    free(b);
+    free(finished);
+    ks_free(&dh.ks);
+    free(dh.hist);
+    free(dh.end_pos);
+    if (overlaps) {
+        khiter_t k;
+        for (i = 0; i < nfiles; i++) {
+            if (!overlaps[i])
+                continue;
+            for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++)
+                if (kh_exist(overlaps[i], k))
+                    free((char *)kh_key(overlaps[i], k));
+            kh_destroy(olap_hash, overlaps[i]);
          }
-        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
-            while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                if (last_pos >= end) break;
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        free(overlaps);
+    }
+
+    return ret;
+}
+
+// Prints the "samtools depth" usage text, followed by the shared global
+// option help, to fp and then terminates the process with exit_status.
+// Never returns.
+static void usage_exit(FILE *fp, int exit_status)
+{
+    fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n");
+    fprintf(fp, "\nOptions:\n");
+    fprintf(fp, "  -a           Output all positions (including zero depth)\n");
+    fprintf(fp, "  -a -a, -aa   Output absolutely all positions, including unused ref seqs\n");
+    fprintf(fp, "  -r REG       Specify a region in chr or chr:from-to syntax\n");
+    fprintf(fp, "  -b FILE      Use bed FILE for list of regions\n");
+    fprintf(fp, "  -f FILE      Specify list of input BAM/SAM/CRAM filenames\n");
+    fprintf(fp, "  -X           Use custom index files (in -X *.bam *.bam.bai order)\n");
+    fprintf(fp, "  -g INT       Remove specified flags from default flag filter\n");
+    fprintf(fp, "  -G INT       Add specified flags to the default flag filter\n");
+    fprintf(fp, "  -H           Print a file header line\n");
+    fprintf(fp, "  -l INT       Minimum read length [0]\n");
+    fprintf(fp, "  -o FILE      Write output to FILE [stdout]\n");
+    fprintf(fp, "  -q INT       Minimum base quality [0]\n");
+    fprintf(fp, "  -Q INT       Minimum mapping quality [0]\n");
+    fprintf(fp, "  -J           Include reads with deletions in depth computation\n");
+    fprintf(fp, "  -s           Do not count overlapping reads within a template\n");
+    sam_global_opt_help(fp, "-.---@-.");
+    exit(exit_status);
+}
+
+// Entry point for "samtools depth".  Parses the command line, opens every
+// input file (loading an index and building an iterator per file when -r
+// is given), runs fastdepth_core() over them and releases the resources.
+// Returns 0 on success and non-zero on failure; usage errors exit via
+// usage_exit().
+int main_depth(int argc, char *argv[])
+{
+    int nfiles, i;
+    samFile **fp;
+    sam_hdr_t **header;
+    int c, has_index_file = 0;
+    char *file_list = NULL, **fn = NULL;
+    depth_opt opt = {
+        .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+        .min_qual = 0,
+        .min_mqual = 0,
+        .skip_del = 1,
+        .header = 0,
+        .min_len = 0,
+        .out = stdout,
+        .all_pos = 0,
+        .remove_overlaps = 0,
+        .reg = NULL,
+        .bed = NULL,
+    };
+
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s",
+                            lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'a':
+            opt.all_pos++;
+            break;
+
+        case 'b':
+            opt.bed = bed_read(optarg);
+            if (!opt.bed) {
+                print_error_errno("depth", "Could not read file \"%s\"",
+                                  optarg);
+                return 1;
              }
-            last_tid++;
-            last_pos = -1;
-            if (all < 2 || reg)
+            break;
+
+        case 'f':
+            file_list = optarg;
+            break;
+
+        case 'd':
+        case 'm':
+            // depth limit - now ignored
+            break;
+
+        case 'g':
+            opt.flag &= ~bam_str2flag(optarg);
+            break;
+        case 'G':
+            opt.flag |= bam_str2flag(optarg);
+            break;
+
+        case 'l':
+            opt.min_len = atoi(optarg);
+            break;
+
+        case 'H':
+            opt.header = 1;
+            break;
+
+        case 'q':
+            opt.min_qual = atoi(optarg);
+            break;
+        case 'Q':
+            opt.min_mqual = atoi(optarg);
+            break;
+
+        case 'J':
+            opt.skip_del = 0;
+            break;
+
+        case 'o':
+            // Only the first -o takes effect; later ones are ignored.
+            if (opt.out != stdout)
                  break;
+            opt.out = fopen(optarg, "w");
+            if (!opt.out) {
+                print_error_errno("depth", "Cannot open \"%s\" for writing.",
+                                  optarg);
+                return EXIT_FAILURE;
+            }
+            break;
+
+        case 'r':
+            opt.reg = optarg;
+            break;
+
+        case 's':
+            opt.remove_overlaps = 1;
+            break;
+
+        case 'X':
+            has_index_file = 1;
+            break;
+
+        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?':
+            usage_exit(stderr, EXIT_FAILURE);
          }
      }
  
-depth_end:
-    if (((file_out != stdout)? fclose(file_out) : fflush(file_out)) != 0) {
-        if (status == EXIT_SUCCESS) {
-            if (file_out != stdout)
-                print_error_errno("depth", "error on closing \"%s\"", output_file);
-            else
-                print_error_errno("depth", "error on flushing standard output");
-            status = EXIT_FAILURE;
+    if (argc < optind+1 && !file_list) {
+        if (argc == optind)
+            usage_exit(stdout, EXIT_SUCCESS);
+        else
+            usage_exit(stderr, EXIT_FAILURE);
+    }
+
+    if (file_list) {
+        if (has_index_file) {
+            print_error("depth", "The -f option cannot be combined with -X");
+            return 1;
+        }
+        if (read_file_list(file_list, &nfiles, &fn))
+            return 1;
+        // From here on the file list stands in for the command line.
+        argv = fn;
+        argc = nfiles;
+        optind = 0;
+    } else {
+        nfiles = argc - optind;
+    }
+
+    if (has_index_file) {
+        // With -X the arguments are all the BAMs followed by one index per
+        // BAM, so their count must be even ("%1" was always zero and never
+        // caught a missing index).
+        if (nfiles%2) {
+            print_error("depth", "-X needs one index specified per bam file");
+            return 1;
          }
+        nfiles /= 2;
+    }
+    fp = malloc(nfiles * sizeof(*fp));
+    header = malloc(nfiles * sizeof(*header));
+    if (!fp || !header) {
+        print_error_errno("depth", "Out of memory");
+        return 1;
      }
  
-    for (i = 0; i < n && data[i]; ++i) {
-        sam_hdr_destroy(data[i]->hdr);
-        if (data[i]->fp) sam_close(data[i]->fp);
-        hts_itr_destroy(data[i]->iter);
-        free(data[i]);
+    hts_itr_t **itr = NULL;
+    if (opt.reg) {
+        itr = calloc(nfiles, sizeof(*itr));
+        if (!itr)
+            return 1;
      }
-    free(data); free(reg);
-    if (bed) bed_destroy(bed);
-    if ( file_list )
-    {
-        for (i=0; i<n; i++) free(fn[i]);
+
+    // optind advances with i, so argv[optind] is always the i-th input.
+    for (i = 0; i < nfiles; i++, optind++) {
+        fp[i] = sam_open_format(argv[optind], "r", &ga.in);
+        if (fp[i] == NULL) {
+            print_error_errno("depth",
+                              "Cannot open input file \"%s\"", argv[optind]);
+            return 1;
+        }
+
+        if (ga.nthreads > 0)
+            hts_set_threads(fp[i], ga.nthreads);
+
+        // Only ask the CRAM decoder for the fields the chosen options need.
+        if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS,
+                        SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR
+                        | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT
+                                               : 0)
+                        | (opt.min_mqual       ? SAM_MAPQ  : 0)
+                        | (opt.min_len         ? SAM_SEQ   : 0)
+                        | (opt.min_qual        ? SAM_QUAL  : 0))) {
+            fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+            return 1;
+        }
+
+        if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) {
+            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+            return 1;
+        }
+
+        // FIXME: what if headers differ?
+        header[i] = sam_hdr_read(fp[i]);
+        // Test this file's header, not the (never NULL here) array itself.
+        if (header[i] == NULL) {
+            fprintf(stderr, "Failed to read header for \"%s\"\n",
+                    argv[optind]);
+            return 1;
+        }
+
+        if (opt.reg) {
+            // With -X the index for input i sits nfiles arguments later.
+            hts_idx_t *idx = has_index_file
+                ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles])
+                : sam_index_load(fp[i], argv[optind]);
+            if (!idx) {
+                print_error("depth", "cannot load index for \"%s\"",
+                            argv[optind]);
+                return 1;
+            }
+            if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) {
+                print_error("depth", "cannot parse region \"%s\"", opt.reg);
+                return 1;
+            }
+            hts_idx_destroy(idx);
+        }
+    }
+
+    // optind has moved past the nfiles inputs, so optind-nfiles is the first
+    // input name in every mode.  argc-nfiles is the same place except with
+    // -X, where it pointed at the index names instead.
+    // NOTE(review): presumably fastdepth_core() uses these as per-file
+    // names - confirm against its definition.
+    int ret = fastdepth_core(&opt, nfiles, &argv[optind-nfiles], fp, itr, header)
+        ? 1 : 0;
+
+    for (i = 0; i < nfiles; i++) {
+        sam_hdr_destroy(header[i]);
+        sam_close(fp[i]);
+        if (itr && itr[i])
+            hts_itr_destroy(itr[i]);
+    }
+    free(header);
+    free(fp);
+    free(itr);
+    if (file_list) {
+        for (i=0; i<nfiles; i++)
+            free(fn[i]);
          free(fn);
      }
+    if (opt.bed)
+        bed_destroy(opt.bed);
      sam_global_args_free(&ga);
-    return status;
+    // A failed close can mean buffered output never reached the -o file.
+    if (opt.out != stdout && fclose(opt.out) != 0) {
+        print_error_errno("depth", "error on closing output file");
+        ret = 1;
+    }
+    return ret;
  }
  
  #ifdef _MAIN_BAM2DEPTH
diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c

index dbd095a8ca42f2130bf992a5adfbf4ab07536810..8b36457e9185db279baa93e1c85e0208a08c52cf 100644 (file)
--- a/samtools/bam2depth.c.pysam.c
+++ b/samtools/bam2depth.c.pysam.c
@@ -3,9 +3,11 @@
  /*  bam2depth.c -- depth subcommand.
  
      Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd.
+    Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+
+    Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
+    Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
  
-    Author: Heng Li <lh3@sanger.ac.uk>
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -26,7 +28,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.  */
  
  /* This program demonstrates how to generate pileup from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
   * To compile this program separately, you may:
   *
   *   gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -lhts -lz
@@ -43,355 +45,913 @@ DEALINGS IN THE SOFTWARE.  */
  #include "samtools.h"
  #include "bedidx.h"
  #include "sam_opts.h"
+#include "htslib/khash.h"
  
-#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1)
+// From bam_plcmd.c
+int read_file_list(const char *file_list, int *n, char **argv[]);
  
-typedef struct {     // auxiliary data structure
-    samFile *fp;     // the file handle
-    sam_hdr_t *hdr;  // the file header
-    hts_itr_t *iter; // NULL if a region not specified
-    int min_mapQ, min_len; // mapQ filter; length filter
-    uint32_t flags;  // read filtering flags
-} aux_t;
+// We accumulate to hist[pos & (size-1)].  This is a ring-buffer.
+// We track where we last got to in output and what the biggest value
+// we've written to so far (in absolute unmasked coordinates) in
+// "last_output" and "end_pos" respectively.
+// For each new record we just flush anything we haven't written yet
+// already, between "last_output" and this read's start position, and
+// initialise any newly seen positions between "end_pos" and this read's
+// end position.
+typedef struct {
+    size_t size;        // ring-buffer length; 0, then 2048 and doubled, so size-1 is a mask
+    int **hist;         // hist[nfiles][size]
+    hts_pos_t *end_pos; // end_pos[nfiles]; positions below this are initialised in hist
+    hts_pos_t last_output; // first position not yet written out (shared by all files)
+    int last_ref;       // tid being accumulated; negative until the first read
+    int nfiles;         // number of input files (columns per output row)
+    const char *ref;    // name of reference last_ref
+    kstring_t ks;       // output line buffer, kept primed with "ref\t"
+    hts_pos_t beg, end; // limit to region; negative values are treated as unset
+    int tid;            // reference used for zero rows when a region yields no reads
+                        // NOTE(review): presumably the -r region's tid - confirm
+} depth_hist;
  
-// This function reads a BAM alignment from one BAM file.
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
-{
-    aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
-    int ret;
-    while (1)
-    {
-        ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
-        if ( ret<0 ) break;
-        if ( b->core.flag & aux->flags) continue;
-        if ( (int)b->core.qual < aux->min_mapQ ) continue;
-        if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
-        break;
+// Command-line settings for "samtools depth".
+typedef struct {
+    int header;          // -H: print a file header line
+    int flag;            // reads with any of these FLAG bits set are skipped
+    int min_qual;        // -q: minimum base quality
+    int min_mqual;       // -Q: reads with a lower mapping quality are skipped
+    int min_len;         // -l: reads with a smaller qlen_used() are skipped
+    int skip_del;        // cleared by -J, which includes reads with deletions
+    int all_pos;         // number of -a options given (more than 1 means -aa)
+    int remove_overlaps; // -s: do not count overlapping reads within a template
+    FILE *out;           // output stream
+    char *reg;           // -r region string, or NULL
+    void *bed;           // handle returned by bed_read() for -b, or NULL
+} depth_opt;
+
+// Writes one all-zero depth row to opt->out for each position in
+// [start, end) of reference "name".  The range is first clamped to
+// dh->beg / dh->end when those are non-negative, and positions for which
+// bed_overlap() returns 0 are skipped when opt->bed is set.
+// dh->ks is overwritten and used as the line buffer; on return its length
+// is that of the "name\t" prefix.
+static void zero_region(depth_opt *opt, depth_hist *dh,
+                        const char *name, hts_pos_t start, hts_pos_t end) {
+    hts_pos_t i;
+    kstring_t *ks = &dh->ks;
+
+    // Build the constant "name\t" row prefix and remember where it ends.
+    kputs(name, ks_clear(ks));
+    kputc('\t', ks);
+    size_t cur_l = ks->l;
+    if (dh->beg >= 0 && start < dh->beg)
+        start = dh->beg;
+    if (dh->end >= 0 && end > dh->end)
+        end = dh->end;
+
+    for (i = start; i < end; i++) {
+        // Could be optimised, but needs better API to skip to next
+        // bed region.
+        if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0)
+            continue;
+
+        // Truncate back to the prefix, then append the position (i is
+        // 0-based, the row shows i+1) and one "\t0" column per file.
+        ks->l = cur_l;
+        kputll(i+1,  ks);
+        int n;
+        for (n = 0; n < dh->nfiles; n++) {
+            kputc_('\t', ks);
+            kputc_('0',  ks);
+        }
+        kputc('\n',  ks);
+        fputs(ks->s, opt->out);
      }
-    return ret;
+    ks->l = cur_l;
  }
  
-int read_file_list(const char *file_list,int *n,char **argv[]);
-
-static int usage() {
-    fprintf(samtools_stderr, "\n");
-    fprintf(samtools_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
-    fprintf(samtools_stderr, "Options:\n");
-    fprintf(samtools_stderr, "   -a                  output all positions (including zero depth)\n");
-    fprintf(samtools_stderr, "   -a -a (or -aa)      output absolutely all positions, including unused ref. sequences\n");
-    fprintf(samtools_stderr, "   -b <bed>            list of positions or regions\n");
-    fprintf(samtools_stderr, "   -X                  use customized index files\n");
-    fprintf(samtools_stderr, "   -f <list>           list of input BAM filenames, one per line [null]\n");
-    fprintf(samtools_stderr, "   -H                  print a file header\n");
-    fprintf(samtools_stderr, "   -l <int>            read length threshold (ignore reads shorter than <int>) [0]\n");
-    fprintf(samtools_stderr, "   -d/-m <int>         maximum coverage depth [8000]. If 0, depth is set to the maximum\n"
-                    "                       integer value, effectively removing any depth limit.\n");  // the htslib's default
-    fprintf(samtools_stderr, "   -o FILE             where to write output to [samtools_stdout]\n");
-    fprintf(samtools_stderr, "   -q <int>            base quality threshold [0]\n");
-    fprintf(samtools_stderr, "   -Q <int>            mapping quality threshold [0]\n");
-    fprintf(samtools_stderr, "   -r <chr:from-to>    region\n");
-    fprintf(samtools_stderr, "   -g <flags>          include reads that have any of the specified flags set [0]\n");
-    fprintf(samtools_stderr, "   -G <flags>          filter out reads that have any of the specified flags set"
-                    "                       [UNMAP,SECONDARY,QCFAIL,DUP]\n");
-
-    sam_global_opt_help(samtools_stderr, "-.--.--.");
-
-    fprintf(samtools_stderr, "\n");
-    fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
-    fprintf(samtools_stderr, "position, and coverage depth.  Note that positions with zero coverage may be\n");
-    fprintf(samtools_stderr, "omitted by default; see the -a option.\n");
-    fprintf(samtools_stderr, "\n");
-
-    return EXIT_FAILURE;
+// A variation of bam_cigar2qlen which doesn't count soft-clips in to the
+// equation.  Basically it's the number of bases in query that are aligned
+// in some way to the reference (including insertions, which are considered
+// to be aligned by dint of being anchored either side).
+//
+// b is the alignment record to measure; only its CIGAR and l_qseq are read.
+// Returns the number of query bases covered by M, I, = and X operations.
+hts_pos_t qlen_used(bam1_t *b) {
+    int n_cigar = b->core.n_cigar;
+    const uint32_t *cigar = bam_get_cigar(b);
+
+    hts_pos_t l;
+
+    if (b->core.l_qseq) {
+        // Known SEQ permits a short cut: l_qseq minus the soft-clips at
+        // either end.  Full scan not needed, which helps on excessively
+        // long CIGARs.
+        // Hard clips may sit outside the soft clips (e.g. 5H10S50M), so
+        // step over them rather than stopping; otherwise the soft-clipped
+        // bases stay counted, unlike in the full-scan branch below.
+        l = b->core.l_qseq;
+        int kl, kr;
+        for (kl = 0; kl < n_cigar; kl++) {
+            int op = bam_cigar_op(cigar[kl]);
+            if (op == BAM_CSOFT_CLIP)
+                l -= bam_cigar_oplen(cigar[kl]);
+            else if (op != BAM_CHARD_CLIP)
+                break;
+        }
+
+        // kr > kl stops the right-hand scan from revisiting operations
+        // already handled from the left.
+        for (kr = n_cigar-1; kr > kl; kr--) {
+            int op = bam_cigar_op(cigar[kr]);
+            if (op == BAM_CSOFT_CLIP)
+                l -= bam_cigar_oplen(cigar[kr]);
+            else if (op != BAM_CHARD_CLIP)
+                break;
+        }
+    } else {
+        // Unknown SEQ ("*") needs a full scan through the CIGAR string.
+        static const int query[16] = {
+          //M I D N  S H P =  X B ? ?  ? ? ? ?
+            1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0
+        };
+        int k;
+        for (k = l = 0; k < n_cigar; k++)
+            if (query[bam_cigar_op(cigar[k])])
+                l += bam_cigar_oplen(cigar[k]);
+    }
+    return l;
+
  }
-int main_depth(int argc, char *argv[])
-{
-    int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0;
-    hts_pos_t beg, end, pos, last_pos = -1;
-    int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
-    const bam_pileup1_t **plp;
-    char *reg = 0; // specified region
-    void *bed = 0; // BED data structure
-    char *file_list = NULL, **fn = NULL;
-    sam_hdr_t *h = NULL; // BAM header of the 1st input
-    aux_t **data;
-    bam_mplp_t mplp;
-    int last_tid = -1, ret;
-    int print_header = 0;
-    char *output_file = NULL;
-    FILE *file_out = samtools_stdout;
-    uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
-    int tflags = 0;
+// Adds the depth for a single read to a depth_hist struct.
+// For just one file, this is easy.  We just have a circular buffer
+// where we increment values for bits that overlap existing data
+// and initialise values for coordinates which we're seeing for the first
+// time.  This is tracked by "end_pos" to know where we've got to.
+//
+// As the input is sorted, we can flush output from "last_output" to
+// b->core.pos.
+//
+// With multiple files, we must feed data in sorted order as if all files
+// are merged, but track depth per file.  This also means "end_pos" is per
+// file too, but "last_output" is global as it corresponds to rows printed.
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b,
+                     int overlap_clip, int file) {
+    hts_pos_t i;
+    size_t hmask = dh->size-1;
+    int n;
  
-    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    static const struct option lopts[] = {
-        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
-        { NULL, 0, NULL, 0 }
-    };
+    if (!b || b->core.tid != dh->last_ref) {
+        // New ref
+        if (dh->last_ref >= 0) {
+            // do end
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            i = dh->last_output;
+            for (i = dh->last_output; nf; i++) {
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
+                }
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
  
-    // parse the command line
-    while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) {
-        switch (n) {
-            case 'l': min_len = atoi(optarg); break; // minimum query length
-            case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
-            case 'b':
-                bed = bed_read(optarg); // BED or position list file can be parsed now
-                if (!bed) {
-                    print_error_errno("depth", "Could not read file \"%s\"", optarg);
-                    return EXIT_FAILURE;
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                  }
-                break;
-            case 'X': has_index_file = 1; break;
-            case 'q': baseQ = atoi(optarg); break;   // base quality threshold
-            case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
-            case 'f': file_list = optarg; break;
-            case 'a': all++; break;
-            case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth
-            case 'H': print_header = 1; break;
-            case 'o': output_file = optarg; break;
-            case 'g':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos) {
+                // End of last ref
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, dh->last_ref),
+                            i, sam_hdr_tid2len(h, dh->last_ref));
+            }
+            dh->ks.l = cur_l;
+        }
+
+        if (opt->all_pos > 1 && !opt->reg) {
+            // Any previous unused refs
+            int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1;
+            int rr = b ? b->core.tid : sam_hdr_nref(h), r;
+            for (r = lr; r < rr; r++)
+                zero_region(opt, dh,
+                            sam_hdr_tid2name(h, r),
+                            0, sam_hdr_tid2len(h, r));
+        }
+
+        if (!b) {
+            // we're just flushing to end of file
+            if (opt->all_pos && opt->reg && dh->last_ref < 0)
+                // -a or -aa without a single read being output yet
+                zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg,
+                            MIN(dh->end, sam_hdr_tid2len(h, dh->tid)));
+
+            return 0;
+        }
+
+        for (n = 0; dh->end_pos && n < dh->nfiles; n++)
+            dh->end_pos[n] = 0;
+        dh->last_output = dh->beg >= 0
+            ? MAX(b->core.pos, dh->beg)
+            : b->core.pos;
+        dh->last_ref = b->core.tid;
+        dh->ref = sam_hdr_tid2name(h, b->core.tid);
+        kputs(dh->ref, ks_clear(&dh->ks));
+        kputc('\t', &dh->ks);
+
+        if (opt->all_pos)
+            // Start of ref
+            zero_region(opt, dh, dh->ref, 0, b->core.pos);
+    } else {
+        if (dh->last_output < b->core.pos) {
+            // Flush any depth outputs up to start of new read
+            size_t cur_l = dh->ks.l;
+            int nf = dh->nfiles;
+            for (i = dh->last_output; i < b->core.pos; i++) {
+                nf = 0;
+                for (n = 0; n < dh->nfiles; n++) {
+                    if (i < dh->end_pos[n])
+                        nf++;
                  }
-                flags &= ~tflags;
-                break;
-            case 'G':
-                tflags = bam_str2flag(optarg);
-                if (tflags < 0 || tflags > BAM_FMAX) {
-                    print_error_errno("depth", "Flag value \"%s\" is not supported", optarg);
-                    return 1;
+                if (!nf)
+                    break;
+
+                if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0)
+                    continue;
+
+                dh->ks.l = cur_l;
+                kputll(i+1, &dh->ks);
+                for (n = 0; n < dh->nfiles; n++) {
+                    kputc_('\t', &dh->ks);
+                    int d = i < dh->end_pos[n]
+                        ? dh->hist[n][i & hmask]
+                        : 0;
+                    kputuw(d, &dh->ks);
                  }
-                flags |= tflags;
-                break;
-            default:  if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
-                      /* else fall-through */
-            case '?': return usage();
+                kputc('\n', &dh->ks);
+                fputs(dh->ks.s, opt->out);
+            }
+            if (opt->all_pos && i < b->core.pos)
+                // Hole in middle of ref
+                zero_region(opt, dh, dh->ref, i, b->core.pos);
+
+            dh->ks.l = cur_l;
+            dh->last_output = b->core.pos;
          }
      }
-    if (optind == argc && !file_list)
-        return usage();
-
-    /* output file provided by user */
-    if (output_file != NULL && strcmp(output_file,"-")!=0) {
-        file_out = fopen( output_file, "w" );
-        if (file_out == NULL) {
-            print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file);
-            return EXIT_FAILURE;
-        }
+
+    hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end.
+    //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos);
+
+    if (b->core.tid < dh->last_ref ||
+        (dh->last_ref == b->core.tid && end_pos < dh->last_output)) {
+        print_error_errno("depth", "Data is not position sorted");
+        return -1;
      }
  
+    // If needed, grow the circular buffer.
+    if (end_pos+1 - b->core.pos >= dh->size) {
+        size_t old_size = dh->size;
+        size_t old_hmask = hmask;
+        while (end_pos+1 - b->core.pos >= dh->size)
+            dh->size = dh->size ? 2*dh->size : 2048;
+        hmask = dh->size-1;
+        if (!dh->hist) {
+            dh->hist = calloc(dh->nfiles, sizeof(*dh->hist));
+            dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos));
+            if (!dh->hist || !dh->end_pos)
+                return -1;
+        }
+        for (n = 0; n < dh->nfiles; n++) {
+            int *hist = calloc(dh->size, sizeof(*dh->hist[n]));
+            if (!hist)
+                return -1;
  
-    // initialize the auxiliary data structures
-    if (file_list)
-    {
-        if (has_index_file) {
-            print_error("depth", "The -f option cannot be combined with -X");
-            return 1;
+            // Simple approach for now; copy over old histogram verbatim.
+            for (i = dh->last_output; i < dh->last_output + old_size; i++)
+                hist[i & hmask] = dh->hist[n][i & old_hmask];
+            free(dh->hist[n]);
+            dh->hist[n] = hist;
          }
-        if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE;
-        n = nfiles;
-        argv = fn;
-        optind = 0;
      }
-    else if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind) % 2 != 0) {
-            fprintf(samtools_stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
-        }
-        n = (argc - optind) / 2;
+
+    // Accumulate depth, based on CIGAR
+    uint32_t *cig = bam_get_cigar(b);
+    int ncig = b->core.n_cigar, j, k, spos = 0;
+
+    // Zero new (previously unseen) coordinates so increment works later.
+    hts_pos_t end = MAX(dh->end_pos[file], b->core.pos);
+    if (end_pos > end && (end & hmask) < (end_pos & hmask)) {
+        memset(&dh->hist[file][end & hmask], 0,
+               sizeof(**dh->hist) * (end_pos - end));
      } else {
-        n = argc - optind;
+        for (i = end; i < end_pos; i++)
+            dh->hist[file][i & hmask] = 0;
      }
-    data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
-    reg_tid = 0; beg = 0; end = HTS_POS_MAX;  // set the default region
-
-    for (i = 0; i < n; ++i) {
-        int rf;
-        data[i] = calloc(1, sizeof(aux_t));
-        data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
-        if (data[i]->fp == NULL) {
-            print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
-        if (baseQ) rf |= SAM_QUAL;
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value");
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        data[i]->min_mapQ = mapQ;                    // set the mapQ filter
-        data[i]->min_len  = min_len;                 // set the qlen filter
-        data[i]->hdr = sam_hdr_read(data[i]->fp);    // read the BAM header
-        if (data[i]->hdr == NULL) {
-            print_error_errno("depth", "Couldn't read header for \"%s\"",
-                              argv[optind+i]);
-            status = EXIT_FAILURE;
-            goto depth_end;
-        }
-        if (reg) { // if a region is specified
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
-            if (has_index_file) {
-                idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]);  // load the index
+
+    i = b->core.pos;
+    uint8_t *qual = bam_get_qual(b);
+    int min_qual = opt->min_qual;
+    for (j = 0; j < ncig; j++) {
+        int op    = bam_cigar_op(cig[j]);
+        int oplen = bam_cigar_oplen(cig[j]);
+
+        switch (op) {
+        case BAM_CDEL:
+        case BAM_CREF_SKIP:
+            if (op != BAM_CDEL || opt->skip_del) {
+                // don't increment reference location
+                if (i + oplen >= dh->end_pos[file]) {
+                    for (k = 0; k < oplen; k++, i++) {
+                        if (i >= dh->end_pos[file])
+                            // redundant due to zero new elements above?
+                            dh->hist[file][i & hmask] = 0;
+                    }
+                } else {
+                    i += oplen;
+                }
+            } else { // op == BAM_CDEL and we count them (-J option),
+                // We don't incr spos here, but we still use qual.
+                // This doesn't make much sense, but it's for compatibility
+                // with the old code.  Arguably DEL shouldn't have a min
+                // qual and should always pass (as we've explicitly asked to
+                // include them).
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        k = overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+
+                // Question: should we even check quality values for DEL?
+                // We've explicitly asked to include them, and the quality
+                // is wrong anyway (it's the neighbouring base).  We do this
+                // for now for compatibility with the old depth command.
+
+                if (spos < b->core.l_qseq)
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]+=qual[spos]>=min_qual;
+                else
+                    for (; k < oplen; k++, i++)
+                        hist[i & hmask]++;
+            }
+            break;
+
+        case BAM_CMATCH:
+        case BAM_CEQUAL:
+        case BAM_CDIFF:
+            if ((i & hmask) < ((i+oplen) & hmask)) {
+                // Optimisation when not wrapping around
+
+                // Unrolling doesn't help clang, but helps gcc,
+                // especially when not using -O3.
+                int *hist = &dh->hist[file][i & hmask];
+                if (min_qual || overlap_clip) {
+                    k = 0;
+                    if (overlap_clip) {
+                        if (i+oplen < overlap_clip) {
+                            i += oplen;
+                            spos += oplen;
+                            break;
+                        } else if (i < overlap_clip) {
+                            oplen -= overlap_clip - i;
+                            spos += overlap_clip - i;
+                            hist += overlap_clip - i;
+                            i = overlap_clip;
+                        }
+                    }
+
+                    // approx 50% of this func cpu time in this loop
+                    for (; k < (oplen & ~7); k+=8) {
+                        hist[k+0]+=qual[spos+0]>=min_qual;
+                        hist[k+1]+=qual[spos+1]>=min_qual;
+                        hist[k+2]+=qual[spos+2]>=min_qual;
+                        hist[k+3]+=qual[spos+3]>=min_qual;
+                        hist[k+4]+=qual[spos+4]>=min_qual;
+                        hist[k+5]+=qual[spos+5]>=min_qual;
+                        hist[k+6]+=qual[spos+6]>=min_qual;
+                        hist[k+7]+=qual[spos+7]>=min_qual;
+                        spos += 8;
+                    }
+                } else {
+                    // easier to vectorize when no min_qual
+                    for (k = 0; k < (oplen & ~7); k+=8) {
+                        hist[k+0]++;
+                        hist[k+1]++;
+                        hist[k+2]++;
+                        hist[k+3]++;
+                        hist[k+4]++;
+                        hist[k+5]++;
+                        hist[k+6]++;
+                        hist[k+7]++;
+                    }
+                    spos += k;
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, spos++)
+                    hist[k]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, spos++)
+                    hist[k]++;
+                i += oplen;
              } else {
-                idx = sam_index_load(data[i]->fp, argv[optind+i]);
+                // Simple to understand case, but slower.
+                // We use this only for reads with wrap-around.
+                int *hist = dh->hist[file];
+                k = 0;
+                if (overlap_clip) {
+                    if (i+oplen < overlap_clip) {
+                        i += oplen;
+                        break;
+                    } else if (i < overlap_clip) {
+                        oplen -= overlap_clip - i;
+                        spos += overlap_clip - i;
+                        i = overlap_clip;
+                    }
+                }
+                for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++)
+                    hist[i & hmask]+=qual[spos]>=min_qual;
+                for (; k < oplen; k++, i++, spos++)
+                    hist[i & hmask]++;
              }
-            if (idx == NULL) {
-                print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
-                status = EXIT_FAILURE;
-                goto depth_end;
+            break;
+
+        case BAM_CINS:
+        case BAM_CSOFT_CLIP:
+            spos += oplen;
+            break;
+
+        case BAM_CPAD:
+        case BAM_CHARD_CLIP:
+            // ignore
+            break;
+
+        default:
+            print_error("depth", "Unsupported cigar op '%d'", op);
+            return -1;
+        }
+    }
+
+    if (dh->end >= 0 && end_pos > dh->end)
+        end_pos = dh->end;
+    if (dh->end_pos[file] < end_pos)
+        dh->end_pos[file] = end_pos;
+
+    return 0;
+}
+
+// Hash on read name -> alignment end pos (hts_pos_t), keyed by string.
+// This permits a naive overlap removal.  Note it cannot analyse the
+// overlapping sequence and qualities, so the interaction of basecalls /
+// qualities and -Q cannot be applied here (unlike the full mpileup algorithm).
+KHASH_MAP_INIT_STR(olap_hash, hts_pos_t)
+typedef khash_t(olap_hash) olap_hash_t;
+
+// Core of the depth calculation.
+//
+// Reads one record at a time from each of the nfiles inputs (through itr[i]
+// when an iterator is supplied, otherwise sequentially), always picking the
+// record with the lowest (tid, pos) so that add_depth() receives a single
+// merged, coordinate-ordered stream.  Records are skipped when unmapped to a
+// reference, when matching opt->flag, when below opt->min_mqual, or when
+// shorter than opt->min_len.  With opt->remove_overlaps a per-file hash of
+// read name -> end position supplies the clip position for the second read
+// seen with the same name.
+//
+// fn holds the nfiles names printed in the optional header line.
+//
+// Returns 0 on success and a negative value on failure.  All exits go
+// through the err label so that every allocation made here is released.
+static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn,
+                          samFile **fp, hts_itr_t **itr, sam_hdr_t **h) {
+    int ret = -1, err = 1, i;
+    olap_hash_t **overlaps = NULL;
+    depth_hist dh = {0};
+
+    // An array of bam structs, one per input file, to hold the next entry
+    bam1_t **b = calloc(nfiles, sizeof(*b));
+    int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles;
+    if (!b || !finished)
+        goto err;
+
+    for (i = 0; i < nfiles; i++)
+        if (!(b[i] = bam_init1()))
+            goto err;
+
+    // Do we need one overlap hash per file? Or shared?
+    // On failure ret is still -1; jump to the shared cleanup rather than
+    // returning directly, which would leak b[], finished and the hashes.
+    if (opt->remove_overlaps) {
+        if (!(overlaps = calloc(nfiles, sizeof(*overlaps))))
+            goto err;
+        for (i = 0; i < nfiles; i++) {
+            if (!(overlaps[i] = kh_init(olap_hash)))
+                goto err;
+        }
+    }
+
+    // Create the initial histogram
+    dh.nfiles = nfiles;
+    dh.size = 0;
+    dh.hist = NULL;
+    dh.last_ref = -99;
+    dh.end_pos = NULL;
+    dh.last_output = itr && itr[0] ? itr[0]->beg : 0;
+    ks_initialize(&dh.ks);
+
+    // Clip results to region if specified
+    dh.beg = -1;
+    dh.end = -1;
+    dh.tid = 0;
+    if (itr && itr[0]) {
+        dh.tid = itr[0]->tid;
+        dh.beg = itr[0]->beg;
+        dh.end = itr[0]->end;
+    }
+
+    if (opt->header) {
+        fprintf(opt->out, "#CHROM\tPOS");
+        for (i = 0; i < nfiles; i++)
+            fprintf(opt->out, "\t%s", fn[i]);
+        fputc('\n', opt->out);
+    }
+
+    // Populate first record per file
+    for (i = 0; i < nfiles; i++) {
+        for(;;) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1)
+                goto err;
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
              }
-            data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
-            hts_idx_destroy(idx); // the index is not needed any more; free the memory
-            if (data[i]->iter == NULL) {
-                print_error("depth", "can't parse region \"%s\"", reg);
-                status = EXIT_FAILURE;
-                goto depth_end;
+
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            // Original samtools depth used the total sequence (l_qseq)
+            // including soft-clips.  This doesn't feel like a useful metric
+            // to be filtering on.  We now only count sequence bases that
+            // form the used part of the alignment.
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
              }
+
+            break;
          }
-        data[i]->flags = flags;
      }
-    if (print_header) {
-        fputs("#CHROM\tPOS", file_out);
-        for (i = 0; i < n; ++i) {
-            fputc('\t', file_out);
-            fputs(argv[optind+i], file_out);
+
+    // Loop through input files, merging in order so we're
+    // always adding the next record in sequence
+    while (to_go) {
+        // Find next record in file list
+        int best_tid = INT_MAX, best_file = 0;
+        hts_pos_t best_pos = HTS_POS_MAX;
+
+        for (i = 0; i < nfiles; i++) {
+            if (finished[i])
+                continue;
+            if (best_tid > b[i]->core.tid) {
+                best_tid = b[i]->core.tid;
+                best_pos = b[i]->core.pos;
+                best_file = i;
+            } else if (best_tid == b[i]->core.tid &&
+                       best_pos > b[i]->core.pos) {
+                best_pos = b[i]->core.pos;
+                best_file = i;
              }
-        fputc('\n', file_out);
          }
-    h = data[0]->hdr; // easy access to the header of the 1st BAM
-    if (reg) {
-        beg = data[0]->iter->beg; // and to the parsed region coordinates
-        end = data[0]->iter->end;
-        reg_tid = data[0]->iter->tid;
-    }
+        i = best_file;
  
-    // the core multi-pileup loop
-    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
-    if (0 < max_depth)
-        bam_mplp_set_maxcnt(mplp,max_depth);  // set maximum coverage depth
-    else if (!max_depth)
-        bam_mplp_set_maxcnt(mplp,INT_MAX);
-    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
-    plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
-    while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
-        if (pos < beg || pos >= end) continue; // out of range; skip
-        if (tid >= sam_hdr_nref(h)) continue;     // diff number of @SQ lines per file?
-        if (all) {
-            while (tid > last_tid) {
-                if (last_tid >= 0 && !reg) {
-                    // Deal with remainder or entirety of last tid.
-                    while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                        // Horribly inefficient, but the bed API is an obfuscated black box.
-                        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                            continue;
-                        fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                        fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                        for (i = 0; i < n; i++)
-                            fputc('\t', file_out), fputc('0', file_out);
-                        fputc('\n', file_out);
-                    }
+        hts_pos_t clip = 0;
+        if (overlaps && (b[i]->core.flag & BAM_FPAIRED) &&
+            !(b[i]->core.flag & BAM_FMUNMAP)) {
+            khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i]));
+            if (k == kh_end(overlaps[i])) {
+                // not seen before
+                hts_pos_t endpos = bam_endpos(b[i]);
+
+                // Don't add if mate location is known and can't overlap.
+                if (b[i]->core.mpos == -1 ||
+                    (b[i]->core.tid == b[i]->core.mtid &&
+                     b[i]->core.mpos <= endpos)) {
+                    // The hash must own its key: b[i] is refilled with the
+                    // next record below.  Duplicate first so an allocation
+                    // failure never leaves a NULL key in the table.
+                    char *name = strdup(bam_get_qname(b[i]));
+                    if (!name) {
+                        ret = -1;
+                        goto err;
+                    }
+                    k = kh_put(olap_hash, overlaps[i], name, &ret);
+                    if (ret < 0) {
+                        free(name);
+                        ret = -1;
+                        goto err;
+                    }
+                    kh_value(overlaps[i], k) = endpos;
                  }
-                last_tid++;
-                last_pos = -1;
-                if (all < 2)
-                    break;
+            } else {
+                // seen before
+                clip = kh_value(overlaps[i], k);
+                free((char *)kh_key(overlaps[i], k));
+                kh_del(olap_hash, overlaps[i], k);
              }
+        }
  
-            // Deal with missing portion of current tid
-            while (++last_pos < pos) {
-                if (last_pos < beg) continue; // out of range; skip
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        // Add the next merged BAM record to the depth plot
+        if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) {
+            ret = -1;
+            goto err;
+        }
+
+        // Populate next record from this file
+        for(;!finished[i];) {
+            ret = itr && itr[i]
+                ? sam_itr_next(fp[i], itr[i], b[i])
+                : sam_read1(fp[i], h[i], b[i]);
+            if (ret < -1) {
+                ret = -1;
+                goto err;
+            }
+            if (ret == -1) {
+                to_go--;
+                finished[i] = 1;
+                break;
              }
  
-            last_tid = tid;
-            last_pos = pos;
-        }
-        if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue;
-        fputs(sam_hdr_tid2name(h, tid), file_out);
-        fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized fprintf(samtools_stdout, ) would be faster
-        for (i = 0; i < n; ++i) { // base level filters have to go here
-            int j, m = 0;
-            for (j = 0; j < n_plp[i]; ++j) {
-                const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
-                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
-                else if (p->qpos < p->b->core.l_qseq &&
-                         bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+            if (b[i]->core.tid < 0)
+                continue;
+            if (b[i]->core.flag & opt->flag)
+                continue;
+            if (b[i]->core.qual < opt->min_mqual)
+                continue;
+
+            if (opt->min_len) {
+                if (qlen_used(b[i]) < opt->min_len)
+                    continue;
              }
-            fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
+
+            break;
          }
-        fputc('\n', file_out);
      }
-    if (ret < 0) status = EXIT_FAILURE;
-    free(n_plp); free(plp);
-    bam_mplp_destroy(mplp);
-
-    if (all) {
-        // Handle terminating region
-        if (last_tid < 0 && reg) {
-            last_tid = reg_tid;
-            last_pos = beg-1;
+
+    // Tidy up end.
+    ret = add_depth(opt, &dh, h[0], NULL, 0, 0);
+    err = 0;
+
+ err:
+    if (ret == 0 && err)
+        ret = -1;
+
+    // b may be NULL here if its allocation was what failed.
+    for (i = 0; i < nfiles; i++) {
+        if (b && b[i])
+            bam_destroy1(b[i]);
+        if (dh.hist && dh.hist[i])
+            free(dh.hist[i]);
+    }
+    free(b);
+    free(finished);
+    ks_free(&dh.ks);
+    free(dh.hist);
+    free(dh.end_pos);
+    if (overlaps) {
+        khiter_t k;
+        for (i = 0; i < nfiles; i++) {
+            if (!overlaps[i])
+                continue;
+            for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++)
+                if (kh_exist(overlaps[i], k))
+                    free((char *)kh_key(overlaps[i], k));
+            kh_destroy(olap_hash, overlaps[i]);
          }
-        while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) {
-            while (++last_pos < sam_hdr_tid2len(h, last_tid)) {
-                if (last_pos >= end) break;
-                if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0)
-                    continue;
-                fputs(sam_hdr_tid2name(h, last_tid), file_out);
-                fprintf(file_out, "\t%"PRIhts_pos, last_pos+1);
-                for (i = 0; i < n; i++)
-                    fputc('\t', file_out), fputc('0', file_out);
-                fputc('\n', file_out);
+        free(overlaps);
+    }
+
+    return ret;
+}
+
+// Print the "samtools depth" usage text to fp, then terminate via
+// samtools_exit() with exit_status.  Each option is listed exactly once.
+static void usage_exit(FILE *fp, int exit_status)
+{
+    fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n");
+    fprintf(fp, "\nOptions:\n");
+    fprintf(fp, "  -a           Output all positions (including zero depth)\n");
+    fprintf(fp, "  -a -a, -aa   Output absolutely all positions, including unused ref seqs\n");
+    fprintf(fp, "  -r REG       Specify a region in chr or chr:from-to syntax\n");
+    fprintf(fp, "  -b FILE      Use bed FILE for list of regions\n");
+    fprintf(fp, "  -f FILE      Specify list of input BAM/SAM/CRAM filenames\n");
+    fprintf(fp, "  -X           Use custom index files (in -X *.bam *.bam.bai order)\n");
+    fprintf(fp, "  -g INT       Remove specified flags from default flag filter\n");
+    fprintf(fp, "  -G INT       Add specified flags to the default flag filter\n");
+    fprintf(fp, "  -H           Print a file header line\n");
+    fprintf(fp, "  -l INT       Minimum read length [0]\n");
+    // User-facing text: name the stream, not the internal FILE* variable.
+    fprintf(fp, "  -o FILE      Write output to FILE [stdout]\n");
+    fprintf(fp, "  -q INT       Minimum base quality [0]\n");
+    fprintf(fp, "  -Q INT       Minimum mapping quality [0]\n");
+    fprintf(fp, "  -J           Include reads with deletions in depth computation\n");
+    fprintf(fp, "  -s           Do not count overlapping reads within a template\n");
+    sam_global_opt_help(fp, "-.---@-.");
+    samtools_exit(exit_status);
+}
+
+// Entry point for "samtools depth".
+//
+// Parses the command line into a depth_opt, resolves the list of input
+// files (from argv, or from the -f list file), opens each one together with
+// its header and, when -r is given, an iterator over the requested region.
+// With -X the trailing half of the filename arguments are the index files
+// for the leading half.  The opened files are then handed to
+// fastdepth_core().
+//
+// Returns 0 on success and non-zero on failure.
+int main_depth(int argc, char *argv[])
+{
+    int nfiles, i;
+    samFile **fp;
+    sam_hdr_t **header;
+    int c, has_index_file = 0;
+    char *file_list = NULL, **fn = NULL;
+    depth_opt opt = {
+        .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+        .min_qual = 0,
+        .min_mqual = 0,
+        .skip_del = 1,
+        .header = 0,
+        .min_len = 0,
+        .out = samtools_stdout,
+        .all_pos = 0,
+        .remove_overlaps = 0,
+        .reg = NULL,
+        .bed = NULL,
+    };
+
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s",
+                            lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'a':
+            opt.all_pos++;
+            break;
+
+        case 'b':
+            opt.bed = bed_read(optarg);
+            if (!opt.bed) {
+                print_error_errno("depth", "Could not read file \"%s\"",
+                                  optarg);
+                return 1;
              }
-            last_tid++;
-            last_pos = -1;
-            if (all < 2 || reg)
+            break;
+
+        case 'f':
+            file_list = optarg;
+            break;
+
+        case 'd':
+        case 'm':
+            // depth limit - now ignored
+            break;
+
+        case 'g':
+            opt.flag &= ~bam_str2flag(optarg);
+            break;
+        case 'G':
+            opt.flag |= bam_str2flag(optarg);
+            break;
+
+        case 'l':
+            opt.min_len = atoi(optarg);
+            break;
+
+        case 'H':
+            opt.header = 1;
+            break;
+
+        case 'q':
+            opt.min_qual = atoi(optarg);
+            break;
+        case 'Q':
+            opt.min_mqual = atoi(optarg);
+            break;
+
+        case 'J':
+            opt.skip_del = 0;
+            break;
+
+        case 'o':
+            if (opt.out != samtools_stdout)
                  break;
+            opt.out = fopen(optarg, "w");
+            if (!opt.out) {
+                print_error_errno("depth", "Cannot open \"%s\" for writing.",
+                                  optarg);
+                return EXIT_FAILURE;
+            }
+            break;
+
+        case 'r':
+            opt.reg = optarg;
+            break;
+
+        case 's':
+            opt.remove_overlaps = 1;
+            break;
+
+        case 'X':
+            has_index_file = 1;
+            break;
+
+        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?':
+            usage_exit(samtools_stderr, EXIT_FAILURE);
          }
      }
  
-depth_end:
-    if (((file_out != samtools_stdout)? fclose(file_out) : fflush(file_out)) != 0) {
-        if (status == EXIT_SUCCESS) {
-            if (file_out != samtools_stdout)
-                print_error_errno("depth", "error on closing \"%s\"", output_file);
-            else
-                print_error_errno("depth", "error on flushing standard output");
-            status = EXIT_FAILURE;
+    if (argc < optind+1 && !file_list) {
+        if (argc == optind)
+            usage_exit(samtools_stdout, EXIT_SUCCESS);
+        else
+            usage_exit(samtools_stderr, EXIT_FAILURE);
+    }
+
+    if (file_list) {
+        if (has_index_file) {
+            print_error("depth", "The -f option cannot be combined with -X");
+            return 1;
+        }
+        if (read_file_list(file_list, &nfiles, &fn))
+            return 1;
+        argv = fn;
+        argc = nfiles;
+        optind = 0;
+    } else {
+        nfiles = argc - optind;
+    }
+
+    if (has_index_file) {
+        // Filenames come as N data files followed by N index files, so the
+        // total must be even.
+        if (nfiles%2) {
+            print_error("depth", "-X needs one index specified per bam file");
+            return 1;
          }
+        nfiles /= 2;
+    }
+    fp = malloc(nfiles * sizeof(*fp));
+    header = malloc(nfiles * sizeof(*header));
+    if (!fp || !header) {
+        print_error_errno("depth", "Out of memory");
+        return 1;
      }
  
-    for (i = 0; i < n && data[i]; ++i) {
-        sam_hdr_destroy(data[i]->hdr);
-        if (data[i]->fp) sam_close(data[i]->fp);
-        hts_itr_destroy(data[i]->iter);
-        free(data[i]);
+    hts_itr_t **itr = NULL;
+    if (opt.reg) {
+        itr = calloc(nfiles, sizeof(*itr));
+        if (!itr)
+            return 1;
      }
-    free(data); free(reg);
-    if (bed) bed_destroy(bed);
-    if ( file_list )
-    {
-        for (i=0; i<n; i++) free(fn[i]);
+
+    for (i = 0; i < nfiles; i++, optind++) {
+        fp[i] = sam_open_format(argv[optind], "r", &ga.in);
+        if (fp[i] == NULL) {
+            print_error_errno("depth",
+                              "Cannot open input file \"%s\"", argv[optind]);
+            return 1;
+        }
+
+        if (ga.nthreads > 0)
+            hts_set_threads(fp[i], ga.nthreads);
+
+        if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS,
+                        SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR
+                        | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT
+                                               : 0)
+                        | (opt.min_mqual       ? SAM_MAPQ  : 0)
+                        | (opt.min_len         ? SAM_SEQ   : 0)
+                        | (opt.min_qual        ? SAM_QUAL  : 0))) {
+            fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
+            return 1;
+        }
+
+        if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) {
+            fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+            return 1;
+        }
+
+        // FIXME: what if headers differ?
+        header[i] = sam_hdr_read(fp[i]);
+        // Check this file's header, not the (never NULL here) array itself.
+        if (header[i] == NULL) {
+            fprintf(samtools_stderr, "Failed to read header for \"%s\"\n",
+                    argv[optind]);
+            return 1;
+        }
+
+        if (opt.reg) {
+            hts_idx_t *idx = has_index_file
+                ? sam_index_load2(fp[i], argv[optind], argv[optind+nfiles])
+                : sam_index_load(fp[i], argv[optind]);
+            if (!idx) {
+                print_error("depth", "cannot load index for \"%s\"",
+                            argv[optind]);
+                return 1;
+            }
+            if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) {
+                print_error("depth", "cannot parse region \"%s\"", opt.reg);
+                return 1;
+            }
+            hts_idx_destroy(idx);
+        }
+    }
+
+    // optind has advanced past the nfiles data files, so optind-nfiles is
+    // the first of them in every mode (with -X, argc-nfiles would instead
+    // point at the index filenames).
+    int ret = fastdepth_core(&opt, nfiles, &argv[optind-nfiles], fp, itr, header)
+        ? 1 : 0;
+
+    for (i = 0; i < nfiles; i++) {
+        sam_hdr_destroy(header[i]);
+        sam_close(fp[i]);
+        if (itr && itr[i])
+            hts_itr_destroy(itr[i]);
+    }
+    free(header);
+    free(fp);
+    free(itr);
+    if (file_list) {
+        for (i=0; i<nfiles; i++)
+            free(fn[i]);
          free(fn);
      }
+    if (opt.bed)
+        bed_destroy(opt.bed);
      sam_global_args_free(&ga);
-    return status;
+    if (opt.out != samtools_stdout) fclose(opt.out);
+    return ret;
  }
  
  #ifdef _MAIN_BAM2DEPTH
diff --git a/samtools/bam_addrprg.c b/samtools/bam_addrprg.c

index 58c712f75fc87352ffe37f1a8778d210683e194a..06c31474a1e83ab488cab36daf782902d45da562 100644 (file)
--- a/samtools/bam_addrprg.c
+++ b/samtools/bam_addrprg.c
@@ -1,6 +1,6 @@
  /* bam_addrprg.c -- samtools command to add or replace readgroups.
  
-   Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited.
+   Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
  
     Author: Martin O. Pollard <mp15@sanger.ac.uk>
  
@@ -51,6 +51,8 @@ struct parsed_opts {
      rg_mode mode;
      sam_global_args ga;
      htsThreadPool p;
+    int uncompressed;
+    int overwrite_hdr_rg;
  };
  
  struct state;
@@ -164,13 +166,15 @@ static char* get_rg_id(const char *line)
  static void usage(FILE *fp)
  {
      fprintf(fp,
-            "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+            "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
              "\n"
              "Options:\n"
              "  -m MODE   Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
              "  -o FILE   Where to write output to [stdout]\n"
              "  -r STRING @RG line text\n"
              "  -R STRING ID of @RG line in existing header to use\n"
+            "  -u        Output uncompressed data\n"
+            "  -w        Overwrite an existing @RG line\n"
              "  --no-PG   Do not add a PG line\n"
              );
      sam_global_opt_help(fp, "..O..@..");
@@ -198,7 +202,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
      };
      kstring_t rg_line = {0,0,NULL};
  
-    while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
+    while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
          switch (n) {
              case 'r':
                  // Are we adding to existing rg line?
@@ -235,6 +239,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
              case 1:
                  retval->no_pg = 1;
                  break;
+            case 'u':
+                retval->uncompressed = 1;
+                break;
+            case 'w':
+                retval->overwrite_hdr_rg = 1;
+                break;
              case '?':
                  usage(stderr);
                  free(retval);
@@ -314,7 +324,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read)
  }
  
  static bool init(const parsed_opts_t* opts, state_t** state_out) {
-    char output_mode[8] = "w";
+    char output_mode[9] = "w";
      state_t* retval = (state_t*) calloc(1, sizeof(state_t));
  
      if (retval == NULL) {
@@ -332,8 +342,12 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
      retval->input_header = sam_hdr_read(retval->input_file);
  
      retval->output_header = sam_hdr_dup(retval->input_header);
+
+    if (opts->uncompressed)
+        strcat(output_mode, "0");
      if (opts->output_name) // File format auto-detection
-        sam_open_mode(output_mode + 1, opts->output_name, NULL);
+        sam_open_mode(output_mode + strlen(output_mode),
+                      opts->output_name, NULL);
      retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
  
      if (retval->output_file == NULL) {
@@ -351,10 +365,20 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
          // Check does not already exist
          kstring_t hdr_line = { 0, 0, NULL };
          if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
-            fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
-            free(hdr_line.s);
-            return false;
+            if (opts->overwrite_hdr_rg) {
+                if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
+                    fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
+                    ks_free(&hdr_line);
+                    return false;
+                }
+            } else {
+                fprintf(stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
+                ks_free(&hdr_line);
+                return false;
+            }
          }
+        ks_free(&hdr_line);
+
          if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
              fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
              return false;
@@ -374,7 +398,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
                  return false;
              }
              retval->rg_id = strdup(opts->rg_id);
-            free(hdr_line.s);
+            ks_free(&hdr_line);
          } else {
              kstring_t rg_id = { 0, 0, NULL };
              if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c

index ba1cb0837b83907dbbba93c47356754f0a710f70..88ce7e35be6c1c7a55384e23c9d842d91411b7aa 100644 (file)
--- a/samtools/bam_addrprg.c.pysam.c
+++ b/samtools/bam_addrprg.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* bam_addrprg.c -- samtools command to add or replace readgroups.
  
-   Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited.
+   Copyright (c) 2013, 2015-2017, 2019-2021 Genome Research Limited.
  
     Author: Martin O. Pollard <mp15@sanger.ac.uk>
  
@@ -53,6 +53,8 @@ struct parsed_opts {
      rg_mode mode;
      sam_global_args ga;
      htsThreadPool p;
+    int uncompressed;
+    int overwrite_hdr_rg;
  };
  
  struct state;
@@ -166,13 +168,15 @@ static char* get_rg_id(const char *line)
  static void usage(FILE *fp)
  {
      fprintf(fp,
-            "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-o <output.bam>] <input.bam>\n"
+            "Usage: samtools addreplacerg [options] [-r <@RG line> | -R <existing id>] [-m orphan_only|overwrite_all] [-o <output.bam>] <input.bam>\n"
              "\n"
              "Options:\n"
              "  -m MODE   Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n"
              "  -o FILE   Where to write output to [samtools_stdout]\n"
              "  -r STRING @RG line text\n"
              "  -R STRING ID of @RG line in existing header to use\n"
+            "  -u        Output uncompressed data\n"
+            "  -w        Overwrite an existing @RG line\n"
              "  --no-PG   Do not add a PG line\n"
              );
      sam_global_opt_help(fp, "..O..@..");
@@ -200,7 +204,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
      };
      kstring_t rg_line = {0,0,NULL};
  
-    while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
+    while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) {
          switch (n) {
              case 'r':
                  // Are we adding to existing rg line?
@@ -237,6 +241,12 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
              case 1:
                  retval->no_pg = 1;
                  break;
+            case 'u':
+                retval->uncompressed = 1;
+                break;
+            case 'w':
+                retval->overwrite_hdr_rg = 1;
+                break;
              case '?':
                  usage(samtools_stderr);
                  free(retval);
@@ -316,7 +326,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read)
  }
  
  static bool init(const parsed_opts_t* opts, state_t** state_out) {
-    char output_mode[8] = "w";
+    char output_mode[9] = "w";
      state_t* retval = (state_t*) calloc(1, sizeof(state_t));
  
      if (retval == NULL) {
@@ -334,8 +344,12 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
      retval->input_header = sam_hdr_read(retval->input_file);
  
      retval->output_header = sam_hdr_dup(retval->input_header);
+
+    if (opts->uncompressed)
+        strcat(output_mode, "0");
      if (opts->output_name) // File format auto-detection
-        sam_open_mode(output_mode + 1, opts->output_name, NULL);
+        sam_open_mode(output_mode + strlen(output_mode),
+                      opts->output_name, NULL);
      retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
  
      if (retval->output_file == NULL) {
@@ -353,10 +367,20 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
          // Check does not already exist
          kstring_t hdr_line = { 0, 0, NULL };
          if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) {
-            fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
-            free(hdr_line.s);
-            return false;
+            if (opts->overwrite_hdr_rg) {
+                if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) {
+                    fprintf(samtools_stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id);
+                    ks_free(&hdr_line);
+                    return false;
+                }
+            } else {
+                fprintf(samtools_stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id);
+                ks_free(&hdr_line);
+                return false;
+            }
          }
+        ks_free(&hdr_line);
+
          if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) {
              fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id);
              return false;
@@ -376,7 +400,7 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
                  return false;
              }
              retval->rg_id = strdup(opts->rg_id);
-            free(hdr_line.s);
+            ks_free(&hdr_line);
          } else {
              kstring_t rg_id = { 0, 0, NULL };
              if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) {
diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c

new file mode 100644 (file)

index 0000000..f3fe2bc
--- /dev/null
+++ b/samtools/bam_ampliconclip.c
@@ -0,0 +1,1079 @@
+/*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
+                          from the 5' end.
+
+    Copyright (C) 2020-2021 Genome Research Ltd.
+
+    Authors: Andrew Whitwham <aw7@sanger.ac.uk>
+             Rob Davies <rmd+git@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "htslib/thread_pool.h"
+#include "sam_opts.h"
+#include <htslib/hts.h>
+#include "htslib/hfile.h"
+#include "htslib/kstring.h"
+#include "htslib/sam.h"
+#include "samtools.h"
+#include "bam_ampliconclip.h"
+
+/* How clipped bases are represented in the output record:
+   soft_clip keeps the bases in SEQ/QUAL and marks them with soft-clip CIGAR
+   operations; hard_clip removes them from SEQ/QUAL and marks them with a
+   hard-clip CIGAR operation. */
+typedef enum {
+    soft_clip,
+    hard_clip
+} clipping_type;
+
+/* Run-time options for the clipping code. */
+typedef struct {
+    int add_pg;         // non-zero: add a @PG line to the output header
+    int use_strand;     // non-zero: read strand from the BED file and only match sites on the read's strand
+    int write_clipped;
+    int mark_fail;      // non-zero: set BAM_FQCFAIL on reads that match no clip site
+    int both;           // non-zero: try to clip both ends of each read rather than only the strand-dependent end
+    int fail_len;
+    int filter_len;
+    int unmapped;
+    int oa_tag;         // non-zero: store the original alignment in an OA tag on clipped reads
+    int del_tag;        // non-zero: delete the NM and MD tags from clipped reads
+    int tol;            // tolerance, in bases, applied when matching a read end to a site
+    char *arg_list;     // when set, written as the CL value of the @PG line
+    char *stats_file;
+    char *rejects_file;
+} cl_param_t;
+
+
+/* qsort() comparator: orders BED entries by ascending right-hand coordinate. */
+static int bed_entry_sort(const void *av, const void *bv) {
+    const bed_entry_t *lhs = av;
+    const bed_entry_t *rhs = bv;
+
+    if (lhs->right < rhs->right)
+        return -1;
+    if (lhs->right > rhs->right)
+        return 1;
+    return 0;
+}
+
+
+/*
+ * Read amplicon primer regions from a BED file into a hash keyed by
+ * reference name.
+ *
+ * infile      - path of the BED file, opened with hopen().
+ * get_strand  - when non-zero every data line must also supply a strand in
+ *               column 6 ('+' or '-'); it is stored in the entry's rev field.
+ * sort_by_pos - when non-zero each per-reference list is sorted by its
+ *               right-hand coordinate after loading.
+ * bed_lists   - hash that receives one bed_entry_list_t per reference; keys
+ *               are strdup()ed copies of the reference names.
+ *
+ * Returns 0 on success.  Returns 1 if the file cannot be opened, a line is
+ * malformed, memory runs out, or the hash holds no entries afterwards.
+ * Entries added before a failure stay in bed_lists.
+ */
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+    hFILE *fp;
+    int line_count = 0, ret;
+    int64_t left, right;
+    kstring_t line = KS_INITIALIZE;
+    bed_entry_list_t *list;
+    khiter_t bed_itr;
+
+    if ((fp = hopen(infile, "r")) == NULL) {
+        print_error_errno("amplicon", "unable to open file %s.", infile);
+        return 1;
+    }
+
+    char ref[1024];
+
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        line_count++;
+        int hret;
+        char strand;
+
+        // Skip blank lines, comments and UCSC track/browser lines
+        if (line.l == 0 || *line.s == '#') continue;
+        if (strncmp(line.s, "track ", 6) == 0) continue;
+        if (strncmp(line.s, "browser ", 8) == 0) continue;
+
+        if (get_strand) {
+            // Columns 4 and 5 are skipped; column 6 holds the strand
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
+                       ref, &left, &right, &strand) != 4) {
+                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                                    line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
+                       ref, &left, &right) != 3) {
+                fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                                    line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        bed_itr = kh_get(bed_list_hash, bed_lists, ref);
+
+        if (bed_itr == kh_end(bed_lists)) { // new ref entry
+            char *ref_name = strdup(ref); // need a copy for the hash key
+
+            if (!ref_name) {
+                fprintf(stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
+                ret = 1;
+                goto error;
+            }
+
+            bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
+
+            if (hret > 0) {
+                list = &kh_val(bed_lists, bed_itr);
+
+                // initialise the new hash entry
+                list->longest = 0;
+                list->size = 0;
+                list->length = 0;
+                list->bp = NULL;
+            } else {
+                // The key was not stored, so the table does not own it
+                free(ref_name);
+                fprintf(stderr, "[amplicon] error: ref hashing failure.\n");
+                ret = 1;
+                goto error;
+            }
+        } else { // existing ref
+            list = &kh_val(bed_lists, bed_itr);
+        }
+
+        // Grow the entry array by roughly 1.5x once it is full
+        if (list->length == list->size) {
+           bed_entry_t *tmp;
+
+           list->size += list->size / 2 + 256;
+
+           if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) {
+               fprintf(stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
+               ret = 1;
+               goto error;
+           }
+
+           list->bp = tmp;
+        }
+
+        list->bp[list->length].left  = left;
+        list->bp[list->length].right = right;
+
+        if (get_strand) {
+            if (strand == '+') {
+                list->bp[list->length].rev = 0;
+            } else if (strand == '-') {
+                list->bp[list->length].rev = 1;
+            } else {
+                fprintf(stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
+                            line_count, strand);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        // Track the widest region seen for this reference
+        if (right - left > list->longest)
+            list->longest = right - left;
+
+        list->length++;
+    }
+
+    if (sort_by_pos) {
+        for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
+            if (kh_exist(bed_lists, bed_itr)) {
+                list = &kh_val(bed_lists, bed_itr);
+                qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
+            }
+        }
+    }
+
+    if (kh_size(bed_lists) > 0) {// any entries
+        ret = 0;
+    } else {
+        ret = 1;
+    }
+
+error:
+    ks_free(&line);
+
+    if (hclose(fp) != 0) {
+        fprintf(stderr, "[amplicon] warning: failed to close %s\n", infile);
+    }
+
+    return ret;
+}
+
+
+/* Free every per-reference entry array and key string held in the hash,
+   then destroy the hash table itself. */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
+    khiter_t k = kh_begin(hash);
+
+    while (k != kh_end(hash)) {
+        if (kh_exist(hash, k)) {
+            free(kh_val(hash, k).bp);
+            free((char *)kh_key(hash, k));
+            kh_key(hash, k) = NULL;
+        }
+        ++k;
+    }
+
+    kh_destroy(bed_list_hash, hash);
+}
+
+
+/*
+ * Work out how many bases should be clipped from a read end lying at pos.
+ *
+ * sites      - BED entries for the read's reference; the binary search below
+ *              relies on them being ordered by right-hand coordinate.
+ * pos        - coordinate of the read end being tested.
+ * is_rev     - non-zero when testing the right-hand (reverse) end.
+ * use_strand - non-zero to ignore sites whose rev flag differs from is_rev.
+ * longest    - widest site in the list; bounds how far the scan must run.
+ * param      - supplies the matching tolerance (param->tol).
+ *
+ * A forward end matches a site when pos is within [left - tol, right] and
+ * the clip length is right - pos.  A reverse end matches when pos is within
+ * [left, right + tol] and the clip length is pos - left.
+ *
+ * Returns the largest clip length over all matching sites, or 0 if none match.
+ */
+static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
+                              int is_rev, int use_strand, int64_t longest,
+                              cl_param_t *param) {
+    int i, size;  // may need this to be variable
+    int tol = param->tol;
+    int l = 0, mid = sites->length / 2, r = sites->length;
+    // Kept at full hts_pos_t width: an int here would truncate coordinates
+    // above INT_MAX and send the binary search to the wrong place.
+    hts_pos_t pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
+
+    // Binary search for a starting index: entries before l end at or
+    // before pos_tol.
+    while (r - l > 1) {
+        if (sites->bp[mid].right <= pos_tol) {
+            l = mid;
+        } else {
+            r = mid;
+        }
+        mid = (l + r) / 2;
+    }
+
+    size = 0;
+
+    for (i = l; i < sites->length; i++) {
+        hts_pos_t mod_left, mod_right;
+
+        if (use_strand && is_rev != sites->bp[i].rev)
+            continue;
+
+        // Widen the site by the tolerance on the side the read end approaches from
+        if (is_rev) {
+            mod_left = sites->bp[i].left;
+            mod_right = sites->bp[i].right + tol;
+        } else {
+            if (sites->bp[i].left > tol) {
+                mod_left = sites->bp[i].left - tol;
+            } else {
+                mod_left = 0;
+            }
+            mod_right = sites->bp[i].right;
+        }
+
+        // Stop scanning once this site ends beyond pos + longest + tol
+        if (pos + longest + tol < mod_right)
+            break;
+
+        if (pos >= mod_left && pos <= mod_right) {
+            if (is_rev) {
+                if (size < pos - sites->bp[i].left) {
+                    size = pos - sites->bp[i].left;
+                }
+            } else {
+                if (size < sites->bp[i].right - pos) {
+                    size = sites->bp[i].right - pos;
+                }
+            }
+        }
+    }
+
+    return size;
+}
+
+
+/*
+ * Clip reference bases from the left-hand end of an alignment.
+ *
+ * rec      - record to clip; it is only read.
+ * rec_out  - record that receives the clipped copy; its data buffer is
+ *            enlarged when it is smaller than rec->l_data + 8 bytes.
+ * bases    - number of reference bases to remove from the left end.
+ * clipping - soft_clip keeps SEQ/QUAL and adds a soft-clip CIGAR operation;
+ *            hard_clip drops the clipped bases from SEQ/QUAL and adds a
+ *            hard-clip operation.
+ *
+ * When hard clipping at least the whole read, the output keeps the name and
+ * aux data but has no CIGAR or sequence.  Otherwise POS is advanced past the
+ * removed reference bases.
+ *
+ * Returns 0 on success, 1 if memory for the output record cannot be obtained.
+ */
+static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                         clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    uint32_t i, j, odd_base = 0;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    hts_pos_t new_pos = rec->core.pos;
+    uint32_t cig_type, cig_op;
+
+    // The extra 8 bytes are presumably room for the (at most two) clip
+    // operations that may be added to the CIGAR.
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(stderr, "[ampliconclip] error: could not allocate memory for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+        return 0;
+    }
+
+    // Modify CIGAR
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Consume whole operations until one is longer than what is left to
+    // remove, counting existing hard clips and the query bases passed over.
+    for (i = 0;  i < rec->core.n_cigar; i++) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;
+                }
+                new_pos += bam_cigar_oplen(orig_cigar[i]);
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    if (i < rec->core.n_cigar) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+
+        // account for the last operation
+        if (cig_type & 2) {
+            new_pos += ref_remove;
+        }
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+    } else {
+        // Every operation was consumed, so the whole query is clipped
+        qry_removed = rec->core.l_qseq;
+    }
+
+    // Emit the leading clip operation(s)
+    j = 0;
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+        }
+        if (qry_removed > 0) {
+            new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+        }
+    }
+
+    if (i < rec->core.n_cigar
+        && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
+        // Shortened remainder of the operation the clip ended in
+        new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+
+        // fill in the rest of the cigar
+        i++;
+
+        for (; i < rec->core.n_cigar; i++) {
+            new_cigar[j++] = orig_cigar[i];
+        }
+    }
+
+    rec_out->core.n_cigar = j;
+
+    if (clipping == soft_clip) {
+        qry_removed = 0; // Copy all the sequence and confidence values
+        odd_base = 1; // account for an odd number of bases
+    }
+
+    new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+    // Copy remaining SEQ
+    if ((qry_removed & 1) == 0) {
+        memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
+                (rec->core.l_qseq - qry_removed + odd_base) / 2);
+    } else {
+        // An odd number of bases was removed, so every remaining base has
+        // to move by one nibble.
+        uint8_t *in = orig_seq + qry_removed / 2;
+        uint8_t *out = bam_get_seq(rec_out);
+        uint32_t i;
+        for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) {
+            *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
+            in++;
+        }
+        if (i < rec->core.l_qseq) {
+            *out++ = (in[0] & 0x0f) << 4;
+        }
+        assert(out == new_qual);
+    }
+
+    // Copy remaining QUAL, skipping the values of the removed bases so the
+    // qualities stay aligned with the SEQ copied above
+    memmove(new_qual, orig_qual + qry_removed, rec->core.l_qseq - qry_removed);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Move AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    // put in new pos
+    rec_out->core.pos = new_pos;
+
+    return 0;
+}
+
+
+/*
+ * Clip reference bases from the right-hand end of an alignment.
+ *
+ * rec      - record to clip; it is only read.
+ * rec_out  - record that receives the clipped copy; its data buffer is
+ *            enlarged when it is smaller than rec->l_data + 8 bytes.
+ * bases    - number of reference bases to remove from the right end.
+ * clipping - soft_clip keeps SEQ/QUAL and adds a soft-clip CIGAR operation;
+ *            hard_clip drops the clipped bases from SEQ/QUAL and adds a
+ *            hard-clip operation.
+ *
+ * When hard clipping at least the whole read, the output keeps the name and
+ * aux data but has no CIGAR or sequence.  POS is never changed.
+ *
+ * Returns 0 on success, 1 if memory for the output record cannot be obtained.
+ */
+static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                          clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint32_t new_n_cigar = 0;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    int32_t i;
+    int32_t j;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    uint32_t cig_type, cig_op;
+
+    // The extra 8 bytes are presumably room for the (at most two) clip
+    // operations that may be added to the CIGAR.
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(stderr, "[ampliconclip] error: could not allocate memory for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+        return 0;
+    }
+
+    // Modify CIGAR here
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Walk the CIGAR backwards, consuming whole operations until one is
+    // longer than what is left to remove.
+    for (i = rec->core.n_cigar - 1;  i >= 0; --i) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;
+                }
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    // j becomes the index of the last operation in the new CIGAR; the new
+    // CIGAR is then written from the back towards the front.
+    if (i >= 0) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+        j = i;
+        if (qry_removed > 0) j++;
+        if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
+    } else {
+        // Every operation was consumed, so the whole query is clipped
+        qry_removed = rec->core.l_qseq;
+        j = 0;
+        // Only leave a slot in front of the hard clip when a soft clip will
+        // be written into it.  Without the qry_removed test a record with no
+        // stored sequence left j at 1 and orig_cigar[-1] was read below.
+        if (hardclip > 0 && clipping == soft_clip && qry_removed > 0) j++;
+    }
+
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+        new_n_cigar++;
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+            new_n_cigar++;
+            if (qry_removed > 0) --j;
+        }
+        if (qry_removed > 0) {
+            new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+            new_n_cigar++;
+        }
+    }
+
+    if (j > 0) {
+        // Shortened remainder of the operation the clip ended in
+        new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+        new_n_cigar++;
+    }
+
+    // fill in the rest of the cigar
+    while (j > 0) {
+        new_cigar[--j] = orig_cigar[--i];
+        new_n_cigar++;
+    }
+
+    rec_out->core.n_cigar = new_n_cigar;
+
+    if (clipping == soft_clip)
+        qry_removed = 0; // Copy all the sequence and confidence values
+
+    new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+    // Copy remaining SEQ
+    memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
+
+    // Copy remaining QUAL
+    memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Copy AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    return 0;
+}
+
+
+/* Total length of the CIGAR operations that consume query bases, leaving
+   out soft clips. */
+static hts_pos_t active_query_len(bam1_t *b) {
+    const uint32_t *ops = bam_get_cigar(b);
+    hts_pos_t total = 0;
+    int idx = 0;
+
+    while (idx < b->core.n_cigar) {
+        uint32_t op = bam_cigar_op(ops[idx]);
+
+        if (op != BAM_CSOFT_CLIP && (bam_cigar_type(op) & 1))
+            total += bam_cigar_oplen(ops[idx]);
+
+        idx++;
+    }
+
+    return total;
+}
+
+
+/* Exchange the record pointers stored in *a and *b. */
+static inline void swap_bams(bam1_t **a, bam1_t **b) {
+    bam1_t *held = *b;
+
+    *b = *a;
+    *a = held;
+}
+
+
+// Build the text of an OA:Z tag describing the record's alignment before
+// clipping.  Format of each entry: RNAME,POS,strand,CIGAR,MAPQ,NM;
+// The NM field is left empty when the record has no NM tag.  If the record
+// already has an OA tag, its text is kept and the new entry is appended.
+//
+// orig   - record to describe (only read).
+// oa_tag - cleared, then filled with the tag text.
+//
+// Returns 0 on success, non-zero if any append to oa_tag failed.
+//
+// NOTE(review): the first field is written from bam_get_qname(), i.e. the
+// read name, not the reference name the format calls for.  Resolving RNAME
+// needs the header, which this function is not given - confirm and fix at
+// the call site.
+static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
+    char strand;
+    uint8_t *nm_tag, *old_oa_tag;
+    uint32_t *cigar;
+    int64_t nm = 0;
+    int i, res = 0;
+
+    ks_clear(oa_tag);
+
+    // if there is an existing OA tag the new one gets appended to it
+    if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
+        res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0;
+    }
+
+    if (orig->core.flag & BAM_FREVERSE)
+        strand = '-';
+    else
+        strand = '+';
+
+    if ((nm_tag = bam_aux_get(orig, "NM"))) {
+        nm = bam_aux2i(nm_tag);
+    }
+
+    // POS is written 1-based
+    res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
+
+    for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
+        res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
+        res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
+    }
+
+    if (nm_tag) {
+        res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
+    } else {
+        // The leading comma separates CIGAR from MAPQ, as in the NM branch
+        res |= ksprintf(oa_tag, ",%d,;", orig->core.qual) < 0;
+    }
+
+    return res;
+}
+
+
+static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
+                    clipping_type clipping, cl_param_t *param) {
+    int ret = 1, r, file_open = 0;
+
+    bam_hdr_t *header = NULL;
+    bam1_t *b = NULL, *b_tmp = NULL;
+    long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
+    long filtered = 0, written = 0, failed = 0;
+    kstring_t str = KS_INITIALIZE;
+    kstring_t oat = KS_INITIALIZE;
+    bed_entry_list_t *sites;
+    FILE *stats_fp = stderr;
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+        fprintf(stderr, "[ampliconclip] error: unable to load bed file.\n");
+        goto fail;
+    }
+
+    if ((header = sam_hdr_read(in)) == NULL) {
+        fprintf(stderr, "[ampliconclip] error: could not read header\n");
+        goto fail;
+    }
+
+    // changing pos can ruin coordinate sort order
+    if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
+        const char *new_order = "unknown";
+
+        if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
+            fprintf(stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
+            goto fail;
+        }
+    }
+
+    ks_free(&str);
+
+    if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
+                        param->arg_list ? "CL" : NULL,
+                        param->arg_list ? param->arg_list : NULL,
+                        NULL) != 0) {
+        fprintf(stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
+    }
+    if (sam_hdr_write(out, header) < 0) {
+        fprintf(stderr, "[ampliconclip] error: could not write header.\n");
+        goto fail;
+    }
+
+    if (reject) {
+       if (sam_hdr_write(reject, header) < 0) {
+           fprintf(stderr, "[ampliconclip] error: could not write header to rejects file.\n");
+           goto fail;
+       }
+    }
+
+    b = bam_init1();
+    b_tmp = bam_init1();
+    if (!b || !b_tmp) {
+        fprintf(stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
+        goto fail;
+    }
+
+    int32_t last_tid = -1;
+    int ref_found = 0;
+
+    while ((r = sam_read1(in, header, b)) >= 0) {
+        hts_pos_t pos;
+        int is_rev;
+        int p_size;
+        int been_clipped  = 0, filter = 0;
+        int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
+        khiter_t itr;
+
+        l_count++;
+
+        if (b->core.tid != last_tid) {
+            const char *ref_name;
+
+            ref_found = 0;
+            last_tid = b->core.tid;
+
+            if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
+                itr = kh_get(bed_list_hash, bed_hash, ref_name);
+
+                if (itr != kh_end(bed_hash)) {
+                    sites = &kh_val(bed_hash, itr);
+                    ref_found = 1;
+                }
+            }
+        }
+
+        if (!(b->core.flag & exclude) && ref_found) {
+            if (param->oa_tag)
+                if (tag_original_data(b, &oat))
+                    goto fail;
+
+            if (!param->both) {
+                if (bam_is_rev(b)) {
+                    pos = bam_endpos(b);
+                    is_rev = 1;
+                } else {
+                    pos = b->core.pos;
+                    is_rev = 0;
+                }
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (is_rev) {
+                        if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        r_count++;
+                    } else {
+                        if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        f_count++;
+                    }
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        uint8_t *tag;
+
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+
+                    been_clipped = 1;
+                } else {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            } else {
+                int left = 0, right = 0;
+
+                // left first
+                pos = b->core.pos;
+                is_rev = 0;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    f_count++;
+                    left = 1;
+                    been_clipped = 1;
+                }
+
+                // the right
+                pos = bam_endpos(b);
+                is_rev = 1;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    r_count++;
+                    right = 1;
+                    been_clipped = 1;
+                }
+
+                if (left || right) {
+                    uint8_t *tag;
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+                }
+
+                if (left && right) {
+                    b_count++;
+                } else if (!left && !right) {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            }
+
+            if (param->fail_len >= 0 || param->filter_len >= 0) {
+               hts_pos_t aql = active_query_len(b);
+
+               if (param->fail_len >= 0 && aql <= param->fail_len) {
+                   b->core.flag |= BAM_FQCFAIL;
+               }
+
+               if (param->filter_len >= 0 && aql <= param->filter_len) {
+                   filter = 1;
+               }
+           }
+
+           if (b->core.flag & BAM_FQCFAIL) {
+               failed++;
+           }
+
+           if (param->write_clipped && !been_clipped) {
+               filter = 1;
+           }
+
+        } else {
+            l_exclude++;
+
+            if (param->unmapped) {
+                filter = 1;
+            }
+        }
+
+        if (!filter) {
+            if (sam_write1(out, header, b) < 0) {
+                fprintf(stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
+                goto fail;
+            }
+
+            written++;
+        } else {
+            if (reject) {
+                if (sam_write1(reject, header, b) < 0) {
+                    fprintf(stderr, "[ampliconclip] error: could not write to reject file %s\n",
+                            param->rejects_file);
+                    goto fail;
+                }
+            }
+
+            filtered++;
+        }
+    }
+
+    if (r < -1) {
+        fprintf(stderr, "[ampliconclip] error: failed to read input.\n");
+        goto fail;
+    }
+
+    if (param->stats_file) {
+        if ((stats_fp = fopen(param->stats_file, "w")) == NULL) {
+            fprintf(stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
+        } else {
+            file_open = 1;
+        }
+    }
+
+    fprintf(stats_fp, "COMMAND: %s\n"
+                    "TOTAL READS: %ld\n"
+                    "TOTAL CLIPPED: %ld\n"
+                    "FORWARD CLIPPED: %ld\n"
+                    "REVERSE CLIPPED: %ld\n"
+                    "BOTH CLIPPED: %ld\n"
+                    "NOT CLIPPED: %ld\n"
+                    "EXCLUDED: %ld\n"
+                    "FILTERED: %ld\n"
+                    "FAILED: %ld\n"
+                    "WRITTEN: %ld\n", param->arg_list, l_count, f_count + r_count,
+                                    f_count, r_count, b_count, n_count, l_exclude,
+                                    filtered, failed, written);
+
+    if (file_open) {
+        fclose(stats_fp);
+    }
+
+    ret = 0;
+
+fail:
+    destroy_bed_hash(bed_hash);
+    ks_free(&oat);
+    sam_hdr_destroy(header);
+    bam_destroy1(b);
+    bam_destroy1(b_tmp);
+    return ret;
+}
+
+
+/* Write the ampliconclip synopsis, option summary and description to stderr. */
+static void usage(void) {
+    /* Everything printed ahead of the shared global-option help. */
+    static const char *const option_text =
+        "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n"
+        "Option: \n"
+        " -b  FILE            BED file of regions (eg amplicon primers) to be removed.\n"
+        " -o  FILE            output file name (default stdout).\n"
+        " -f  FILE            write stats to file name (default stderr)\n"
+        " -u                  Output uncompressed data\n"
+        " --soft-clip         soft clip amplicon primers from reads (default)\n"
+        " --hard-clip         hard clip amplicon primers from reads.\n"
+        " --both-ends         clip on both 5' and 3' ends.\n"
+        " --strand            use strand data from BED file to match read direction.\n"
+        " --clipped           only output clipped reads.\n"
+        " --fail              mark unclipped, mapped reads as QCFAIL.\n"
+        " --filter-len INT    do not output reads INT size or shorter.\n"
+        " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n"
+        " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n"
+        " --rejects-file FILE file to write filtered reads.\n"
+        " --original          for clipped entries add an OA tag with original data.\n"
+        " --keep-tag          for clipped entries keep the old NM and MD tags.\n"
+        " --tolerance         match region within this number of bases, default 5.\n"
+        " --no-PG             do not add an @PG line.\n";
+
+    /* Closing description, printed after the global options. */
+    static const char *const about_text =
+        "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
+        "Default clipping is only on the 5' end.\n\n";
+
+    fputs(option_text, stderr);
+    sam_global_opt_help(stderr, "-.O..@-.");
+    fputs(about_text, stderr);
+}
+
+
+/*
+ * Entry point for "samtools ampliconclip".
+ *
+ * Parses the command line, opens the input, the output and (optionally) the
+ * rejects file, attaches a shared thread pool when threads were requested,
+ * and hands the actual work to bam_clip().
+ *
+ * Returns bam_clip()'s result, or 1 on a usage error, on failure to open a
+ * file or create the thread pool, or on failure to close an output file.
+ * Every handle opened here is released on all paths after the first open.
+ */
+int amplicon_clip_main(int argc, char **argv) {
+    int c, ret = 1;
+    char wmode[4] = {'w', 'b', 0, 0};
+    char *bedfile = NULL, *fnout = "-";
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool p = {NULL, 0};
+    samFile *in = NULL, *out = NULL, *reject = NULL;
+    clipping_type clipping = soft_clip;
+    // positional initialiser: values follow the cl_param_t field order
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+        {"no-PG", no_argument, NULL, 1002},
+        {"soft-clip", no_argument, NULL, 1003},
+        {"hard-clip", no_argument, NULL, 1004},
+        {"strand", no_argument, NULL, 1005},
+        {"clipped", no_argument, NULL, 1006},
+        {"fail", no_argument, NULL, 1007},
+        {"both-ends", no_argument, NULL, 1008},
+        {"filter-len", required_argument, NULL, 1009},
+        {"fail-len", required_argument, NULL, 1010},
+        {"no-excluded", no_argument, NULL, 1011},
+        {"rejects-file", required_argument, NULL, 1012},
+        {"original", no_argument, NULL, 1013},
+        {"keep-tag", no_argument, NULL, 1014},
+        {"tolerance", required_argument, NULL, 1015},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
+        switch (c) {
+            case 'b': bedfile = optarg; break;
+            case 'o': fnout = optarg; break;
+            case 'f': param.stats_file = optarg; break;
+            case 'u': wmode[2] = '0'; break;
+            case 1002: param.add_pg = 0; break;
+            case 1003: clipping = soft_clip; break;
+            case 1004: clipping = hard_clip; break;
+            case 1005: param.use_strand = 1; break;
+            case 1006: param.write_clipped = 1; break;
+            case 1007: param.mark_fail = 1; break;
+            case 1008: param.both = 1; break;
+            case 1009: param.filter_len = atoi(optarg); break;
+            case 1010: param.fail_len = atoi(optarg); break;
+            case 1011: param.unmapped = 1; break;
+            case 1012: param.rejects_file = optarg; break;
+            case 1013: param.oa_tag = 1; break;
+            case 1014: param.del_tag = 0; break;
+            case 1015: param.tol = atoi(optarg); break;
+            default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                      /* else fall-through */
+            case '?': usage(); exit(1);
+        }
+    }
+
+    // a BED file and one positional argument (the input file) are mandatory
+    if (!bedfile) {
+        usage();
+        return 1;
+    }
+
+    if (optind + 1 > argc) {
+        usage();
+        return 1;
+    }
+
+    if (param.tol < 0) {
+        fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d,"
+                        " reseting tolerance to default of 5.\n", param.tol);
+        param.tol = 5;
+    }
+
+    if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open input file");
+        goto cleanup;
+    }
+
+    // NOTE(review): this writes into wmode+1 after -u stored '0' in wmode[2];
+    // confirm the uncompressed request survives for named output files.
+    sam_open_mode(wmode+1, fnout, NULL);
+
+    if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open output file");
+        goto cleanup;
+    }
+
+    if (param.rejects_file) {
+        sam_open_mode(wmode+1, param.rejects_file, NULL);
+
+        if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
+            print_error_errno("ampliconclip", "cannot open rejects file");
+            goto cleanup;
+        }
+    }
+
+    if (ga.nthreads > 0) {
+        if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+            fprintf(stderr, "[ampliconclip] error: cannot create thread pool.\n");
+            goto cleanup;
+        }
+        hts_set_opt(in,  HTS_OPT_THREAD_POOL, &p);
+        hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+
+        if (reject) {
+           hts_set_opt(reject,  HTS_OPT_THREAD_POOL, &p);
+        }
+    }
+
+    param.arg_list = stringify_argv(argc + 1, argv - 1);
+
+    ret = bam_clip(in, out, reject, bedfile, clipping, &param);
+
+cleanup:
+    // files are closed before the pool they may be using is destroyed
+    if (in) sam_close(in);
+
+    if (out && sam_close(out) < 0) {
+        // report the real output name; argv[optind+1] is not the output file
+        // and is NULL when the input is the last argument
+        fprintf(stderr, "[ampliconclip] error: error while closing output file %s.\n", fnout);
+        ret = 1;
+    }
+
+    if (reject && sam_close(reject) < 0) {
+        fprintf(stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
+        ret = 1;
+    }
+
+    if (p.pool) hts_tpool_destroy(p.pool);
+
+    sam_global_args_free(&ga);
+    free(param.arg_list);   // still NULL if we bailed out before stringify_argv
+
+    return ret;
+}
+
diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c

new file mode 100644 (file)

index 0000000..3b2ed29
--- /dev/null
+++ b/samtools/bam_ampliconclip.c.pysam.c
@@ -0,0 +1,1081 @@
+#include "samtools.pysam.h"
+
+/*  bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
+                          from the 5' end.
+
+    Copyright (C) 2020-2021 Genome Research Ltd.
+
+    Authors: Andrew Whitwham <aw7@sanger.ac.uk>
+             Rob Davies <rmd+git@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "htslib/thread_pool.h"
+#include "sam_opts.h"
+#include <htslib/hts.h>
+#include "htslib/hfile.h"
+#include "htslib/kstring.h"
+#include "htslib/sam.h"
+#include "samtools.h"
+#include "bam_ampliconclip.h"
+
+/* How removed bases are represented in a trimmed record's CIGAR. */
+typedef enum {
+    soft_clip,  // removed bases become an S operation; SEQ and QUAL are kept in full
+    hard_clip   // removed bases become an H operation and are dropped from SEQ and QUAL
+} clipping_type;
+
+/*
+ * Run-time options for ampliconclip.
+ * amplicon_clip_main() fills this in with a positional initialiser, so the
+ * order of the fields below must not change without updating it.
+ */
+typedef struct {
+    int add_pg;          // add an @PG header line (cleared by --no-PG)
+    int use_strand;      // read strand from the BED file and match it to read direction (--strand)
+    int write_clipped;   // filter out reads that were not clipped (--clipped)
+    int mark_fail;       // flag reads that were not clipped as QCFAIL (--fail)
+    int both;            // test both ends of each read for clipping (--both-ends)
+    int fail_len;        // QCFAIL reads whose active query length is <= this; negative disables (--fail-len)
+    int filter_len;      // filter reads whose active query length is <= this; negative disables (--filter-len)
+    int unmapped;        // filter out excluded (unmapped or QCFAIL) reads (--no-excluded)
+    int oa_tag;          // add an OA tag holding the original alignment to clipped reads (--original)
+    int del_tag;         // remove NM and MD tags from clipped reads (cleared by --keep-tag)
+    int tol;             // tolerance, in bases, when matching a read end to a region (--tolerance)
+    char *arg_list;      // command line string, used for the @PG CL field and the stats COMMAND line
+    char *stats_file;    // file to write stats to (-f); NULL leaves stats on the default stream
+    char *rejects_file;  // file receiving filtered reads (--rejects-file); NULL for none
+} cl_param_t;
+
+
+/* qsort() comparator ordering bed entries by ascending right coordinate. */
+static int bed_entry_sort(const void *av, const void *bv) {
+    bed_entry_t *lhs = (bed_entry_t *) av;
+    bed_entry_t *rhs = (bed_entry_t *) bv;
+
+    if (lhs->right < rhs->right)
+        return -1;
+
+    if (lhs->right == rhs->right)
+        return 0;
+
+    return 1;
+}
+
+
+/*
+ * Load a BED file into a hash of per-reference region lists.
+ *
+ *   infile       path of the BED file, opened with hopen()
+ *   get_strand   non-zero: also parse the strand column ('+' or '-' required)
+ *   sort_by_pos  non-zero: sort each reference's list by right coordinate
+ *   bed_lists    hash to fill; keys are strdup()ed reference names
+ *
+ * Empty lines and lines starting with '#', "track " or "browser " are
+ * skipped.  Returns 0 when at least one reference was stored, 1 on any
+ * error or when nothing was stored.  Entries added before an error stay in
+ * bed_lists, so the caller must still release the hash.
+ */
+int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) {
+    hFILE *fp;
+    int line_count = 0, ret;
+    int64_t left, right;
+    kstring_t line = KS_INITIALIZE;
+    bed_entry_list_t *list;
+    khiter_t bed_itr;
+
+    if ((fp = hopen(infile, "r")) == NULL) {
+        print_error_errno("amplicon", "unable to open file %s.", infile);
+        return 1;
+    }
+
+    char ref[1024];
+
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        line_count++;
+        int hret;
+        char strand;
+
+        if (line.l == 0 || *line.s == '#') continue;
+        if (strncmp(line.s, "track ", 6) == 0) continue;
+        if (strncmp(line.s, "browser ", 8) == 0) continue;
+
+        if (get_strand) {
+            // columns 4 and 5 are skipped; column 6 holds the strand
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c",
+                       ref, &left, &right, &strand) != 4) {
+                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s.\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                                    line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        } else {
+            if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64,
+                       ref, &left, &right) != 3) {
+                fprintf(samtools_stderr, "[amplicon] error: bad bed file format in line %d of %s\n"
+                                "(N.B. ref/chrom name limited to 1023 characters.)\n",
+                                    line_count, infile);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        bed_itr = kh_get(bed_list_hash, bed_lists, ref);
+
+        if (bed_itr == kh_end(bed_lists)) { // new ref entry
+            char *ref_name = strdup(ref); // need a copy for the hash key
+
+            if (!ref_name) {
+                fprintf(samtools_stderr, "[amplicon] error: unable to allocate memory for ref name.\n");
+                ret = 1;
+                goto error;
+            }
+
+            bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret);
+
+            if (hret > 0) {
+                list = &kh_val(bed_lists, bed_itr);
+
+                // initialise the new hash entry
+                list->longest = 0;
+                list->size = 0;
+                list->length = 0;
+                list->bp = NULL;
+            } else {
+                // the key was not inserted, so the hash does not own it
+                free(ref_name);
+                fprintf(samtools_stderr, "[amplicon] error: ref hashing failure.\n");
+                ret = 1;
+                goto error;
+            }
+        } else { // existing ref
+            list = &kh_val(bed_lists, bed_itr);
+        }
+
+        // grow the entry array by half its size plus 256 when it is full
+        if (list->length == list->size) {
+           bed_entry_t *tmp;
+
+           list->size += list->size / 2 + 256;
+
+           if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) {
+               fprintf(samtools_stderr, "[amplicon] error: unable to allocate more memory for bed data.\n");
+               ret = 1;
+               goto error;
+           }
+
+           list->bp = tmp;
+        }
+
+        list->bp[list->length].left  = left;
+        list->bp[list->length].right = right;
+
+        if (get_strand) {
+            if (strand == '+') {
+                list->bp[list->length].rev = 0;
+            } else if (strand == '-') {
+                list->bp[list->length].rev = 1;
+            } else {
+                fprintf(samtools_stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n",
+                            line_count, strand);
+                ret = 1;
+                goto error;
+            }
+        }
+
+        // remember the widest region seen for this reference
+        if (right - left > list->longest)
+            list->longest = right - left;
+
+        list->length++;
+    }
+
+    if (sort_by_pos) {
+        for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) {
+            if (kh_exist(bed_lists, bed_itr)) {
+                list = &kh_val(bed_lists, bed_itr);
+                qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort);
+            }
+        }
+    }
+
+    if (kh_size(bed_lists) > 0) {// any entries
+        ret = 0;
+    } else {
+        ret = 1;
+    }
+
+error:
+    ks_free(&line);
+
+    if (hclose(fp) != 0) {
+        fprintf(samtools_stderr, "[amplicon] warning: failed to close %s\n", infile);
+    }
+
+    return ret;
+}
+
+
+/* Free every region array and key string stored in hash, then the hash itself. */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash) {
+    khiter_t slot = kh_begin(hash);
+
+    while (slot != kh_end(hash)) {
+        if (kh_exist(hash, slot)) {
+            // keys are heap copies made when the entry was created
+            free((char *)kh_key(hash, slot));
+            kh_key(hash, slot) = NULL;
+            free(kh_val(hash, slot).bp);
+        }
+
+        ++slot;
+    }
+
+    kh_destroy(bed_list_hash, hash);
+}
+
+
+/*
+ * Work out how many bases to clip from a read end located at pos.
+ *
+ *   sites       region list for the read's reference; the binary search
+ *               below relies on it being ordered by right coordinate
+ *               (bam_clip loads the list with sorting enabled)
+ *   pos         read end being tested: the start for a forward match, the
+ *               end position for a reverse match
+ *   is_rev      non-zero when testing the right-hand (reverse) end
+ *   use_strand  non-zero to skip regions whose strand differs from is_rev
+ *   longest     length of the longest region in sites
+ *   param       supplies the matching tolerance (param->tol)
+ *
+ * Returns the largest clip size over all regions containing pos (after the
+ * tolerance is applied), or 0 when no region matches.
+ */
+static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos,
+                              int is_rev, int use_strand, int64_t longest,
+                              cl_param_t *param) {
+    int i, size;  // may need this to be variable
+    int tol = param->tol;
+    int l = 0, mid = sites->length / 2, r = sites->length;
+    // kept as hts_pos_t: an int would truncate positions beyond 32 bits
+    hts_pos_t pos_tol = is_rev ? (pos > tol ? pos - tol : 0) : pos;
+
+    // binary search for a starting index whose right edge is <= pos_tol
+    while (r - l > 1) {
+        if (sites->bp[mid].right <= pos_tol) {
+            l = mid;
+        } else {
+            r = mid;
+        }
+        mid = (l + r) / 2;
+    }
+
+    size = 0;
+
+    for (i = l; i < sites->length; i++) {
+        hts_pos_t mod_left, mod_right;
+
+        if (use_strand && is_rev != sites->bp[i].rev)
+            continue;
+
+        // widen the region by the tolerance on the side the read end sits
+        if (is_rev) {
+            mod_left = sites->bp[i].left;
+            mod_right = sites->bp[i].right + tol;
+        } else {
+            if (sites->bp[i].left > tol) {
+                mod_left = sites->bp[i].left - tol;
+            } else {
+                mod_left = 0;
+            }
+            mod_right = sites->bp[i].right;
+        }
+
+        // stop once the right edge lies more than longest + tol beyond pos
+        if (pos + longest + tol < mod_right)
+            break;
+
+        if (pos >= mod_left && pos <= mod_right) {
+            // keep the largest clip: distance from pos to the far region edge
+            if (is_rev) {
+                if (size < pos - sites->bp[i].left) {
+                    size = pos - sites->bp[i].left;
+                }
+            } else {
+                if (size < sites->bp[i].right - pos) {
+                    size = sites->bp[i].right - pos;
+                }
+            }
+        }
+    }
+
+    return size;
+}
+
+
+/*
+ * Build in rec_out a copy of rec with bases reference bases clipped from
+ * its left-hand end.
+ *
+ *   rec       record to trim (not modified)
+ *   rec_out   record receiving the result; its data buffer is grown if needed
+ *   bases     number of reference bases to remove
+ *   clipping  soft_clip keeps SEQ/QUAL and adds an S operation; hard_clip
+ *             drops the removed bases and adds an H operation
+ *
+ * The new record's pos is advanced past the removed reference bases.
+ * Returns 0 on success, 1 if memory for the new record cannot be allocated.
+ */
+static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                         clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    uint32_t i, j;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    hts_pos_t new_pos = rec->core.pos;
+    uint32_t cig_type, cig_op;
+
+    // 8 spare bytes: the new CIGAR can gain up to two 32-bit clip operations
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    // Hard clipping at least the whole read: keep only the name and aux data
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+        return 0;
+    }
+
+    // Modify CIGAR
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Consume whole operations from the left until ref_remove is used up.
+    // Type bit 2 is treated as reference-consuming, bit 1 as query-consuming.
+    for (i = 0;  i < rec->core.n_cigar; i++) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;
+                }
+                new_pos += bam_cigar_oplen(orig_cigar[i]);
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    if (i < rec->core.n_cigar) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+
+        // account for the last operation
+        if (cig_type & 2) {
+            new_pos += ref_remove;
+        }
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+    } else {
+        // every operation was consumed: the whole query is removed
+        qry_removed = rec->core.l_qseq;
+    }
+
+    // Emit the leading clip operation(s)
+    j = 0;
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j++] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j++] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+        }
+        if (qry_removed > 0) {
+            new_cigar[j++] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+        }
+    }
+
+    // Shortened remainder of the operation the scan stopped in
+    if (i < rec->core.n_cigar
+        && bam_cigar_oplen(orig_cigar[i]) > ref_remove) {
+        new_cigar[j++] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+
+        // fill in the rest of the cigar
+        i++;
+
+        for (; i < rec->core.n_cigar; i++) {
+            new_cigar[j++] = orig_cigar[i];
+        }
+    }
+
+    rec_out->core.n_cigar = j;
+
+    if (clipping == soft_clip) {
+        qry_removed = 0; // Copy all the sequence and confidence values
+    }
+
+    // SEQ holds two bases per byte, so the kept bases need (n + 1) / 2 bytes
+    new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+    // Copy remaining SEQ
+    if ((qry_removed & 1) == 0) {
+        // Byte-aligned start.  Round the byte count up so that the final
+        // base of an odd-length remainder is copied as well; this matches
+        // the new_qual offset computed above.
+        memcpy(bam_get_seq(rec_out), orig_seq + (qry_removed / 2),
+                (rec->core.l_qseq - qry_removed + 1) / 2);
+    } else {
+        // Odd start: shift every base up by one nibble while copying
+        uint8_t *in = orig_seq + qry_removed / 2;
+        uint8_t *out = bam_get_seq(rec_out);
+        uint32_t i;
+        for (i = qry_removed; i < rec->core.l_qseq - 1; i += 2) {
+            *out++ = ((in[0] & 0x0f) << 4) | ((in[1] & 0xf0) >> 4);
+            in++;
+        }
+        if (i < rec->core.l_qseq) {
+            *out++ = (in[0] & 0x0f) << 4;
+        }
+        assert(out == new_qual);
+    }
+
+    // Copy remaining QUAL
+    memmove(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Move AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    // put in new pos
+    rec_out->core.pos = new_pos;
+
+    return 0;
+}
+
+
+/*
+ * Build in rec_out a copy of rec with bases reference bases clipped from
+ * its right-hand end.
+ *
+ *   rec       record to trim (not modified)
+ *   rec_out   record receiving the result; its data buffer is grown if needed
+ *   bases     number of reference bases to remove
+ *   clipping  soft_clip keeps SEQ/QUAL and adds an S operation; hard_clip
+ *             drops the removed bases and adds an H operation
+ *
+ * Unlike the left-hand trim, pos is copied unchanged from rec.
+ * Returns 0 on success, 1 if memory for the new record cannot be allocated.
+ */
+static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases,
+                          clipping_type clipping) {
+    uint32_t *orig_cigar = bam_get_cigar(rec);
+    uint8_t *orig_seq = bam_get_seq(rec);
+    uint8_t *orig_qual = bam_get_qual(rec);
+    uint8_t *orig_aux = bam_get_aux(rec);
+    uint32_t *new_cigar;
+    uint32_t new_n_cigar = 0;
+    uint8_t *new_qual;
+    size_t orig_l_aux = bam_get_l_aux(rec);
+    int32_t i;
+    int32_t j;
+    uint32_t ref_remove = bases, qry_removed = 0, hardclip = 0;
+    uint32_t cig_type, cig_op;
+
+    // 8 spare bytes: the new CIGAR can gain up to two 32-bit clip operations
+    if (rec->l_data + 8 > rec_out->m_data) {
+        uint8_t *new_data = realloc(rec_out->data, rec->l_data + 8);
+        if (!new_data) {
+            fprintf(samtools_stderr, "[ampliconclip] error: could not allocate memoy for new bam record\n");
+            return 1;
+        }
+        rec_out->data = new_data;
+        rec_out->m_data = rec->l_data + 8;
+    }
+
+    // Copy core data & name
+    memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
+    memcpy(rec_out->data, rec->data, rec->core.l_qname);
+
+    // Hard clipping at least the whole read: keep only the name and aux data
+    if (clipping == hard_clip && bases >= rec->core.l_qseq) {
+        rec_out->core.l_qseq = 0;
+        rec_out->core.n_cigar = 0;
+
+        if (orig_l_aux)
+            memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+        rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+        return 0;
+    }
+
+    // Modify CIGAR here
+    new_cigar = bam_get_cigar(rec_out);
+
+    // Walk the operations from the right-hand end, consuming whole ones
+    // until ref_remove is used up.  Type bit 2 is treated as
+    // reference-consuming, bit 1 as query-consuming.
+    for (i = rec->core.n_cigar - 1;  i >= 0; --i) {
+        cig_op = bam_cigar_op(orig_cigar[i]);
+        cig_type = bam_cigar_type(cig_op);
+
+        if (cig_op == BAM_CHARD_CLIP) {
+            hardclip += bam_cigar_oplen(orig_cigar[i]);
+        } else {
+            if (cig_type & 2) {
+                if (bam_cigar_oplen(orig_cigar[i]) <= ref_remove) {
+                    ref_remove -= bam_cigar_oplen(orig_cigar[i]);
+                } else {
+                    break;
+                }
+            }
+            if (cig_type & 1) {
+                qry_removed += bam_cigar_oplen(orig_cigar[i]);
+            }
+        }
+    }
+
+    // j becomes the index of the last slot of the new CIGAR: the position
+    // of the partially kept operation plus one slot per clip op to append.
+    if (i >= 0) {
+        cig_type = bam_cigar_type(bam_cigar_op(orig_cigar[i]));
+        if (cig_type & 1) {
+            qry_removed += ref_remove;
+        }
+        j = i;
+        if (qry_removed > 0) j++;
+        if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
+    } else {
+        // every operation was consumed: the whole query is removed
+        qry_removed = rec->core.l_qseq;
+        j = 0;
+        if (hardclip > 0 && clipping == soft_clip) j++;
+    }
+
+    // Write the trailing clip operation(s), last slot first
+    if (clipping == hard_clip && hardclip + qry_removed > 0) {
+        new_cigar[j] = bam_cigar_gen(hardclip + qry_removed, BAM_CHARD_CLIP);
+        new_n_cigar++;
+    }
+    if (clipping == soft_clip) {
+        if (hardclip > 0) {
+            new_cigar[j] = bam_cigar_gen(hardclip, BAM_CHARD_CLIP);
+            new_n_cigar++;
+            // step back so the soft clip lands just before the hard clip
+            if (qry_removed > 0) --j;
+        }
+        if (qry_removed > 0) {
+            new_cigar[j] = bam_cigar_gen(qry_removed, BAM_CSOFT_CLIP);
+            new_n_cigar++;
+        }
+    }
+
+    // Shortened remainder of the operation the scan stopped in
+    if (j > 0) {
+        new_cigar[--j] = bam_cigar_gen(bam_cigar_oplen(orig_cigar[i]) - ref_remove, bam_cigar_op(orig_cigar[i]));
+        new_n_cigar++;
+    }
+
+    // fill in the rest of the cigar
+    while (j > 0) {
+        new_cigar[--j] = orig_cigar[--i];
+        new_n_cigar++;
+    }
+
+    rec_out->core.n_cigar = new_n_cigar;
+
+    if (clipping == soft_clip)
+        qry_removed = 0; // Copy all the sequence and confidence values
+
+    // SEQ holds two bases per byte, so the kept bases need (n + 1) / 2 bytes
+    new_qual = bam_get_seq(rec_out) + (rec->core.l_qseq - qry_removed + 1) / 2;
+    // Copy remaining SEQ
+    memcpy(bam_get_seq(rec_out), orig_seq, (rec->core.l_qseq - qry_removed + 1) / 2);
+
+    // Copy remaining QUAL
+    memcpy(new_qual, orig_qual, rec->core.l_qseq - qry_removed);
+
+    // Set new l_qseq
+    rec_out->core.l_qseq -= qry_removed;
+
+    // Copy AUX
+    if (orig_l_aux)
+        memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+    // Set new l_data
+    rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+    return 0;
+}
+
+
+/* Total length of the query-consuming CIGAR operations of b, soft clips
+   excluded. */
+static hts_pos_t active_query_len(bam1_t *b) {
+    uint32_t *ops = bam_get_cigar(b);
+    hts_pos_t total = 0;
+    int k = 0;
+
+    while (k < b->core.n_cigar) {
+        uint32_t op = bam_cigar_op(ops[k]);
+        uint32_t op_len = bam_cigar_oplen(ops[k]);
+
+        k++;
+
+        // soft-clipped bases are still in SEQ but do not count as active
+        if (op == BAM_CSOFT_CLIP)
+            continue;
+
+        if (bam_cigar_type(op) & 1)
+            total += op_len;
+    }
+
+    return total;
+}
+
+
+/* Exchange the two record pointers so that *a and *b trade places. */
+static inline void swap_bams(bam1_t **a, bam1_t **b) {
+    bam1_t *held = *b;
+
+    *b = *a;
+    *a = held;
+}
+
+
+// Format OA:Z:(RNAME,POS,strand,CIGAR,MAPQ,NM;
+//
+// Builds, in oa_tag, the OA tag text describing the alignment of orig before
+// it is clipped.  oa_tag is cleared first; the text of any OA tag already on
+// the record is copied in ahead of the new entry.  The NM field is left
+// empty when the record has no NM tag.
+//
+// Returns 0 on success, non-zero if appending to oa_tag failed.
+//
+// NOTE(review): the first field is written from bam_get_qname() (the read
+// name) although the format above calls for RNAME; using the reference name
+// needs the header, which this signature does not receive.
+static inline int tag_original_data(bam1_t *orig, kstring_t *oa_tag) {
+    char strand;
+    uint8_t *nm_tag, *old_oa_tag;
+    uint32_t *cigar;
+    int64_t nm = 0;
+    int i, res = 0;
+
+    ks_clear(oa_tag);
+
+    // if there is an existing OA tag the new one gets appended to it
+    if ((old_oa_tag = bam_aux_get(orig, "OA"))) {
+        res |= ksprintf(oa_tag, "%s", bam_aux2Z(old_oa_tag)) < 0;
+    }
+
+    if (orig->core.flag & BAM_FREVERSE)
+        strand = '-';
+    else
+        strand = '+';
+
+    if ((nm_tag = bam_aux_get(orig, "NM"))) {
+        nm = bam_aux2i(nm_tag);
+    }
+
+    // pos is stored 0-based; the tag carries a 1-based position
+    res |= ksprintf(oa_tag, "%s,%"PRIhts_pos",%c,", bam_get_qname(orig), orig->core.pos + 1, strand) < 0;
+
+    for (i = 0, cigar = bam_get_cigar(orig); i < orig->core.n_cigar && res == 0; ++i) {
+        res |= kputw(bam_cigar_oplen(cigar[i]), oa_tag) < 0;
+        res |= kputc(bam_cigar_opchr(cigar[i]), oa_tag) < 0;
+    }
+
+    if (nm_tag) {
+        res |= ksprintf(oa_tag, ",%d,%"PRId64";", orig->core.qual, nm) < 0;
+    } else {
+        // leading comma separates CIGAR from MAPQ, as in the NM branch
+        res |= ksprintf(oa_tag, ",%d,;", orig->core.qual) < 0;
+    }
+
+    return res;
+}
+
+
+/*
+ * Clip reads whose alignment start (or end) matches a site from the BED file.
+ *
+ * Reads every record from `in`, trims matching records with bam_trim_left() /
+ * bam_trim_right() using the requested `clipping` type, and writes each record
+ * either to `out` or, when it has been filtered, to `reject` (if non-NULL).
+ * A summary of the counters is written to param->stats_file, or to
+ * samtools_stderr when no stats file was given or it cannot be opened.
+ *
+ *   in, out   open input / output files
+ *   reject    optional file for filtered records, may be NULL
+ *   bedfile   name of the BED file passed to load_bed_file_multi_ref()
+ *   clipping  clipping type handed through to the trim functions
+ *   param     run-time options (see cl_param_t)
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile,
+                    clipping_type clipping, cl_param_t *param) {
+    int ret = 1, r, file_open = 0;
+
+    bam_hdr_t *header = NULL;
+    bam1_t *b = NULL, *b_tmp = NULL;
+    long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0;
+    long filtered = 0, written = 0, failed = 0;
+    kstring_t str = KS_INITIALIZE;
+    kstring_t oat = KS_INITIALIZE;
+    bed_entry_list_t *sites = NULL;   // only valid while ref_found is set
+    FILE *stats_fp = samtools_stderr;
+    khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash);
+    int32_t last_tid = -1;
+    int ref_found = 0;
+
+    // nothing else has been allocated yet, so a plain return is enough
+    if (!bed_hash) {
+        fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create BED hash.\n");
+        return 1;
+    }
+
+    if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) {
+        fprintf(samtools_stderr, "[ampliconclip] error: unable to load bed file.\n");
+        goto fail;
+    }
+
+    if ((header = sam_hdr_read(in)) == NULL) {
+        fprintf(samtools_stderr, "[ampliconclip] error: could not read header\n");
+        goto fail;
+    }
+
+    // changing pos can ruin coordinate sort order
+    if (sam_hdr_find_tag_hd(header, "SO", &str) == 0 && str.s && strcmp(str.s, "coordinate") == 0) {
+        const char *new_order = "unknown";
+
+        if (sam_hdr_update_hd(header, "SO", new_order) == -1) {
+            fprintf(samtools_stderr, "[ampliconclip] error: unable to change sort order to 'SO:%s'\n", new_order);
+            goto fail;
+        }
+    }
+
+    ks_free(&str);
+
+    // failing to add the @PG line is only reported, it does not stop the run
+    if (param->add_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(),
+                        param->arg_list ? "CL" : NULL,
+                        param->arg_list ? param->arg_list : NULL,
+                        NULL) != 0) {
+        fprintf(samtools_stderr, "[ampliconclip] warning: unable to add @PG line to header.\n");
+    }
+    if (sam_hdr_write(out, header) < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] error: could not write header.\n");
+        goto fail;
+    }
+
+    if (reject) {
+        if (sam_hdr_write(reject, header) < 0) {
+            fprintf(samtools_stderr, "[ampliconclip] error: could not write header to rejects file.\n");
+            goto fail;
+        }
+    }
+
+    b = bam_init1();
+    b_tmp = bam_init1();
+    if (!b || !b_tmp) {
+        fprintf(samtools_stderr, "[ampliconclip] error: out of memory when trying to create record.\n");
+        goto fail;
+    }
+
+    while ((r = sam_read1(in, header, b)) >= 0) {
+        hts_pos_t pos;
+        int is_rev;
+        int p_size;
+        int been_clipped  = 0, filter = 0;
+        int exclude = (BAM_FUNMAP | BAM_FQCFAIL);
+        khiter_t itr;
+
+        l_count++;
+
+        // only look up the BED site list again when the reference changes
+        if (b->core.tid != last_tid) {
+            const char *ref_name;
+
+            ref_found = 0;
+            last_tid = b->core.tid;
+
+            if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) {
+                itr = kh_get(bed_list_hash, bed_hash, ref_name);
+
+                if (itr != kh_end(bed_hash)) {
+                    sites = &kh_val(bed_hash, itr);
+                    ref_found = 1;
+                }
+            }
+        }
+
+        if (!(b->core.flag & exclude) && ref_found) {
+            // capture the original alignment before any trimming changes it
+            if (param->oa_tag)
+                if (tag_original_data(b, &oat))
+                    goto fail;
+
+            if (!param->both) {
+                // single end mode: reverse reads are matched on their end
+                // position, forward reads on their start position
+                if (bam_is_rev(b)) {
+                    pos = bam_endpos(b);
+                    is_rev = 1;
+                } else {
+                    pos = b->core.pos;
+                    is_rev = 0;
+                }
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (is_rev) {
+                        if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        // the trimmed record was built in b_tmp, make it current
+                        swap_bams(&b, &b_tmp);
+                        r_count++;
+                    } else {
+                        if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                            goto fail;
+
+                        swap_bams(&b, &b_tmp);
+                        f_count++;
+                    }
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        uint8_t *tag;
+
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+
+                    been_clipped = 1;
+                } else {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            } else {
+                int left = 0, right = 0;
+
+                // left first
+                pos = b->core.pos;
+                is_rev = 0;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_left(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    f_count++;
+                    left = 1;
+                    been_clipped = 1;
+                }
+
+                // the right
+                pos = bam_endpos(b);
+                is_rev = 1;
+
+                if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) {
+                    if (bam_trim_right(b, b_tmp, p_size, clipping) != 0)
+                        goto fail;
+
+                    swap_bams(&b, &b_tmp);
+                    r_count++;
+                    right = 1;
+                    been_clipped = 1;
+                }
+
+                if (left || right) {
+                    uint8_t *tag;
+
+                    if (param->oa_tag) {
+                        if (bam_aux_update_str(b, "OA", oat.l + 1, (const char *)oat.s))
+                            goto fail;
+                    }
+
+                    if (param->del_tag) {
+                        if ((tag = bam_aux_get(b, "NM")))
+                            bam_aux_del(b, tag);
+
+                        if ((tag = bam_aux_get(b, "MD")))
+                            bam_aux_del(b, tag);
+                    }
+                }
+
+                if (left && right) {
+                    b_count++;
+                } else if (!left && !right) {
+                    if (param->mark_fail) {
+                        b->core.flag |= BAM_FQCFAIL;
+                    }
+
+                    n_count++;
+                }
+            }
+
+            // length based checks, a negative limit means the check is off
+            if (param->fail_len >= 0 || param->filter_len >= 0) {
+                hts_pos_t aql = active_query_len(b);
+
+                if (param->fail_len >= 0 && aql <= param->fail_len) {
+                    b->core.flag |= BAM_FQCFAIL;
+                }
+
+                if (param->filter_len >= 0 && aql <= param->filter_len) {
+                    filter = 1;
+                }
+            }
+
+            if (b->core.flag & BAM_FQCFAIL) {
+                failed++;
+            }
+
+            if (param->write_clipped && !been_clipped) {
+                filter = 1;
+            }
+
+        } else {
+            // unmapped, already QC failed or on a reference with no BED sites
+            l_exclude++;
+
+            if (param->unmapped) {
+                filter = 1;
+            }
+        }
+
+        if (!filter) {
+            if (sam_write1(out, header, b) < 0) {
+                fprintf(samtools_stderr, "[ampliconclip] error: could not write line %ld.\n", l_count);
+                goto fail;
+            }
+
+            written++;
+        } else {
+            if (reject) {
+                if (sam_write1(reject, header, b) < 0) {
+                    fprintf(samtools_stderr, "[ampliconclip] error: could not write to reject file %s\n",
+                            param->rejects_file);
+                    goto fail;
+                }
+            }
+
+            filtered++;
+        }
+    }
+
+    // -1 is the normal end of file, anything lower is a read error
+    if (r < -1) {
+        fprintf(samtools_stderr, "[ampliconclip] error: failed to read input.\n");
+        goto fail;
+    }
+
+    if (param->stats_file) {
+        if ((stats_fp = fopen(param->stats_file, "w")) == NULL) {
+            fprintf(samtools_stderr, "[ampliconclip] warning: cannot write stats to %s.\n", param->stats_file);
+            // fall back to the default stream so the statistics are not lost
+            stats_fp = samtools_stderr;
+        } else {
+            file_open = 1;
+        }
+    }
+
+    // arg_list may be NULL, never hand a NULL pointer to %s
+    fprintf(stats_fp, "COMMAND: %s\n"
+                    "TOTAL READS: %ld\n"
+                    "TOTAL CLIPPED: %ld\n"
+                    "FORWARD CLIPPED: %ld\n"
+                    "REVERSE CLIPPED: %ld\n"
+                    "BOTH CLIPPED: %ld\n"
+                    "NOT CLIPPED: %ld\n"
+                    "EXCLUDED: %ld\n"
+                    "FILTERED: %ld\n"
+                    "FAILED: %ld\n"
+                    "WRITTEN: %ld\n", param->arg_list ? param->arg_list : "",
+                                    l_count, f_count + r_count,
+                                    f_count, r_count, b_count, n_count, l_exclude,
+                                    filtered, failed, written);
+
+    if (file_open) {
+        fclose(stats_fp);
+    }
+
+    ret = 0;
+
+fail:
+    destroy_bed_hash(bed_hash);
+    ks_free(&str);    // still holds the SO value if the header update failed
+    ks_free(&oat);
+    sam_hdr_destroy(header);
+    bam_destroy1(b);
+    bam_destroy1(b_tmp);
+    return ret;
+}
+
+
+/*
+ * Print the ampliconclip help text to samtools_stderr: the command specific
+ * options first, then the shared global options, then the closing notes.
+ */
+static void usage(void) {
+    static const char option_text[] =
+        "Usage: samtools ampliconclip -b BED file <input.bam> -o <output.bam>\n\n"
+        "Option: \n"
+        " -b  FILE            BED file of regions (eg amplicon primers) to be removed.\n"
+        " -o  FILE            output file name (default samtools_stdout).\n"
+        " -f  FILE            write stats to file name (default samtools_stderr)\n"
+        " -u                  Output uncompressed data\n"
+        " --soft-clip         soft clip amplicon primers from reads (default)\n"
+        " --hard-clip         hard clip amplicon primers from reads.\n"
+        " --both-ends         clip on both 5' and 3' ends.\n"
+        " --strand            use strand data from BED file to match read direction.\n"
+        " --clipped           only output clipped reads.\n"
+        " --fail              mark unclipped, mapped reads as QCFAIL.\n"
+        " --filter-len INT    do not output reads INT size or shorter.\n"
+        " --fail-len   INT    mark as QCFAIL reads INT size or shorter.\n"
+        " --no-excluded       do not write excluded reads (unmapped or QCFAIL).\n"
+        " --rejects-file FILE file to write filtered reads.\n"
+        " --original          for clipped entries add an OA tag with original data.\n"
+        " --keep-tag          for clipped entries keep the old NM and MD tags.\n"
+        " --tolerance         match region within this number of bases, default 5.\n"
+        " --no-PG             do not add an @PG line.\n";
+    static const char about_text[] =
+        "\nAbout: Soft clips read alignments where they match BED file defined regions.\n"
+        "Default clipping is only on the 5' end.\n\n";
+
+    // the text is passed as an argument so it is never parsed as a format
+    fprintf(samtools_stderr, "%s", option_text);
+    sam_global_opt_help(samtools_stderr, "-.O..@-.");
+    fprintf(samtools_stderr, "%s", about_text);
+}
+
+
+/*
+ * Entry point for "samtools ampliconclip".
+ *
+ * Parses the command line, opens the input, output and optional rejects
+ * file, sets up a shared thread pool when threads were requested and then
+ * hands over to bam_clip().
+ *
+ * Returns 0 on success and 1 on failure.  An unrecognised option prints the
+ * usage text and leaves through samtools_exit(1).
+ */
+int amplicon_clip_main(int argc, char **argv) {
+    int c, ret = 1;   // assume failure until bam_clip() has been run
+    char wmode[4] = {'w', 'b', 0, 0};
+    char *bedfile = NULL, *fnout = "-";
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool p = {NULL, 0};
+    samFile *in = NULL, *out = NULL, *reject = NULL;
+    clipping_type clipping = soft_clip;
+    cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
+        {"no-PG", no_argument, NULL, 1002},
+        {"soft-clip", no_argument, NULL, 1003},
+        {"hard-clip", no_argument, NULL, 1004},
+        {"strand", no_argument, NULL, 1005},
+        {"clipped", no_argument, NULL, 1006},
+        {"fail", no_argument, NULL, 1007},
+        {"both-ends", no_argument, NULL, 1008},
+        {"filter-len", required_argument, NULL, 1009},
+        {"fail-len", required_argument, NULL, 1010},
+        {"no-excluded", no_argument, NULL, 1011},
+        {"rejects-file", required_argument, NULL, 1012},
+        {"original", no_argument, NULL, 1013},
+        {"keep-tag", no_argument, NULL, 1014},
+        {"tolerance", required_argument, NULL, 1015},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "b:@:o:O:f:u", lopts, NULL)) >= 0) {
+        switch (c) {
+            case 'b': bedfile = optarg; break;
+            case 'o': fnout = optarg; break;
+            case 'f': param.stats_file = optarg; break;
+            case 'u': wmode[2] = '0'; break;
+            case 1002: param.add_pg = 0; break;
+            case 1003: clipping = soft_clip; break;
+            case 1004: clipping = hard_clip; break;
+            case 1005: param.use_strand = 1; break;
+            case 1006: param.write_clipped = 1; break;
+            case 1007: param.mark_fail = 1; break;
+            case 1008: param.both = 1; break;
+            case 1009: param.filter_len = atoi(optarg); break;
+            case 1010: param.fail_len = atoi(optarg); break;
+            case 1011: param.unmapped = 1; break;
+            case 1012: param.rejects_file = optarg; break;
+            case 1013: param.oa_tag = 1; break;
+            case 1014: param.del_tag = 0; break;
+            case 1015: param.tol = atoi(optarg); break;
+            default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                      /* else fall-through */
+            case '?': usage(); samtools_exit(1);
+        }
+    }
+
+    // a BED file and one input file are both mandatory
+    if (!bedfile) {
+        usage();
+        goto cleanup;
+    }
+
+    if (optind + 1 > argc) {
+        usage();
+        goto cleanup;
+    }
+
+    if (param.tol < 0) {
+        fprintf(samtools_stderr, "[ampliconclip] warning: invalid tolerance of %d,"
+                        " reseting tolerance to default of 5.\n", param.tol);
+        param.tol = 5;
+    }
+
+    if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open input file");
+        goto cleanup;
+    }
+
+    sam_open_mode(wmode+1, fnout, NULL);
+
+    if ((out = sam_open_format(fnout, wmode, &ga.out)) == NULL) {
+        print_error_errno("ampliconclip", "cannot open output file");
+        goto cleanup;
+    }
+
+    if (param.rejects_file) {
+        sam_open_mode(wmode+1, param.rejects_file, NULL);
+
+        if ((reject = sam_open_format(param.rejects_file, wmode, &ga.out)) == NULL) {
+            print_error_errno("ampliconclip", "cannot open rejects file");
+            goto cleanup;
+        }
+    }
+
+    if (ga.nthreads > 0) {
+        if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+            fprintf(samtools_stderr, "[ampliconclip] error: cannot create thread pool.\n");
+            goto cleanup;
+        }
+        hts_set_opt(in,  HTS_OPT_THREAD_POOL, &p);
+        hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+
+        if (reject) {
+           hts_set_opt(reject,  HTS_OPT_THREAD_POOL, &p);
+        }
+    }
+
+    param.arg_list = stringify_argv(argc + 1, argv - 1);
+
+    ret = bam_clip(in, out, reject, bedfile, clipping, &param);
+
+cleanup:
+    // shared by the success path and every error path, so nothing that was
+    // opened above is leaked; files are closed before the pool is destroyed
+    if (in) sam_close(in);
+
+    if (out && sam_close(out) < 0) {
+        // report the name the output was opened with; argv[optind+1] is not
+        // the output file and is usually the NULL that terminates argv
+        fprintf(samtools_stderr, "[ampliconclip] error: error while closing output file %s.\n", fnout);
+        ret = 1;
+    }
+
+    if (reject) {
+        if (sam_close(reject) < 0) {
+            fprintf(samtools_stderr, "[ampliconclip] error: error while closing reject file %s.\n", param.rejects_file);
+            ret = 1;
+        }
+    }
+
+    if (p.pool) hts_tpool_destroy(p.pool);
+
+    sam_global_args_free(&ga);
+    free(param.arg_list);
+
+    return ret;
+}
+
diff --git a/samtools/bam_ampliconclip.h b/samtools/bam_ampliconclip.h

new file mode 100644 (file)

index 0000000..ef35357
--- /dev/null
+++ b/samtools/bam_ampliconclip.h
@@ -0,0 +1,54 @@
+/*  bam_ampliconclip.h -- shared functions between amplicon clip/stats
+
+    Copyright (C) 2020-2021 Genome Research Ltd.
+
+    Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef BAM_AMPLICONCLIP_H
+#define BAM_AMPLICONCLIP_H
+
+#include <stdint.h>
+
+#include "htslib/khash.h"
+
+/* A single BED entry.
+   NOTE(review): left/right look like the interval bounds and rev like a
+   reverse strand flag - confirm against load_bed_file_multi_ref(). */
+typedef struct {
+    int64_t left;
+    int64_t right;
+    int rev;
+} bed_entry_t;
+
+/* The BED entries held for one reference.
+   NOTE(review): length and size are presumably the number of entries used
+   and allocated in bp, and longest the longest entry - confirm. */
+typedef struct {
+    bed_entry_t *bp;
+    int64_t longest;
+    int length;
+    int size;
+} bed_entry_list_t;
+
+/* Hash of bed_entry_list_t values keyed by a string; ampliconclip looks
+   entries up by reference name. */
+KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t);
+
+/* Empty list initialiser, one value for each member of bed_entry_list_t. */
+#define BED_LIST_INIT {NULL, 0, 0, 0}
+
+
+/* Load the BED file infile into bed_lists.  Callers treat a non-zero return
+   value as failure. */
+int load_bed_file_multi_ref(char *infile, int get_strand,
+                        int sort_by_pos, khash_t(bed_list_hash) *bed_lists);
+
+/* Release a hash filled in by load_bed_file_multi_ref(). */
+void destroy_bed_hash(khash_t(bed_list_hash) *hash);
+
+
+#endif /* BAM_AMPLICONCLIP_H */
diff --git a/samtools/bam_aux.c b/samtools/bam_aux.c

index 4e222a0c401e516e7febf93c7c9981c40b257caa..77d94f8f3a262d721f4bfac5b0cb4c0642eea30f 100644 (file)
--- a/samtools/bam_aux.c
+++ b/samtools/bam_aux.c
@@ -50,13 +50,13 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s)
  {
      if (s) {
          uint8_t *p, *aux;
-        aux = bam1_aux(b);
+        aux = bam_get_aux(b);
          p = s - 2;
          __skip_tag(s);
          memmove(aux, p, s - p);
-        b->data_len -= bam_get_l_aux(b) - (s - p);
+        b->l_data -= bam_get_l_aux(b) - (s - p);
      } else {
-        b->data_len -= bam_get_l_aux(b);
+        b->l_data -= bam_get_l_aux(b);
      }
      return 0;
  }
diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c

index 0763976b85df5a319ad010635e77d7f16260ebd4..39fe5cea5ac62d7e29e1d76bdd8a19d34e9154f6 100644 (file)
--- a/samtools/bam_aux.c.pysam.c
+++ b/samtools/bam_aux.c.pysam.c
@@ -52,13 +52,13 @@ int bam_aux_drop_other(bam1_t *b, uint8_t *s)
  {
      if (s) {
          uint8_t *p, *aux;
-        aux = bam1_aux(b);
+        aux = bam_get_aux(b);
          p = s - 2;
          __skip_tag(s);
          memmove(aux, p, s - p);
-        b->data_len -= bam_get_l_aux(b) - (s - p);
+        b->l_data -= bam_get_l_aux(b) - (s - p);
      } else {
-        b->data_len -= bam_get_l_aux(b);
+        b->l_data -= bam_get_l_aux(b);
      }
      return 0;
  }
diff --git a/samtools/bam_cat.c b/samtools/bam_cat.c

index f3c812aea019e16df1e2fd42cccd5ed15c394f44..ed8cf58c578e623332016d22c4f191ecd08d5e84 100644 (file)
--- a/samtools/bam_cat.c
+++ b/samtools/bam_cat.c
@@ -1,6 +1,6 @@
  /*  bam_cat.c -- efficiently concatenates bam files.
  
-    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
      Modified SAMtools work copyright (C) 2010 Illumina, Inc.
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -270,22 +270,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
  
          // Copy contains and blocks within them
          while ((c = cram_read_container(in_c))) {
-            cram_block *blk;
-
-           if (cram_container_is_empty(in_c)) {
-                if (cram_write_container(out_c, c) != 0)
-                    return -1;
-
+            if (cram_container_is_empty(in_c)) {
+                cram_block *blk;
                  // Container compression header
                  if (!(blk = cram_read_block(in_c)))
                      return -1;
-                if (cram_write_block(out_c, blk) != 0) {
-                    cram_free_block(blk);
-                    return -1;
-                }
                  cram_free_block(blk);
                  cram_free_container(c);
-
                  continue;
              }
  
@@ -297,6 +288,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                  cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
              } else {
                  int32_t num_slices;
+                cram_block *blk;
  
                  // Not switching rg so do the usual read/write loop
                  if (cram_write_container(out_c, c) != 0)
@@ -467,7 +459,7 @@ int main_cat(int argc, char *argv[])
      char *outfn = 0;
      char **infns = NULL; // files to concatenate
      int infns_size = 0;
-    int c, ret = 0, no_pg = 0;
+    int c, ret = 0, no_pg = 0, usage = 0;
      samFile *in;
      sam_global_args ga;
  
@@ -481,7 +473,7 @@ int main_cat(int argc, char *argv[])
  
      sam_global_args_init(&ga);
  
-    while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
          switch (c) {
              case 'h': {
                  samFile *fph = sam_open(optarg, "r");
@@ -522,6 +514,8 @@ int main_cat(int argc, char *argv[])
                  break;
              default:
                  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                /* else fall-through */
+            case '?': usage=1; break;
          }
      }
  
@@ -539,7 +533,7 @@ int main_cat(int argc, char *argv[])
      }
  
      // Require at least one input file
-    if (infns_size + nargv_fns == 0) {
+    if (infns_size + nargv_fns == 0 || usage) {
          fprintf(stderr, "Usage: samtools cat [options] <in1.bam>  [... <inN.bam>]\n");
          fprintf(stderr, "       samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
          fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c

index 58a41b7ff5d856ab7903aa861a189bbd7f094dba..ef2199c78009fab1c3b6155b33ac9d719ba50b44 100644 (file)
--- a/samtools/bam_cat.c.pysam.c
+++ b/samtools/bam_cat.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_cat.c -- efficiently concatenates bam files.
  
-    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd.
      Modified SAMtools work copyright (C) 2010 Illumina, Inc.
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -272,22 +272,13 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
  
          // Copy contains and blocks within them
          while ((c = cram_read_container(in_c))) {
-            cram_block *blk;
-
-           if (cram_container_is_empty(in_c)) {
-                if (cram_write_container(out_c, c) != 0)
-                    return -1;
-
+            if (cram_container_is_empty(in_c)) {
+                cram_block *blk;
                  // Container compression header
                  if (!(blk = cram_read_block(in_c)))
                      return -1;
-                if (cram_write_block(out_c, blk) != 0) {
-                    cram_free_block(blk);
-                    return -1;
-                }
                  cram_free_block(blk);
                  cram_free_container(c);
-
                  continue;
              }
  
@@ -299,6 +290,7 @@ int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram,
                  cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
              } else {
                  int32_t num_slices;
+                cram_block *blk;
  
                  // Not switching rg so do the usual read/write loop
                  if (cram_write_container(out_c, c) != 0)
@@ -469,7 +461,7 @@ int main_cat(int argc, char *argv[])
      char *outfn = 0;
      char **infns = NULL; // files to concatenate
      int infns_size = 0;
-    int c, ret = 0, no_pg = 0;
+    int c, ret = 0, no_pg = 0, usage = 0;
      samFile *in;
      sam_global_args ga;
  
@@ -483,7 +475,7 @@ int main_cat(int argc, char *argv[])
  
      sam_global_args_init(&ga);
  
-    while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) {
          switch (c) {
              case 'h': {
                  samFile *fph = sam_open(optarg, "r");
@@ -524,6 +516,8 @@ int main_cat(int argc, char *argv[])
                  break;
              default:
                  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+                /* else fall-through */
+            case '?': usage=1; break;
          }
      }
  
@@ -541,7 +535,7 @@ int main_cat(int argc, char *argv[])
      }
  
      // Require at least one input file
-    if (infns_size + nargv_fns == 0) {
+    if (infns_size + nargv_fns == 0 || usage) {
          fprintf(samtools_stderr, "Usage: samtools cat [options] <in1.bam>  [... <inN.bam>]\n");
          fprintf(samtools_stderr, "       samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
          fprintf(samtools_stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
diff --git a/samtools/bam_color.c b/samtools/bam_color.c

index bee19b9da4a45cd1fef1672d557567bec40f9ca7..6decbc1c8edf585065fdc55fca8fca3fa16c2d30 100644 (file)
--- a/samtools/bam_color.c
+++ b/samtools/bam_color.c
@@ -25,7 +25,9 @@ DEALINGS IN THE SOFTWARE.  */
  #include <config.h>
  
  #include <ctype.h>
-#include "bam.h"
+#include <string.h>
+
+#include "htslib/sam.h"
  
  /*!
   @abstract     Get the color encoding the previous and current base
@@ -45,10 +47,10 @@ char bam_aux_getCSi(bam1_t *b, int i)
  
      cs = bam_aux2Z(c);
      // adjust for strandedness and leading adaptor
-    if(bam1_strand(b)) {
+    if(bam_is_rev(b)) {
          i = strlen(cs) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
          i -= cigar >> BAM_CIGAR_SHIFT;
          }
@@ -74,10 +76,10 @@ char bam_aux_getCQi(bam1_t *b, int i)
  
      cq = bam_aux2Z(c);
      // adjust for strandedness
-    if(bam1_strand(b)) {
+    if(bam_is_rev(b)) {
          i = strlen(cq) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
          i -= (cigar >> BAM_CIGAR_SHIFT);
          }
@@ -135,28 +137,28 @@ char bam_aux_getCEi(bam1_t *b, int i)
      cs = bam_aux2Z(c);
  
      // adjust for strandedness and leading adaptor
-    if(bam1_strand(b)) { //reverse strand
+    if(bam_is_rev(b)) { //reverse strand
          cs_i = strlen(cs) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
              cs_i -= cigar >> BAM_CIGAR_SHIFT;
          }
          // get current color
          cur_color = cs[cs_i];
          // get previous base.  Note: must rc adaptor
-        prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+        prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)];
          // get current base
-        cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+        cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
      }
      else {
          cs_i=i+1;
          // get current color
          cur_color = cs[cs_i];
          // get previous base
-        prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+        prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)];
          // get current base
-        cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+        cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
      }
  
      // corrected color
diff --git a/samtools/bam_color.c.pysam.c b/samtools/bam_color.c.pysam.c

index 762e83b118e80835a0dbbb6b0422f9a89c40d855..105cc332769d178affdc6c1bd27a755fda52ea52 100644 (file)
--- a/samtools/bam_color.c.pysam.c
+++ b/samtools/bam_color.c.pysam.c
@@ -27,7 +27,9 @@ DEALINGS IN THE SOFTWARE.  */
  #include <config.h>
  
  #include <ctype.h>
-#include "bam.h"
+#include <string.h>
+
+#include "htslib/sam.h"
  
  /*!
   @abstract     Get the color encoding the previous and current base
@@ -47,10 +49,10 @@ char bam_aux_getCSi(bam1_t *b, int i)
  
      cs = bam_aux2Z(c);
      // adjust for strandedness and leading adaptor
-    if(bam1_strand(b)) {
+    if(bam_is_rev(b)) {
          i = strlen(cs) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
          i -= cigar >> BAM_CIGAR_SHIFT;
          }
@@ -76,10 +78,10 @@ char bam_aux_getCQi(bam1_t *b, int i)
  
      cq = bam_aux2Z(c);
      // adjust for strandedness
-    if(bam1_strand(b)) {
+    if(bam_is_rev(b)) {
          i = strlen(cq) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
          i -= (cigar >> BAM_CIGAR_SHIFT);
          }
@@ -137,28 +139,28 @@ char bam_aux_getCEi(bam1_t *b, int i)
      cs = bam_aux2Z(c);
  
      // adjust for strandedness and leading adaptor
-    if(bam1_strand(b)) { //reverse strand
+    if(bam_is_rev(b)) { //reverse strand
          cs_i = strlen(cs) - 1 - i;
          // adjust for leading hard clip
-        uint32_t cigar = bam1_cigar(b)[0];
+        uint32_t cigar = bam_get_cigar(b)[0];
          if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {
              cs_i -= cigar >> BAM_CIGAR_SHIFT;
          }
          // get current color
          cur_color = cs[cs_i];
          // get previous base.  Note: must rc adaptor
-        prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)];
+        prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : seq_nt16_str[bam_seqi(bam_get_seq(b), i+1)];
          // get current base
-        cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+        cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
      }
      else {
          cs_i=i+1;
          // get current color
          cur_color = cs[cs_i];
          // get previous base
-        prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)];
+        prev_b = (0 == i) ? cs[0] : seq_nt16_str[bam_seqi(bam_get_seq(b), i-1)];
          // get current base
-        cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)];
+        cur_b = seq_nt16_str[bam_seqi(bam_get_seq(b), i)];
      }
  
      // corrected color
diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c

index 44879c207720507b79dd222e1d24e5c67e25b7b6..a4d757c050fbf6067735d2ce42a191cf8784b453 100644 (file)
--- a/samtools/bam_fastq.c
+++ b/samtools/bam_fastq.c
@@ -1,6 +1,6 @@
  /*  bam_fastq.c -- FASTA and FASTQ file generation
  
-    Copyright (C) 2009-2017, 2019 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -42,16 +42,11 @@ DEALINGS IN THE SOFTWARE.  */
  #include "samtools.h"
  #include "sam_opts.h"
  
-#define taglist_free(p)
-KLIST_INIT(ktaglist, char*, taglist_free)
-
  #define DEFAULT_BARCODE_TAG "BC"
  #define DEFAULT_QUALITY_TAG "QT"
  #define INDEX_SEPARATOR "+"
  
  int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-
  static void bam2fq_usage(FILE *to, const char *command)
  {
      int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
@@ -60,64 +55,71 @@ static void bam2fq_usage(FILE *to, const char *command)
      fprintf(to,
  "\n"
  "Description:\n"
-"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n"
+"Converts a SAM, BAM or CRAM to %s format.\n"
  "\n"
  "Options:\n"
-"  -0 FILE              write reads designated READ_OTHER to FILE\n"
-"  -1 FILE              write reads designated READ1 to FILE\n"
-"  -2 FILE              write reads designated READ2 to FILE\n"
-"  -o FILE              write reads designated READ1 or READ2 to FILE\n"
-"                       note: if a singleton file is specified with -s, only\n"
-"                       paired reads will be written to the -1 and -2 files.\n"
-"  -f INT               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT               only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
-"  -G INT               only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
-"  -n                   don't append /1 and /2 to the read name\n"
-"  -N                   always append /1 and /2 to the read name\n");
+"  -0 FILE      write reads designated READ_OTHER to FILE\n"
+"  -1 FILE      write reads designated READ1 to FILE\n"
+"  -2 FILE      write reads designated READ2 to FILE\n"
+"  -o FILE      write reads designated READ1 or READ2 to FILE\n"
+"               note: if a singleton file is specified with -s, only\n"
+"               paired reads will be written to the -1 and -2 files.\n"
+"  -f INT       only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
+"  -F INT       only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
+"  -G INT       only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
+"  -n           don't append /1 and /2 to the read name\n"
+"  -N           always append /1 and /2 to the read name\n",
+    fq ? "FASTQ" : "FASTA");
      if (fq) fprintf(to,
-"  -O                   output quality in the OQ tag if present\n");
+"  -O           output quality in the OQ tag if present\n");
      fprintf(to,
-"  -s FILE              write singleton reads designated READ1 or READ2 to FILE\n"
-"  -t                   copy RG, BC and QT tags to the %s header line\n",
+"  -s FILE      write singleton reads designated READ1 or READ2 to FILE\n"
+"  -t           copy RG, BC and QT tags to the %s header line\n",
      fq ? "FASTQ" : "FASTA");
      fprintf(to,
-"  -T TAGLIST           copy arbitrary tags to the %s header line\n",
+"  -T TAGLIST   copy arbitrary tags to the %s header line\n",
      fq ? "FASTQ" : "FASTA");
      if (fq) fprintf(to,
-"  -v INT               default quality score if not given in file [1]\n"
-"  -i                   add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
-"  -c                   compression level [0..9] to use when creating gz or bgzf fastq files [1]\n"
-"  --i1 FILE            write first index reads to FILE\n"
-"  --i2 FILE            write second index reads to FILE\n"
-"  --barcode-tag TAG    Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
-"  --quality-tag TAG    Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
-"  --index-format STR   How to parse barcode and quality tags\n\n");
+"  -v INT       default quality score if not given in file [1]\n"
+"  -i           add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+"  -c INT       compression level [0..9] to use when writing bgzf files [1]\n"
+"  --i1 FILE    write first index reads to FILE\n"
+"  --i2 FILE    write second index reads to FILE\n"
+"  --barcode-tag TAG\n"
+"               Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
+"  --quality-tag TAG\n"
+"               Quality tag [" DEFAULT_QUALITY_TAG "]\n"
+"  --index-format STR\n"
+"               How to parse barcode and quality tags\n\n");
      sam_global_opt_help(to, "-.--.@-.");
      fprintf(to,
  "\n"
-"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n"
-"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n"
+"The files will be automatically compressed if the file names have a .gz\n"
+"or .bgzf extension.  The input to this program must be collated by name.\n"
+"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
  "\n"
  "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
  "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n"
-"or both unset.\n"
+"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
  "Run 'samtools flags' for more information on flag codes and meanings.\n");
      fprintf(to,
  "\n"
-"The index-format string describes how to parse the barcode and quality tags, for example:\n"
-"   i14i8       the first 14 characters are index 1, the next 8 characters are index 2\n"
-"   n8i14       ignore the first 8 characters, and use the next 14 characters for index 1\n"
-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
-"'read until the separator or end of tag', for example:\n"
-"   n*i*        ignore the left part of the tag until the separator, then use the second part\n"
-"               of the tag as index 1\n");
+"The index-format string describes how to parse the barcode and quality tags.\n"
+"It is made up of 'i' or 'n' followed by a length or '*'.  For example:\n"
+"   i14i8       The first 14 characters are index 1, the next 8 are index 2\n"
+"   n8i14       Ignore the first 8 characters, and use the next 14 for index 1\n\n"
+"If the tag contains a separator, then the numeric part can be replaced with\n"
+"'*' to mean 'read until the separator or end of tag', for example:\n"
+"   i*i*        Break the tag at the separator into index 1 and index 2\n"
+"   n*i*        Ignore the left part of the tag until the separator,\n"
+"               then use the second part of the tag as index 1\n");
      fprintf(to,
  "\n"
  "Examples:\n"
-" To get just the paired reads in separate files, use:\n"
-"   samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n"
-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n"
+"To get just the paired reads in separate files, use:\n"
+"   samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
+"\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
+"the output:\n"
  "   samtools %s in.bam > all_reads.%s\n",
              command, fq ? "fq" : "fa", fq ? "fq" : "fa",
              command, fq ? "fq" : "fa");
@@ -144,96 +146,20 @@ typedef struct bam2fq_opts {
  
  typedef struct bam2fq_state {
      samFile *fp;
-    BGZF *fpse;
-    BGZF *fpr[3];
-    BGZF *fpi[2];
-    BGZF *hstdout;
+    samFile *fpse;
+    samFile *fpr[3];
+    samFile *fpi[3];
+    samFile *hstdout;
      sam_hdr_t *h;
      bool has12, use_oq, copy_tags, illumina_tag;
      int flag_on, flag_off, flag_alloff;
      fastfile filetype;
      int def_qual;
-    klist_t(ktaglist) *taglist;
      char *index_sequence;
      char compression_level;
      htsThreadPool p;
  } bam2fq_state_t;
  
-/*
- * Get and decode the read from a BAM record.
- *
- * TODO: htslib really needs an interface for this.  Consider this or perhaps
- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
- * functions as string formatted equivalents to bam_get_{seq,qual}?
- */
-
-/*
- * Reverse a string in place.
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
- */
-static char *reverse(char *str)
-{
-    int i = strlen(str)-1,j=0;
-    char ch;
-    while (i>j) {
-        ch = str[i];
-        str[i]= str[j];
-        str[j] = ch;
-        i--;
-        j++;
-    }
-    return str;
-}
-
-/* return the read, reverse complemented if necessary */
-static char *get_read(const bam1_t *rec)
-{
-    int len = rec->core.l_qseq + 1;
-    char *read = calloc(1, len);
-    char *seq = (char *)bam_get_seq(rec);
-    int n;
-
-    if (!read) return NULL;
-
-    for (n=0; n < rec->core.l_qseq; n++) {
-        if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
-        else                               read[n] = seq_nt16_str[bam_seqi(seq,n)];
-    }
-    if (rec->core.flag & BAM_FREVERSE) reverse(read);
-    return read;
-}
-
-/*
- * get and decode the quality from a BAM record
- */
-static int get_quality(const bam1_t *rec, char **qual_out)
-{
-    char *quality = calloc(1, rec->core.l_qseq + 1);
-    char *q = (char *)bam_get_qual(rec);
-    int n;
-
-    if (!quality) return -1;
-
-    if (*q == '\xff') {
-        free(quality);
-        *qual_out = NULL;
-        return 0;
-    }
-
-    for (n=0; n < rec->core.l_qseq; n++) {
-        quality[n] = q[n]+33;
-    }
-    if (rec->core.flag & BAM_FREVERSE) reverse(quality);
-    *qual_out = quality;
-    return 0;
-}
-
-//
-// End of htslib complaints
-//
-
-
  static readpart which_readpart(const bam1_t *b)
  {
      if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -245,290 +171,8 @@ static readpart which_readpart(const bam1_t *b)
      }
  }
  
-/*
- * parse the length part from the index-format string
- */
-static int getLength(char **s)
-{
-    int n = 0;
-    while (**s) {
-        if (**s == '*') { n=-1; (*s)++; break; }
-        if ( !isdigit(**s)) break;
-        n = n*10 + ((**s)-'0');
-        (*s)++;
-    }
-    return n;
-}
-
-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
-{
-    uint8_t *s = bam_aux_get(rec, tag);
-    if (s) {
-        char aux_type = *s;
-        switch (aux_type) {
-            case 'C':
-            case 'S': aux_type = 'I'; break;
-            case 'c':
-            case 's': aux_type = 'i'; break;
-            case 'd': aux_type = 'f'; break;
-        }
-
-        // Ensure space.  Need 6 chars + length of tag.  Max length of
-        // i is 16, A is 21, B currently 26, Z is unknown, so
-        // have to check that one later.
-        if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
-
-        kputc('\t', linebuf);
-        kputsn(tag, 2, linebuf);
-        kputc(':', linebuf);
-        kputc(aux_type=='I'? 'i': aux_type, linebuf);
-        kputc(':', linebuf);
-        switch (aux_type) {
-            case 'H':
-            case 'Z':
-                if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
-                break;
-            case 'i': kputw(bam_aux2i(s), linebuf); break;
-            case 'I': kputuw(bam_aux2i(s), linebuf); break;
-            case 'A': kputc(bam_aux2A(s), linebuf); break;
-            case 'f': kputd(bam_aux2f(s), linebuf); break;
-            case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
-            default:  kputs("*** Unknown aux type ***", linebuf); return false;
-       }
-    }
-    return true;
-}
-
-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
-{
-    if (!index_sequence) return 0;
-
-    kstring_t new = {0,0,NULL};
-    if (linebuf->s) {
-        char *s = strchr(linebuf->s, '\n');
-        if (s) {
-            if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
-                return -1;
-            *s = 0;
-            kputs(linebuf->s, &new);
-            kputc(' ', &new);
-            readpart readpart = which_readpart(rec);
-            if (readpart == READ_1) kputc('1', &new);
-            else if (readpart == READ_2) kputc('2', &new);
-            else kputc('0', &new);
-
-            kputc(':', &new);
-            if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
-            else                              kputc('N', &new);
-
-            kputs(":0:", &new);
-            kputs(index_sequence, &new);
-            kputc('\n', &new);
-            kputs(s+1, &new);
-            free(ks_release(linebuf));
-            linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
-        }
-    }
-    return 0;
-}
-
-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
-{
-    int i;
-
-    linebuf->l = 0;
-    // Write read name
-    if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
-    if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
-    // Add the /1 /2 if requested
-    if (state->has12) {
-        readpart readpart = which_readpart(rec);
-        if (readpart == READ_1) {
-            if (kputs("/1", linebuf) < 0) return false;
-        } else if (readpart == READ_2) {
-            if (kputs("/2", linebuf) < 0) return false;
-        }
-    }
-    if (state->copy_tags) {
-        for (i = 0; copied_tags[i]; ++i) {
-            if (!copy_tag(copied_tags[i], rec, linebuf)) {
-                fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
-                return false;
-            }
-        }
-    }
-
-    if (state->taglist->size) {
-        kliter_t(ktaglist) *p;
-        for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
-            if (!copy_tag(kl_val(p), rec, linebuf)) {
-                fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
-                return false;
-            }
-        }
-    }
-
-    if (kputc('\n', linebuf) < 0) return false;
-    if (kputs(seq, linebuf) < 0) return false;
-    if (kputc('\n', linebuf) < 0) return false;
-
-    if (state->filetype == FASTQ) {
-        // Write quality
-        if (kputs("+\n", linebuf) < 0) return false;
-        if (qual && *qual) {
-            if (kputs(qual, linebuf) < 0) return false;
-        } else {
-            int len = strlen(seq);
-            if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
-            for (i = 0; i < len; ++i) {
-                kputc(33 + state->def_qual, linebuf);
-            }
-        }
-        if (kputc('\n', linebuf) < 0) return false;
-    }
-    return true;
-}
-
-/*
- * Create FASTQ lines from the barcode tag using the index-format
- */
-static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
-{
-    uint8_t *p;
-    char *ifmt = opts->index_format;
-    char *tag = NULL;
-    char *qual = NULL;
-    char *sub_tag = NULL;
-    char *sub_qual = NULL;
-    size_t tag_len;
-    int file_number = 0;
-    kstring_t linebuf = { 0, 0, NULL }; // Buffer
-
-    if (!ifmt) return true;
-
-    // read barcode tag
-    p = bam_aux_get(rec,opts->barcode_tag);
-    if (p) tag = bam_aux2Z(p);
-
-    if (!tag) return true; // there is no tag
-
-    tag_len = strlen(tag);
-    sub_tag = calloc(1, tag_len + 1);
-    if (!sub_tag) goto fail;
-    sub_qual = calloc(1, tag_len + 1);
-    if (!sub_qual) goto fail;
-
-    // read quality tag
-    p = bam_aux_get(rec, opts->quality_tag);
-    if (p) qual = bam_aux2Z(p);
-
-    // Parse the index-format string
-    while (*ifmt) {
-        if (file_number > 1) break;     // shouldn't happen if we've validated paramaters correctly
-        char action = *ifmt;        // should be 'i' or 'n'
-        ifmt++; // skip over action
-        int index_len = getLength(&ifmt);
-        int n = 0;
-
-        if (index_len < 0) {
-            // read until separator
-            while (isalpha(*tag)) {
-                sub_tag[n] = *tag++;
-                if (qual) sub_qual[n] = *qual++;
-                n++;
-            }
-            if (*tag) { // skip separator
-                tag++;
-                if (qual) qual++;
-            }
-        } else {
-            // read index_len characters
-            while (index_len-- && *tag) {
-                sub_tag[n] = *tag++;
-                if (qual) sub_qual[n] = *qual++;
-                n++;
-            }
-        }
-        sub_tag[n] = '\0';
-        sub_qual[n] = '\0';
-
-        if (action=='i' && *sub_tag) {
-            if (state->index_sequence) {
-                char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2);
-                if (!new_index_sequence) goto fail;
-                state->index_sequence = new_index_sequence;
-                strcat(state->index_sequence, INDEX_SEPARATOR);
-                strcat(state->index_sequence, sub_tag);
-            } else {
-                state->index_sequence = strdup(sub_tag);    // we're going to need this later...
-            }
-            if (!state->index_sequence) goto fail;
-            if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
-            if (state->illumina_tag) {
-                if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) {
-                    goto fail;
-                }
-            }
-            if (state->fpi[file_number]) {
-                if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
-                    goto fail;
-            }
-        }
-
-    }
-
-    free(sub_qual); free(sub_tag);
-    free(linebuf.s);
-    return true;
-
- fail:
-    perror(__func__);
-    free(sub_qual); free(sub_tag);
-    free(linebuf.s);
-    return false;
-}
-
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
-{
-    int32_t qlen = b->core.l_qseq;
-    assert(qlen >= 0);
-    const uint8_t *oq = NULL;
-    char *qual = NULL;
-
-    char *seq = get_read(b);
-    if (!seq) return false;
-
-    if (state->use_oq) oq = bam_aux_get(b, "OQ");
-    if (oq && *oq=='Z') {
-        qual = strdup(bam_aux2Z(oq));
-        if (!qual) goto fail;
-        if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
-            reverse(qual);
-        }
-    } else {
-        if (get_quality(b, &qual) < 0) goto fail;
-    }
-
-    if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
-
-    free(qual);
-    free(seq);
-    return true;
-
- fail:
-    free(seq);
-    free(qual);
-    return false;
-}
-
  static void free_opts(bam2fq_opts_t *opts)
  {
-    free(opts->barcode_tag);
-    free(opts->quality_tag);
-    free(opts->index_format);
-    free(opts->extra_tags);
      free(opts);
  }
  
@@ -566,13 +210,14 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
          {"quality-tag", required_argument, NULL, 'q'},
          { NULL, 0, NULL, 0 }
      };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+                            lopts, NULL)) > 0) {
          switch (c) {
-            case 'b': opts->barcode_tag = strdup(optarg); break;
-            case 'q': opts->quality_tag = strdup(optarg); break;
+            case 'b': opts->barcode_tag = optarg; break;
+            case 'q': opts->quality_tag = optarg; break;
              case  1 : opts->index_file[0] = optarg; break;
              case  2 : opts->index_file[1] = optarg; break;
-            case  3 : opts->index_format = strdup(optarg); break;
+            case  3 : opts->index_format = optarg; break;
              case '0': opts->fnr[0] = optarg; break;
              case '1': opts->fnr[1] = optarg; break;
              case '2': opts->fnr[2] = optarg; break;
@@ -583,7 +228,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
                      flag_off_set = 1;
                      opts->flag_off = 0;
                  }
-                opts->flag_off |= strtol(optarg, 0, 0); break;
+                opts->flag_off |= strtol(optarg, 0, 0);
+                break;
              case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
              case 'n': opts->has12 = false; break;
              case 'N': opts->has12always = true; break;
@@ -591,13 +237,25 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case 's': opts->fnse = optarg; break;
              case 't': opts->copy_tags = true; break;
              case 'i': opts->illumina_tag = true; break;
-            case 'c': opts->compression_level = atoi(optarg); break;
-            case 'T': opts->extra_tags = strdup(optarg); break;
+            case 'c':
+                opts->compression_level = atoi(optarg);
+                if (opts->compression_level < 0)
+                    opts->compression_level = 0;
+                if (opts->compression_level > 9)
+                    opts->compression_level = 9;
+                break;
+            case 'T': opts->extra_tags = optarg; break;
              case 'v': opts->def_qual = atoi(optarg); break;
-            case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
+
+            case '?':
+                bam2fq_usage(stderr, argv[0]);
+                free_opts(opts);
+                return false;
              default:
                  if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
-                    bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
+                    bam2fq_usage(stderr, argv[0]);
+                    free_opts(opts);
+                    return false;
                  }
                  break;
          }
@@ -606,8 +264,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
      if (opts->has12always) opts->has12 = true;
  
-    if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
-    if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+    if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
+    if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
  
      int nIndex = 0;
      if (opts->index_format) {
@@ -652,7 +310,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      }
  
      const char* type_str = argv[0];
-    if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+    if (strcasecmp("fastq", type_str) == 0 ||
+        strcasecmp("bam2fq", type_str) == 0) {
          opts->filetype = FASTQ;
      } else if (strcasecmp("fasta", type_str) == 0) {
          opts->filetype = FASTA;
@@ -680,34 +339,61 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      return true;
  }
  
-static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp)
-{
-    char mode[4] = "w";
-    size_t len = strlen(filename);
-
-    mode[2] = 0; mode[3] = 0;
-    if (len > 3 && strstr(filename + (len - 3),".gz")) {
-        mode[1] = 'g'; mode[2] = c+'0';
-    } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
-               || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
-        mode[1] = c+'0';
-    } else {
-        mode[1] = 'u';
+void set_sam_opts(samFile *fp, bam2fq_state_t *state,
+                  const bam2fq_opts_t *opts) {
+    if (state->has12)
+        hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
+
+    if (state->illumina_tag)
+        hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
+
+    hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
+
+    kstring_t tag_list = {0,0};
+    if (state->copy_tags)
+        kputs("RG,BC,QT", &tag_list);
+    if (opts->extra_tags) {
+        if (tag_list.l)
+            kputc(',', &tag_list);
+        kputs(opts->extra_tags, &tag_list);
      }
+    if (tag_list.l)
+        hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+    ks_free(&tag_list);
+}
  
-    BGZF *fp = bgzf_open(filename,mode);
+// Open a file as normal or gzipped based on filename.
+// Note we always use bgzf and don't bother to attempt non-blocked
+// gzip streams.  This is a departure from the old fastq code.
+static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
+    char modez[6];
+    strcpy(modez, mode);
+
+    size_t l = strlen(fn);
+    if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
+        (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
+        (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
+        char m[3] = {'z', state->compression_level+'0', '\0'};
+        strcat(modez, m);
+    }
+
+    samFile *fp = sam_open(fn, modez);
      if (!fp)
-        return fp;
-    if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) {
-        bgzf_close(fp);
          return NULL;
-    }
+
+    if (state->p.pool)
+        hts_set_thread_pool(fp, &state->p);
+
      return fp;
  }
  
  static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
  {
+    char *mode = opts->filetype == FASTA ? "wF" : "wf";
+
      bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+    if (!state)
+        return false;
      state->flag_on = opts->flag_on;
      state->flag_off = opts->flag_off;
      state->flag_alloff = opts->flag_alloff;
@@ -721,22 +407,6 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->hstdout = NULL;
      state->compression_level = opts->compression_level;
  
-    state->taglist = kl_init(ktaglist);
-    if (opts->extra_tags) {
-        char *save_p;
-        char *s = strtok_r(opts->extra_tags, ",", &save_p);
-        while (s) {
-            if (strlen(s) != 2) {
-                fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s);
-                free(state);
-                return false;
-            }
-            char **et = kl_pushp(ktaglist, state->taglist);
-            *et = s;
-            s = strtok_r(NULL, ",", &save_p);
-        }
-    }
-
      state->fp = sam_open(opts->fn_input, "r");
      if (state->fp == NULL) {
          print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
@@ -768,12 +438,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
          return false;
      }
      if (opts->fnse) {
-        state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p);
-        if (state->fpse == NULL) {
-            print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+        if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
+            print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
              free(state);
              return false;
          }
+        set_sam_opts(state->fpse, state, opts);
      }
  
      if (opts->ga.reference) {
@@ -784,6 +454,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
          }
      }
  
+    // single, read1, read2
      int i, j;
      for (i = 0; i < 3; ++i) {
          if (opts->fnr[i]) {
@@ -791,28 +462,30 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
                  if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
                      break;
              if (j == i) {
-                state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p);
-                if (state->fpr[i] == NULL) {
-                    print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"",
+                if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
+                    print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
                                        i, opts->fnr[i]);
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->fpr[i], state, opts);
              } else {
                  state->fpr[i] = state->fpr[j];
              }
          } else {
              if (!state->hstdout) {
-                state->hstdout = bgzf_dopen(fileno(stdout), "wu");
-                if (!state->hstdout) {
+                if (!(state->hstdout = sam_open_z("-", mode, state))) {
                      print_error_errno("bam2fq", "Cannot open STDOUT");
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->hstdout, state, opts);
              }
              state->fpr[i] = state->hstdout;
          }
      }
+
+    // index 1, index 2
      for (i = 0; i < 2; i++) {
          state->fpi[i] = NULL;
          if (opts->index_file[i]) {
@@ -823,13 +496,14 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
                  if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
                      break;
              if (i == j) {
-                state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p);
-                if (state->fpi[i] == NULL) {
-                    print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"",
+                if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
+                                                 state))) {
+                    print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
                                        i+1, opts->index_file[i]);
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->fpi[i], state, opts);
              } else if (j < 0) {
                  state->fpi[i] = state->fpr[j+3];
              } else {
@@ -854,21 +528,25 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
      bool valid = true;
      sam_hdr_destroy(state->h);
      check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
-    if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+    if (state->fpse && sam_close(state->fpse) < 0) {
+        print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
+        valid = false;
+    }
+
      int i, j;
      for (i = 0; i < 3; ++i) {
          if (state->fpr[i] != state->hstdout) {
              for (j = 0; j < i; j++)
                  if (state->fpr[i] == state->fpr[j])
                      break;
-            if (j == i && bgzf_close(state->fpr[i])) {
+            if (j == i && sam_close(state->fpr[i])) {
                  print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
                  valid = false;
              }
          }
      }
      if (state->hstdout) {
-        if (bgzf_close(state->hstdout)) {
+        if (sam_close(state->hstdout) < 0) {
              print_error_errno("bam2fq", "Error closing STDOUT");
              valid = false;
          }
@@ -880,12 +558,11 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
          for (j -= 3; j >= 0 && j < i; j++)
              if (state->fpi[i] == state->fpi[j])
                  break;
-        if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) {
+        if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
              print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
              valid = false;
          }
      }
-    kl_destroy(ktaglist,state->taglist);
      free(state->index_sequence);
      if (state->p.pool)
          hts_tpool_destroy(state->p.pool);
@@ -901,135 +578,300 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
  
  }
  
+int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
+                    bam2fq_opts_t* opts, char *seq, int seq_len,
+                    char *qual, int qual_len) {
+    if (!fp || !b || !seq_len)
+        return 0;
+
+    int ret = -1;
+    bam1_t *b2 = bam_init1(); // FIXME: reuse
+    if (!b2)
+        return -1;
+
+    size_t aux_len = b->data + b->l_data - bam_get_aux(b);
+    if (bam_set1(b2, b->core.l_qname, bam_get_qname(b),
+                 (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
+                 -1, -1, 0,    // refid, pos, mapq
+                 0, NULL,      // cigar
+                 -1, -1, 0,    // rnext, pnext, tlen
+                 seq_len, seq, qual,
+                 aux_len) < 0)
+        goto err;
+
+    uint8_t *q = bam_get_qual(b2);
+    if (qual) {
+        int i;
+        for (i = 0; i < seq_len; i++)
+            q[i] -= '!';
+    } else {
+        memset(q, opts->def_qual, seq_len);
+    }
+
+    memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
+    b2->l_data += aux_len;
+    if (sam_write1(fp, state->h, b2) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b2)
+        bam_destroy1(b2);
+    return ret;
+}
+
+int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
+                 bam2fq_opts_t* opts) {
+    bam1_t *b[2] = {b1, b2};
+
+    char *ifmt = opts->index_format;
+    if (!ifmt)
+        ifmt = "i*i*";
+
+    // Get seq / qual elements
+    char *bc = NULL, *qt = NULL;
+    if (b1)
+        bc = (char *)bam_aux_get(b1, opts->barcode_tag);
+    if (b2 && !bc)
+        bc = (char *)bam_aux_get(b2, opts->barcode_tag);
+    if (!bc)
+        return 0;
+    else
+        bc++; // skip Z
+
+    if (b1)
+        qt = (char *)bam_aux_get(b1, opts->quality_tag);
+    if (b2 && !qt)
+        qt = (char *)bam_aux_get(b2, opts->quality_tag);
+    if (qt && strlen(bc) != strlen(qt)-1)
+        qt = NULL;
+    else if (qt)
+        qt++;
+
+    int inum = 0;
+    while (inum < 2) {
+        char fc = *ifmt++;
+        if (!fc)
+            break; // ran out of index-format
+
+        long len, rem = 0;
+        if (isdigit(*ifmt)) {
+            rem = len = strtol(ifmt, &ifmt, 10);
+        } else {
+            ifmt++;
+            len = 0;
+        }
+
+        char *bc_end = bc, *qt_end = qt;
+        while (len ? *bc_end && rem-- : isalpha(*bc_end))
+            bc_end++, qt_end += qt != NULL;
+
+        switch (fc) {
+        case 'n':
+            // skip
+            bc = bc_end + (len==0);
+            if (qt)
+                qt = qt_end + (len==0);
+            break;
+
+        case 'i':
+            if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+                                bc, bc_end-bc, qt, qt_end-qt) < 0)
+                return -1;
+            bc = bc_end + (len==0);
+            if (qt)
+                qt = qt_end + (len==0);
+            inum++;
+            break;
+
+        default:
+            fprintf(stderr, "Unknown index-format code\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
+                     bam1_t *b[4], int score[3], int best[3],
+                     int64_t *n_singletons) {
+    // Paired data, with 1 or 2 ends present.
+    if (score[1] > 0 && score[2] > 0) {
+        // If CASAVA tag is required and barcode is only on R1,
+        // copy it to R2
+        if (state->illumina_tag) {
+            char *tag;
+            if ((tag = (char *)bam_aux_get(b[best[1]],
+                                           opts->barcode_tag)))
+                if (bam_aux_update_str(b[best[2]],
+                                       opts->barcode_tag,
+                                       strlen(tag), tag+1) < 0)
+                    goto err;
+            if ((tag = (char *)bam_aux_get(b[best[1]],
+                                           opts->quality_tag)))
+                if (bam_aux_update_str(b[best[2]],
+                                       opts->quality_tag,
+                                       strlen(tag), tag+1) < 0)
+                    goto err;
+
+        }
+        if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+            goto err;
+        if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+            goto err;
+
+        if (output_index(b[best[1]], b[best[2]], state, opts) < 0)
+            goto err;
+    } else if (score[1] > 0 || score[2] > 0) {
+        if (state->fpse) {
+            // print whichever one exists to fpse
+            if (score[1] > 0) {
+                if (sam_write1(state->fpse, state->h, b[best[1]]) < 0)
+                    goto err;
+            } else {
+                if (sam_write1(state->fpse, state->h, b[best[2]]) < 0)
+                    goto err;
+            }
+            ++(*n_singletons);
+        } else {
+            if (score[1] > 0) {
+                if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+                    goto err;
+            } else {
+                if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+                    goto err;
+            }
+        }
+
+        if (output_index(score[1] > 0 ? b[best[1]] : NULL,
+                         score[2] > 0 ? b[best[2]] : NULL,
+                         state, opts) < 0)
+            goto err;
+    }
+
+    if (score[0]) { // single ended data (neither READ1 nor READ2)
+        if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0)
+            goto err;
+
+        if (output_index(b[best[0]], NULL, state, opts) < 0)
+            goto err;
+    }
+
+    return 0;
+
+ err:
+    return -1;
+}
+
  static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
  {
      int n;
-    bam1_t *records[3] = {NULL, NULL, NULL};
      char *current_qname = NULL;
      int64_t n_reads = 0, n_singletons = 0; // Statistics
-    kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
      int score[3];
      int at_eof;
-    bool valid = true;
-    bam1_t* b = NULL;
+    bool valid = false;
+    int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
+                                // indexed by [readpart]
+    bam1_t *b[4];               // 3 readparts, plus current record
  
-    while (true) {
-        if (!b)
-            b = bam_init1();
-        if (b == NULL) {
+    for (n = 0; n < 4; n++) {
+        if (!(b[n] = bam_init1())) {
              perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
-            valid = false;
-            break;
+            return false;
          }
-        int res = sam_read1(state->fp, state->h, b);
+    }
+
+    n = 0;
+    while (true) {
+        int res = sam_read1(state->fp, state->h, b[n]);
          if (res < -1) {
              fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
-            valid = false;
-            break;
+            goto err;
          }
          at_eof = res < 0;
  
-        if (!at_eof && filter_it_out(b, state))
+        if (!at_eof && filter_it_out(b[n], state))
              continue;
-        if (!at_eof) ++n_reads;
-
-        if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
-            if (current_qname) {
-                if (state->illumina_tag) {
-                    for (n=0; valid && n<3; n++) {
-                        if (!records[n]) continue;
-                        if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
-                    }
-                    if (!valid) break;
-                }
-                free(state->index_sequence); state->index_sequence = NULL;
-                if (score[1] > 0 && score[2] > 0) {
-                    // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
-                    if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                    if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                } else if (score[1] > 0 || score[2] > 0) {
-                    if (state->fpse) {
-                        // print whichever one exists to fpse
-                        if (score[1] > 0) {
-                            if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                        } else {
-                            if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                        }
-                        ++n_singletons;
-                    } else {
-                        if (score[1] > 0) {
-                            if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                        } else {
-                            if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                        }
-                    }
-                }
-                if (score[0]) { // TODO: check this
-                    // print linebuf[0] to fpr[0]
-                    if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
-                }
+        if (!at_eof) {
+            ++n_reads;
+
+            // Handle -O option: use OQ for qual
+            uint8_t *oq;
+            if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
+                int i, l = strlen((char *)++oq);
+                uint8_t *qual = bam_get_qual(b[n]);
+                for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
+                    qual[i] = oq[i] - '!';
              }
+        }
  
+        if (at_eof
+            || !current_qname
+            || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
+            // New name, so flush best examples of previous name.
+            if (current_qname)
+                if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
+                    goto err;
  
-            free(current_qname); current_qname = NULL;
+            current_qname = bam_get_qname(b[n]);
              score[0] = score[1] = score[2] = 0;
-            for (n=0; n < 3; n++) {
-                bam_destroy1(records[n]); records[n]=NULL;
-            }
  
              if (at_eof) { break; }
-
-            current_qname = strdup(bam_get_qname(b));
-            if (!current_qname) { valid = false; break; }
          }
  
          // Prefer a copy of the read that has base qualities
-        int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
-        readpart rp = which_readpart(b);
-        if (b_score > score[rp]) {
-            if (!tags2fq(b, state, opts)) { valid = false; break; }
-            if (records[rp]) bam_destroy1(records[rp]);
-            records[rp] = b;
+        int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
+        readpart rp = which_readpart(b[n]);
+        if (score[rp] < b_score) {
              score[rp] = b_score;
-            b = NULL;
-            if(!bam1_to_fq(records[rp], &linebuf[rp], state)) {
-                fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
-                valid = false; break;
-            }
+            // Record b[n] slot for best copy of readpair and find a new
+            // slot for next bam read
+            best[rp] = n;
+            int used_slot[4] = {0}, i;
+            for (i = 0; i < 3; i++)
+                if (best[i] >= 0)
+                    used_slot[best[i]] = 1;
+            for (i = 0; i < 4 && used_slot[i]; i++)
+                ;
+            n = i;
          }
      }
+
+    valid = true;
+ err:
      if (!valid)
-    {
-        perror("[bam2fq_mainloop] Error writing to FASTx files.");
-    }
-    bam_destroy1(b);
-    for (n=0; n < 3; n++) {
-        bam_destroy1(records[n]);
-    }
-    free(current_qname);
-    free(linebuf[0].s);
-    free(linebuf[1].s);
-    free(linebuf[2].s);
-    fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
-    fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+        print_error_errno("bam2fq", "Error writing to FASTx files.");
+
+    for (n = 0; n < 4; n++)
+        bam_destroy1(b[n]);
+
+    fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n",
+            __func__, n_singletons);
+    fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n",
+            __func__, n_reads);
  
      return valid;
  }
  
  int main_bam2fq(int argc, char *argv[])
  {
-    int status = EXIT_SUCCESS;
+    int status = EXIT_FAILURE;
      bam2fq_opts_t* opts = NULL;
      bam2fq_state_t* state = NULL;
  
      bool valid = parse_opts(argc, argv, &opts);
      if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
  
-    if (!init_state(opts, &state)) return EXIT_FAILURE;
+    if (!init_state(opts, &state)) goto err;
+
+    if (!bam2fq_mainloop(state,opts)) goto err;
  
-    if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
+    if (!destroy_state(opts, state, &status)) goto err;
  
-    if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+    status = EXIT_SUCCESS;
+ err:
      sam_global_args_free(&opts->ga);
      free_opts(opts);
  
diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c

index 2fe4c87207aec7565e6987a290beb029e7ac6c8f..f7249d1823cfa76d7c8270631d5535c0df097fd1 100644 (file)
--- a/samtools/bam_fastq.c.pysam.c
+++ b/samtools/bam_fastq.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_fastq.c -- FASTA and FASTQ file generation
  
-    Copyright (C) 2009-2017, 2019 Genome Research Ltd.
+    Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -44,16 +44,11 @@ DEALINGS IN THE SOFTWARE.  */
  #include "samtools.h"
  #include "sam_opts.h"
  
-#define taglist_free(p)
-KLIST_INIT(ktaglist, char*, taglist_free)
-
  #define DEFAULT_BARCODE_TAG "BC"
  #define DEFAULT_QUALITY_TAG "QT"
  #define INDEX_SEPARATOR "+"
  
  int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
-static const char *copied_tags[] = { "RG", "BC", "QT", NULL };
-
  static void bam2fq_usage(FILE *to, const char *command)
  {
      int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0;
@@ -62,64 +57,71 @@ static void bam2fq_usage(FILE *to, const char *command)
      fprintf(to,
  "\n"
  "Description:\n"
-"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n"
+"Converts a SAM, BAM or CRAM to %s format.\n"
  "\n"
  "Options:\n"
-"  -0 FILE              write reads designated READ_OTHER to FILE\n"
-"  -1 FILE              write reads designated READ1 to FILE\n"
-"  -2 FILE              write reads designated READ2 to FILE\n"
-"  -o FILE              write reads designated READ1 or READ2 to FILE\n"
-"                       note: if a singleton file is specified with -s, only\n"
-"                       paired reads will be written to the -1 and -2 files.\n"
-"  -f INT               only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT               only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
-"  -G INT               only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
-"  -n                   don't append /1 and /2 to the read name\n"
-"  -N                   always append /1 and /2 to the read name\n");
+"  -0 FILE      write reads designated READ_OTHER to FILE\n"
+"  -1 FILE      write reads designated READ1 to FILE\n"
+"  -2 FILE      write reads designated READ2 to FILE\n"
+"  -o FILE      write reads designated READ1 or READ2 to FILE\n"
+"               note: if a singleton file is specified with -s, only\n"
+"               paired reads will be written to the -1 and -2 files.\n"
+"  -f INT       only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
+"  -F INT       only include reads with none of the FLAGS in INT present [0x900]\n"       //   F&x == 0
+"  -G INT       only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
+"  -n           don't append /1 and /2 to the read name\n"
+"  -N           always append /1 and /2 to the read name\n",
+    fq ? "FASTQ" : "FASTA");
      if (fq) fprintf(to,
-"  -O                   output quality in the OQ tag if present\n");
+"  -O           output quality in the OQ tag if present\n");
      fprintf(to,
-"  -s FILE              write singleton reads designated READ1 or READ2 to FILE\n"
-"  -t                   copy RG, BC and QT tags to the %s header line\n",
+"  -s FILE      write singleton reads designated READ1 or READ2 to FILE\n"
+"  -t           copy RG, BC and QT tags to the %s header line\n",
      fq ? "FASTQ" : "FASTA");
      fprintf(to,
-"  -T TAGLIST           copy arbitrary tags to the %s header line\n",
+"  -T TAGLIST   copy arbitrary tags to the %s header line\n",
      fq ? "FASTQ" : "FASTA");
      if (fq) fprintf(to,
-"  -v INT               default quality score if not given in file [1]\n"
-"  -i                   add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
-"  -c                   compression level [0..9] to use when creating gz or bgzf fastq files [1]\n"
-"  --i1 FILE            write first index reads to FILE\n"
-"  --i2 FILE            write second index reads to FILE\n"
-"  --barcode-tag TAG    Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
-"  --quality-tag TAG    Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
-"  --index-format STR   How to parse barcode and quality tags\n\n");
+"  -v INT       default quality score if not given in file [1]\n"
+"  -i           add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n"
+"  -c INT       compression level [0..9] to use when writing bgzf files [1]\n"
+"  --i1 FILE    write first index reads to FILE\n"
+"  --i2 FILE    write second index reads to FILE\n"
+"  --barcode-tag TAG\n"
+"               Barcode tag [" DEFAULT_BARCODE_TAG "]\n"
+"  --quality-tag TAG\n"
+"               Quality tag [" DEFAULT_QUALITY_TAG "]\n"
+"  --index-format STR\n"
+"               How to parse barcode and quality tags\n\n");
      sam_global_opt_help(to, "-.--.@-.");
      fprintf(to,
  "\n"
-"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n"
-"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n"
+"The files will be automatically compressed if the file names have a .gz\n"
+"or .bgzf extension.  The input to this program must be collated by name.\n"
+"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n"
  "\n"
  "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n"
  "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n"
-"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n"
-"or both unset.\n"
+"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n"
  "Run 'samtools flags' for more information on flag codes and meanings.\n");
      fprintf(to,
  "\n"
-"The index-format string describes how to parse the barcode and quality tags, for example:\n"
-"   i14i8       the first 14 characters are index 1, the next 8 characters are index 2\n"
-"   n8i14       ignore the first 8 characters, and use the next 14 characters for index 1\n"
-"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
-"'read until the separator or end of tag', for example:\n"
-"   n*i*        ignore the left part of the tag until the separator, then use the second part\n"
-"               of the tag as index 1\n");
+"The index-format string describes how to parse the barcode and quality tags.\n"
+"It is made up of 'i' or 'n' followed by a length or '*'.  For example:\n"
+"   i14i8       The first 14 characters are index 1, the next 8 are index 2\n"
+"   n8i14       Ignore the first 8 characters, and use the next 14 for index 1\n\n"
+"If the tag contains a separator, then the numeric part can be replaced with\n"
+"'*' to mean 'read until the separator or end of tag', for example:\n"
+"   i*i*        Break the tag at the separator into index 1 and index 2\n"
+"   n*i*        Ignore the left part of the tag until the separator,\n"
+"               then use the second part of the tag as index 1\n");
      fprintf(to,
  "\n"
  "Examples:\n"
-" To get just the paired reads in separate files, use:\n"
-"   samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n"
-"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n"
+"To get just the paired reads in separate files, use:\n"
+"   samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n"
+"\nTo get all non-supplementary/secondary reads in a single file, redirect\n"
+"the output:\n"
  "   samtools %s in.bam > all_reads.%s\n",
              command, fq ? "fq" : "fa", fq ? "fq" : "fa",
              command, fq ? "fq" : "fa");
@@ -146,96 +148,20 @@ typedef struct bam2fq_opts {
  
  typedef struct bam2fq_state {
      samFile *fp;
-    BGZF *fpse;
-    BGZF *fpr[3];
-    BGZF *fpi[2];
-    BGZF *hsamtools_stdout;
+    samFile *fpse;
+    samFile *fpr[3];
+    samFile *fpi[3];
+    samFile *hsamtools_stdout;
      sam_hdr_t *h;
      bool has12, use_oq, copy_tags, illumina_tag;
      int flag_on, flag_off, flag_alloff;
      fastfile filetype;
      int def_qual;
-    klist_t(ktaglist) *taglist;
      char *index_sequence;
      char compression_level;
      htsThreadPool p;
  } bam2fq_state_t;
  
-/*
- * Get and decode the read from a BAM record.
- *
- * TODO: htslib really needs an interface for this.  Consider this or perhaps
- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
- * functions as string formatted equivalents to bam_get_{seq,qual}?
- */
-
-/*
- * Reverse a string in place.
- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
- */
-static char *reverse(char *str)
-{
-    int i = strlen(str)-1,j=0;
-    char ch;
-    while (i>j) {
-        ch = str[i];
-        str[i]= str[j];
-        str[j] = ch;
-        i--;
-        j++;
-    }
-    return str;
-}
-
-/* return the read, reverse complemented if necessary */
-static char *get_read(const bam1_t *rec)
-{
-    int len = rec->core.l_qseq + 1;
-    char *read = calloc(1, len);
-    char *seq = (char *)bam_get_seq(rec);
-    int n;
-
-    if (!read) return NULL;
-
-    for (n=0; n < rec->core.l_qseq; n++) {
-        if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
-        else                               read[n] = seq_nt16_str[bam_seqi(seq,n)];
-    }
-    if (rec->core.flag & BAM_FREVERSE) reverse(read);
-    return read;
-}
-
-/*
- * get and decode the quality from a BAM record
- */
-static int get_quality(const bam1_t *rec, char **qual_out)
-{
-    char *quality = calloc(1, rec->core.l_qseq + 1);
-    char *q = (char *)bam_get_qual(rec);
-    int n;
-
-    if (!quality) return -1;
-
-    if (*q == '\xff') {
-        free(quality);
-        *qual_out = NULL;
-        return 0;
-    }
-
-    for (n=0; n < rec->core.l_qseq; n++) {
-        quality[n] = q[n]+33;
-    }
-    if (rec->core.flag & BAM_FREVERSE) reverse(quality);
-    *qual_out = quality;
-    return 0;
-}
-
-//
-// End of htslib complaints
-//
-
-
  static readpart which_readpart(const bam1_t *b)
  {
      if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
@@ -247,290 +173,8 @@ static readpart which_readpart(const bam1_t *b)
      }
  }
  
-/*
- * parse the length part from the index-format string
- */
-static int getLength(char **s)
-{
-    int n = 0;
-    while (**s) {
-        if (**s == '*') { n=-1; (*s)++; break; }
-        if ( !isdigit(**s)) break;
-        n = n*10 + ((**s)-'0');
-        (*s)++;
-    }
-    return n;
-}
-
-static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf)
-{
-    uint8_t *s = bam_aux_get(rec, tag);
-    if (s) {
-        char aux_type = *s;
-        switch (aux_type) {
-            case 'C':
-            case 'S': aux_type = 'I'; break;
-            case 'c':
-            case 's': aux_type = 'i'; break;
-            case 'd': aux_type = 'f'; break;
-        }
-
-        // Ensure space.  Need 6 chars + length of tag.  Max length of
-        // i is 16, A is 21, B currently 26, Z is unknown, so
-        // have to check that one later.
-        if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false;
-
-        kputc('\t', linebuf);
-        kputsn(tag, 2, linebuf);
-        kputc(':', linebuf);
-        kputc(aux_type=='I'? 'i': aux_type, linebuf);
-        kputc(':', linebuf);
-        switch (aux_type) {
-            case 'H':
-            case 'Z':
-                if (kputs(bam_aux2Z(s), linebuf) < 0) return false;
-                break;
-            case 'i': kputw(bam_aux2i(s), linebuf); break;
-            case 'I': kputuw(bam_aux2i(s), linebuf); break;
-            case 'A': kputc(bam_aux2A(s), linebuf); break;
-            case 'f': kputd(bam_aux2f(s), linebuf); break;
-            case 'B': kputs("*** Unhandled aux type ***", linebuf); return false;
-            default:  kputs("*** Unknown aux type ***", linebuf); return false;
-       }
-    }
-    return true;
-}
-
-static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec)
-{
-    if (!index_sequence) return 0;
-
-    kstring_t new = {0,0,NULL};
-    if (linebuf->s) {
-        char *s = strchr(linebuf->s, '\n');
-        if (s) {
-            if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0)
-                return -1;
-            *s = 0;
-            kputs(linebuf->s, &new);
-            kputc(' ', &new);
-            readpart readpart = which_readpart(rec);
-            if (readpart == READ_1) kputc('1', &new);
-            else if (readpart == READ_2) kputc('2', &new);
-            else kputc('0', &new);
-
-            kputc(':', &new);
-            if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new);
-            else                              kputc('N', &new);
-
-            kputs(":0:", &new);
-            kputs(index_sequence, &new);
-            kputc('\n', &new);
-            kputs(s+1, &new);
-            free(ks_release(linebuf));
-            linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m;
-        }
-    }
-    return 0;
-}
-
-static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
-{
-    int i;
-
-    linebuf->l = 0;
-    // Write read name
-    if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false;
-    if (kputs(bam_get_qname(rec), linebuf) < 0) return false;
-    // Add the /1 /2 if requested
-    if (state->has12) {
-        readpart readpart = which_readpart(rec);
-        if (readpart == READ_1) {
-            if (kputs("/1", linebuf) < 0) return false;
-        } else if (readpart == READ_2) {
-            if (kputs("/2", linebuf) < 0) return false;
-        }
-    }
-    if (state->copy_tags) {
-        for (i = 0; copied_tags[i]; ++i) {
-            if (!copy_tag(copied_tags[i], rec, linebuf)) {
-                fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
-                return false;
-            }
-        }
-    }
-
-    if (state->taglist->size) {
-        kliter_t(ktaglist) *p;
-        for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) {
-            if (!copy_tag(kl_val(p), rec, linebuf)) {
-                fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s);
-                return false;
-            }
-        }
-    }
-
-    if (kputc('\n', linebuf) < 0) return false;
-    if (kputs(seq, linebuf) < 0) return false;
-    if (kputc('\n', linebuf) < 0) return false;
-
-    if (state->filetype == FASTQ) {
-        // Write quality
-        if (kputs("+\n", linebuf) < 0) return false;
-        if (qual && *qual) {
-            if (kputs(qual, linebuf) < 0) return false;
-        } else {
-            int len = strlen(seq);
-            if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false;
-            for (i = 0; i < len; ++i) {
-                kputc(33 + state->def_qual, linebuf);
-            }
-        }
-        if (kputc('\n', linebuf) < 0) return false;
-    }
-    return true;
-}
-
-/*
- * Create FASTQ lines from the barcode tag using the index-format
- */
-static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts)
-{
-    uint8_t *p;
-    char *ifmt = opts->index_format;
-    char *tag = NULL;
-    char *qual = NULL;
-    char *sub_tag = NULL;
-    char *sub_qual = NULL;
-    size_t tag_len;
-    int file_number = 0;
-    kstring_t linebuf = { 0, 0, NULL }; // Buffer
-
-    if (!ifmt) return true;
-
-    // read barcode tag
-    p = bam_aux_get(rec,opts->barcode_tag);
-    if (p) tag = bam_aux2Z(p);
-
-    if (!tag) return true; // there is no tag
-
-    tag_len = strlen(tag);
-    sub_tag = calloc(1, tag_len + 1);
-    if (!sub_tag) goto fail;
-    sub_qual = calloc(1, tag_len + 1);
-    if (!sub_qual) goto fail;
-
-    // read quality tag
-    p = bam_aux_get(rec, opts->quality_tag);
-    if (p) qual = bam_aux2Z(p);
-
-    // Parse the index-format string
-    while (*ifmt) {
-        if (file_number > 1) break;     // shouldn't happen if we've validated paramaters correctly
-        char action = *ifmt;        // should be 'i' or 'n'
-        ifmt++; // skip over action
-        int index_len = getLength(&ifmt);
-        int n = 0;
-
-        if (index_len < 0) {
-            // read until separator
-            while (isalpha(*tag)) {
-                sub_tag[n] = *tag++;
-                if (qual) sub_qual[n] = *qual++;
-                n++;
-            }
-            if (*tag) { // skip separator
-                tag++;
-                if (qual) qual++;
-            }
-        } else {
-            // read index_len characters
-            while (index_len-- && *tag) {
-                sub_tag[n] = *tag++;
-                if (qual) sub_qual[n] = *qual++;
-                n++;
-            }
-        }
-        sub_tag[n] = '\0';
-        sub_qual[n] = '\0';
-
-        if (action=='i' && *sub_tag) {
-            if (state->index_sequence) {
-                char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2);
-                if (!new_index_sequence) goto fail;
-                state->index_sequence = new_index_sequence;
-                strcat(state->index_sequence, INDEX_SEPARATOR);
-                strcat(state->index_sequence, sub_tag);
-            } else {
-                state->index_sequence = strdup(sub_tag);    // we're going to need this later...
-            }
-            if (!state->index_sequence) goto fail;
-            if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail;
-            if (state->illumina_tag) {
-                if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) {
-                    goto fail;
-                }
-            }
-            if (state->fpi[file_number]) {
-                if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0)
-                    goto fail;
-            }
-        }
-
-    }
-
-    free(sub_qual); free(sub_tag);
-    free(linebuf.s);
-    return true;
-
- fail:
-    perror(__func__);
-    free(sub_qual); free(sub_tag);
-    free(linebuf.s);
-    return false;
-}
-
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
-{
-    int32_t qlen = b->core.l_qseq;
-    assert(qlen >= 0);
-    const uint8_t *oq = NULL;
-    char *qual = NULL;
-
-    char *seq = get_read(b);
-    if (!seq) return false;
-
-    if (state->use_oq) oq = bam_aux_get(b, "OQ");
-    if (oq && *oq=='Z') {
-        qual = strdup(bam_aux2Z(oq));
-        if (!qual) goto fail;
-        if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
-            reverse(qual);
-        }
-    } else {
-        if (get_quality(b, &qual) < 0) goto fail;
-    }
-
-    if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail;
-
-    free(qual);
-    free(seq);
-    return true;
-
- fail:
-    free(seq);
-    free(qual);
-    return false;
-}
-
  static void free_opts(bam2fq_opts_t *opts)
  {
-    free(opts->barcode_tag);
-    free(opts->quality_tag);
-    free(opts->index_format);
-    free(opts->extra_tags);
      free(opts);
  }
  
@@ -568,13 +212,14 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
          {"quality-tag", required_argument, NULL, 'q'},
          { NULL, 0, NULL, 0 }
      };
-    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) {
+    while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:",
+                            lopts, NULL)) > 0) {
          switch (c) {
-            case 'b': opts->barcode_tag = strdup(optarg); break;
-            case 'q': opts->quality_tag = strdup(optarg); break;
+            case 'b': opts->barcode_tag = optarg; break;
+            case 'q': opts->quality_tag = optarg; break;
              case  1 : opts->index_file[0] = optarg; break;
              case  2 : opts->index_file[1] = optarg; break;
-            case  3 : opts->index_format = strdup(optarg); break;
+            case  3 : opts->index_format = optarg; break;
              case '0': opts->fnr[0] = optarg; break;
              case '1': opts->fnr[1] = optarg; break;
              case '2': opts->fnr[2] = optarg; break;
@@ -585,7 +230,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
                      flag_off_set = 1;
                      opts->flag_off = 0;
                  }
-                opts->flag_off |= strtol(optarg, 0, 0); break;
+                opts->flag_off |= strtol(optarg, 0, 0);
+                break;
              case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
              case 'n': opts->has12 = false; break;
              case 'N': opts->has12always = true; break;
@@ -593,13 +239,25 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
              case 's': opts->fnse = optarg; break;
              case 't': opts->copy_tags = true; break;
              case 'i': opts->illumina_tag = true; break;
-            case 'c': opts->compression_level = atoi(optarg); break;
-            case 'T': opts->extra_tags = strdup(optarg); break;
+            case 'c':
+                opts->compression_level = atoi(optarg);
+                if (opts->compression_level < 0)
+                    opts->compression_level = 0;
+                if (opts->compression_level > 9)
+                    opts->compression_level = 9;
+                break;
+            case 'T': opts->extra_tags = optarg; break;
              case 'v': opts->def_qual = atoi(optarg); break;
-            case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false;
+
+            case '?':
+                bam2fq_usage(samtools_stderr, argv[0]);
+                free_opts(opts);
+                return false;
              default:
                  if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
-                    bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false;
+                    bam2fq_usage(samtools_stderr, argv[0]);
+                    free_opts(opts);
+                    return false;
                  }
                  break;
          }
@@ -608,8 +266,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
      if (opts->has12always) opts->has12 = true;
  
-    if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
-    if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+    if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG;
+    if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG;
  
      int nIndex = 0;
      if (opts->index_format) {
@@ -654,7 +312,8 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      }
  
      const char* type_str = argv[0];
-    if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) {
+    if (strcasecmp("fastq", type_str) == 0 ||
+        strcasecmp("bam2fq", type_str) == 0) {
          opts->filetype = FASTQ;
      } else if (strcasecmp("fasta", type_str) == 0) {
          opts->filetype = FASTA;
@@ -682,34 +341,61 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
      return true;
  }
  
-static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp)
-{
-    char mode[4] = "w";
-    size_t len = strlen(filename);
-
-    mode[2] = 0; mode[3] = 0;
-    if (len > 3 && strstr(filename + (len - 3),".gz")) {
-        mode[1] = 'g'; mode[2] = c+'0';
-    } else if ((len > 4 && strstr(filename + (len - 4),".bgz"))
-               || (len > 5 && strstr(filename + (len - 5),".bgzf"))) {
-        mode[1] = c+'0';
-    } else {
-        mode[1] = 'u';
+void set_sam_opts(samFile *fp, bam2fq_state_t *state,
+                  const bam2fq_opts_t *opts) {
+    if (state->has12)
+        hts_set_opt(fp, FASTQ_OPT_RNUM, 1);
+
+    if (state->illumina_tag)
+        hts_set_opt(fp, FASTQ_OPT_CASAVA, 1);
+
+    hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
+
+    kstring_t tag_list = {0,0};
+    if (state->copy_tags)
+        kputs("RG,BC,QT", &tag_list);
+    if (opts->extra_tags) {
+        if (tag_list.l)
+            kputc(',', &tag_list);
+        kputs(opts->extra_tags, &tag_list);
      }
+    if (tag_list.l)
+        hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+    ks_free(&tag_list);
+}
  
-    BGZF *fp = bgzf_open(filename,mode);
+// Open a file as normal or gzipped based on filename.
+// Note we always use bgzf and don't bother to attempt non-blocked
+// gzip streams.  This is a departure from the old fastq code.
+static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) {
+    char modez[6];
+    strcpy(modez, mode);
+
+    size_t l = strlen(fn);
+    if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) ||
+        (l > 4 && strcmp(fn+l-4, ".bgz") == 0) ||
+        (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) {
+        char m[3] = {'z', state->compression_level+'0', '\0'};
+        strcat(modez, m);
+    }
+
+    samFile *fp = sam_open(fn, modez);
      if (!fp)
-        return fp;
-    if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) {
-        bgzf_close(fp);
          return NULL;
-    }
+
+    if (state->p.pool)
+        hts_set_thread_pool(fp, &state->p);
+
      return fp;
  }
  
  static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
  {
+    char *mode = opts->filetype == FASTA ? "wF" : "wf";
+
      bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
+    if (!state)
+        return false;
      state->flag_on = opts->flag_on;
      state->flag_off = opts->flag_off;
      state->flag_alloff = opts->flag_alloff;
@@ -723,22 +409,6 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
      state->hsamtools_stdout = NULL;
      state->compression_level = opts->compression_level;
  
-    state->taglist = kl_init(ktaglist);
-    if (opts->extra_tags) {
-        char *save_p;
-        char *s = strtok_r(opts->extra_tags, ",", &save_p);
-        while (s) {
-            if (strlen(s) != 2) {
-                fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s);
-                free(state);
-                return false;
-            }
-            char **et = kl_pushp(ktaglist, state->taglist);
-            *et = s;
-            s = strtok_r(NULL, ",", &save_p);
-        }
-    }
-
      state->fp = sam_open(opts->fn_input, "r");
      if (state->fp == NULL) {
          print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input);
@@ -770,12 +440,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
          return false;
      }
      if (opts->fnse) {
-        state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p);
-        if (state->fpse == NULL) {
-            print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse);
+        if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) {
+            print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse);
              free(state);
              return false;
          }
+        set_sam_opts(state->fpse, state, opts);
      }
  
      if (opts->ga.reference) {
@@ -786,6 +456,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
          }
      }
  
+    // single, read1, read2
      int i, j;
      for (i = 0; i < 3; ++i) {
          if (opts->fnr[i]) {
@@ -793,28 +464,30 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
                  if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0)
                      break;
              if (j == i) {
-                state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p);
-                if (state->fpr[i] == NULL) {
-                    print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"",
+                if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) {
+                    print_error_errno("bam2fq", "Cannot open r%d file \"%s\"",
                                        i, opts->fnr[i]);
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->fpr[i], state, opts);
              } else {
                  state->fpr[i] = state->fpr[j];
              }
          } else {
              if (!state->hsamtools_stdout) {
-                state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu");
-                if (!state->hsamtools_stdout) {
+                if (!(state->hsamtools_stdout = sam_open_z("-", mode, state))) {
                      print_error_errno("bam2fq", "Cannot open STDOUT");
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->hsamtools_stdout, state, opts);
              }
              state->fpr[i] = state->hsamtools_stdout;
          }
      }
+
+    // index 1, index 2
      for (i = 0; i < 2; i++) {
          state->fpi[i] = NULL;
          if (opts->index_file[i]) {
@@ -825,13 +498,14 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
                  if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0)
                      break;
              if (i == j) {
-                state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p);
-                if (state->fpi[i] == NULL) {
-                    print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"",
+                if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode,
+                                                 state))) {
+                    print_error_errno("bam2fq", "Cannot open i%d file \"%s\"",
                                        i+1, opts->index_file[i]);
                      free(state);
                      return false;
                  }
+                set_sam_opts(state->fpi[i], state, opts);
              } else if (j < 0) {
                  state->fpi[i] = state->fpr[j+3];
              } else {
@@ -856,21 +530,25 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
      bool valid = true;
      sam_hdr_destroy(state->h);
      check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status);
-    if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; }
+    if (state->fpse && sam_close(state->fpse) < 0) {
+        print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse);
+        valid = false;
+    }
+
      int i, j;
      for (i = 0; i < 3; ++i) {
          if (state->fpr[i] != state->hsamtools_stdout) {
              for (j = 0; j < i; j++)
                  if (state->fpr[i] == state->fpr[j])
                      break;
-            if (j == i && bgzf_close(state->fpr[i])) {
+            if (j == i && sam_close(state->fpr[i])) {
                  print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]);
                  valid = false;
              }
          }
      }
      if (state->hsamtools_stdout) {
-        if (bgzf_close(state->hsamtools_stdout)) {
+        if (sam_close(state->hsamtools_stdout) < 0) {
              print_error_errno("bam2fq", "Error closing STDOUT");
              valid = false;
          }
@@ -882,12 +560,11 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int*
          for (j -= 3; j >= 0 && j < i; j++)
              if (state->fpi[i] == state->fpi[j])
                  break;
-        if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) {
+        if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) {
              print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
              valid = false;
          }
      }
-    kl_destroy(ktaglist,state->taglist);
      free(state->index_sequence);
      if (state->p.pool)
          hts_tpool_destroy(state->p.pool);
@@ -903,135 +580,300 @@ static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state)
  
  }
  
+int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state,
+                    bam2fq_opts_t* opts, char *seq, int seq_len,
+                    char *qual, int qual_len) {
+    if (!fp || !b || !seq_len)
+        return 0;
+
+    int ret = -1;
+    bam1_t *b2 = bam_init1(); // FIXME: reuse
+    if (!b2)
+        return -1;
+
+    size_t aux_len = b->data + b->l_data - bam_get_aux(b);
+    if (bam_set1(b2, b->core.l_qname, bam_get_qname(b),
+                 (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE,
+                 -1, -1, 0,    // refid, pos, mapq
+                 0, NULL,      // cigar
+                 -1, -1, 0,    // rnext, pnext, tlen
+                 seq_len, seq, qual,
+                 aux_len) < 0)
+        goto err;
+
+    uint8_t *q = bam_get_qual(b2);
+    if (qual) {
+        int i;
+        for (i = 0; i < seq_len; i++)
+            q[i] -= '!';
+    } else {
+        memset(q, opts->def_qual, seq_len);
+    }
+
+    memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len);
+    b2->l_data += aux_len;
+    if (sam_write1(fp, state->h, b2) < 0)
+        goto err;
+
+    ret = 0;
+ err:
+    if (b2)
+        bam_destroy1(b2);
+    return ret;
+}
+
+int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
+                 bam2fq_opts_t* opts) {
+    bam1_t *b[2] = {b1, b2};
+
+    char *ifmt = opts->index_format;
+    if (!ifmt)
+        ifmt = "i*i*";
+
+    // Get seq / qual elements
+    char *bc = NULL, *qt = NULL;
+    if (b1)
+        bc = (char *)bam_aux_get(b1, opts->barcode_tag);
+    if (b2 && !bc)
+        bc = (char *)bam_aux_get(b2, opts->barcode_tag);
+    if (!bc)
+        return 0;
+    else
+        bc++; // skip Z
+
+    if (b1)
+        qt = (char *)bam_aux_get(b1, opts->quality_tag);
+    if (b2 && !qt)
+        qt = (char *)bam_aux_get(b2, opts->quality_tag);
+    if (qt && strlen(bc) != strlen(qt)-1)
+        qt = NULL;
+    else if (qt)
+        qt++;
+
+    int inum = 0;
+    while (inum < 2) {
+        char fc = *ifmt++;
+        if (!fc)
+            break; // ran out of index-format
+
+        long len, rem = 0;
+        if (isdigit(*ifmt)) {
+            rem = len = strtol(ifmt, &ifmt, 10);
+        } else {
+            ifmt++;
+            len = 0;
+        }
+
+        char *bc_end = bc, *qt_end = qt;
+        while (len ? *bc_end && rem-- : isalpha(*bc_end))
+            bc_end++, qt_end += qt != NULL;
+
+        switch (fc) {
+        case 'n':
+            // skip
+            bc = bc_end + (len==0);
+            if (qt)
+                qt = qt_end + (len==0);
+            break;
+
+        case 'i':
+            if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+                                bc, bc_end-bc, qt, qt_end-qt) < 0)
+                return -1;
+            bc = bc_end + (len==0);
+            if (qt)
+                qt = qt_end + (len==0);
+            inum++;
+            break;
+
+        default:
+            fprintf(samtools_stderr, "Unknown index-format code\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts,
+                     bam1_t *b[4], int score[3], int best[3],
+                     int64_t *n_singletons) {
+    // Paired data, with 1 or 2 ends present.
+    if (score[1] > 0 && score[2] > 0) {
+        // If CASAVA tag is required and barcode is only on R1,
+        // copy it to R2
+        if (state->illumina_tag) {
+            char *tag;
+            if ((tag = (char *)bam_aux_get(b[best[1]],
+                                           opts->barcode_tag)))
+                if (bam_aux_update_str(b[best[2]],
+                                       opts->barcode_tag,
+                                       strlen(tag), tag+1) < 0)
+                    goto err;
+            if ((tag = (char *)bam_aux_get(b[best[1]],
+                                           opts->quality_tag)))
+                if (bam_aux_update_str(b[best[2]],
+                                       opts->quality_tag,
+                                       strlen(tag), tag+1) < 0)
+                    goto err;
+
+        }
+        if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+            goto err;
+        if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+            goto err;
+
+        if (output_index(b[best[1]], b[best[2]], state, opts) < 0)
+            goto err;
+    } else if (score[1] > 0 || score[2] > 0) {
+        if (state->fpse) {
+            // print whichever one exists to fpse
+            if (score[1] > 0) {
+                if (sam_write1(state->fpse, state->h, b[best[1]]) < 0)
+                    goto err;
+            } else {
+                if (sam_write1(state->fpse, state->h, b[best[2]]) < 0)
+                    goto err;
+            }
+            ++(*n_singletons);
+        } else {
+            if (score[1] > 0) {
+                if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0)
+                    goto err;
+            } else {
+                if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0)
+                    goto err;
+            }
+        }
+
+        if (output_index(score[1] > 0 ? b[best[1]] : NULL,
+                         score[2] > 0 ? b[best[2]] : NULL,
+                         state, opts) < 0)
+            goto err;
+    }
+
+    if (score[0]) { // single ended data (neither READ1 nor READ2)
+        if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0)
+            goto err;
+
+        if (output_index(b[best[0]], NULL, state, opts) < 0)
+            goto err;
+    }
+
+    return 0;
+
+ err:
+    return -1;
+}
+
  static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
  {
      int n;
-    bam1_t *records[3] = {NULL, NULL, NULL};
      char *current_qname = NULL;
      int64_t n_reads = 0, n_singletons = 0; // Statistics
-    kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}};
      int score[3];
      int at_eof;
-    bool valid = true;
-    bam1_t* b = NULL;
+    bool valid = false;
+    int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices;
+                                // indexed by [readpart]
+    bam1_t *b[4];               // 3 readparts, plus current record
  
-    while (true) {
-        if (!b)
-            b = bam_init1();
-        if (b == NULL) {
+    for (n = 0; n < 4; n++) {
+        if (!(b[n] = bam_init1())) {
              perror("[bam2fq_mainloop] Malloc error for bam record buffer.");
-            valid = false;
-            break;
+            return false;
          }
-        int res = sam_read1(state->fp, state->h, b);
+    }
+
+    n = 0;
+    while (true) {
+        int res = sam_read1(state->fp, state->h, b[n]);
          if (res < -1) {
              fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
-            valid = false;
-            break;
+            goto err;
          }
          at_eof = res < 0;
  
-        if (!at_eof && filter_it_out(b, state))
+        if (!at_eof && filter_it_out(b[n], state))
              continue;
-        if (!at_eof) ++n_reads;
-
-        if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) {
-            if (current_qname) {
-                if (state->illumina_tag) {
-                    for (n=0; valid && n<3; n++) {
-                        if (!records[n]) continue;
-                        if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false;
-                    }
-                    if (!valid) break;
-                }
-                free(state->index_sequence); state->index_sequence = NULL;
-                if (score[1] > 0 && score[2] > 0) {
-                    // print linebuf[1] to fpr[1], linebuf[2] to fpr[2]
-                    if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                    if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                } else if (score[1] > 0 || score[2] > 0) {
-                    if (state->fpse) {
-                        // print whichever one exists to fpse
-                        if (score[1] > 0) {
-                            if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                        } else {
-                            if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                        }
-                        ++n_singletons;
-                    } else {
-                        if (score[1] > 0) {
-                            if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; }
-                        } else {
-                            if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; }
-                        }
-                    }
-                }
-                if (score[0]) { // TODO: check this
-                    // print linebuf[0] to fpr[0]
-                    if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; }
-                }
+        if (!at_eof) {
+            ++n_reads;
+
+            // Handle -O option: use OQ for qual
+            uint8_t *oq;
+            if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') {
+                int i, l = strlen((char *)++oq);
+                uint8_t *qual = bam_get_qual(b[n]);
+                for (i = 0; i < l && i < b[n]->core.l_qseq; i++)
+                    qual[i] = oq[i] - '!';
              }
+        }
  
+        if (at_eof
+            || !current_qname
+            || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) {
+            // New name, so flush best examples of previous name.
+            if (current_qname)
+                if (flush_rec(state, opts, b, score, best, &n_singletons) < 0)
+                    goto err;
  
-            free(current_qname); current_qname = NULL;
+            current_qname = bam_get_qname(b[n]);
              score[0] = score[1] = score[2] = 0;
-            for (n=0; n < 3; n++) {
-                bam_destroy1(records[n]); records[n]=NULL;
-            }
  
              if (at_eof) { break; }
-
-            current_qname = strdup(bam_get_qname(b));
-            if (!current_qname) { valid = false; break; }
          }
  
          // Prefer a copy of the read that has base qualities
-        int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1;
-        readpart rp = which_readpart(b);
-        if (b_score > score[rp]) {
-            if (!tags2fq(b, state, opts)) { valid = false; break; }
-            if (records[rp]) bam_destroy1(records[rp]);
-            records[rp] = b;
+        int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1;
+        readpart rp = which_readpart(b[n]);
+        if (score[rp] < b_score) {
              score[rp] = b_score;
-            b = NULL;
-            if(!bam1_to_fq(records[rp], &linebuf[rp], state)) {
-                fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__);
-                valid = false; break;
-            }
+            // Record b[n] slot for best copy of readpair and find a new
+            // slot for next bam read
+            best[rp] = n;
+            int used_slot[4] = {0}, i;
+            for (i = 0; i < 3; i++)
+                if (best[i] >= 0)
+                    used_slot[best[i]] = 1;
+            for (i = 0; i < 4 && used_slot[i]; i++)
+                ;
+            n = i;
          }
      }
+
+    valid = true;
+ err:
      if (!valid)
-    {
-        perror("[bam2fq_mainloop] Error writing to FASTx files.");
-    }
-    bam_destroy1(b);
-    for (n=0; n < 3; n++) {
-        bam_destroy1(records[n]);
-    }
-    free(current_qname);
-    free(linebuf[0].s);
-    free(linebuf[1].s);
-    free(linebuf[2].s);
-    fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons);
-    fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads);
+        print_error_errno("bam2fq", "Error writing to FASTx files.");
+
+    for (n = 0; n < 4; n++)
+        bam_destroy1(b[n]);
+
+    fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n",
+            __func__, n_singletons);
+    fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n",
+            __func__, n_reads);
  
      return valid;
  }
  
  int main_bam2fq(int argc, char *argv[])
  {
-    int status = EXIT_SUCCESS;
+    int status = EXIT_FAILURE;
      bam2fq_opts_t* opts = NULL;
      bam2fq_state_t* state = NULL;
  
      bool valid = parse_opts(argc, argv, &opts);
      if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE;
  
-    if (!init_state(opts, &state)) return EXIT_FAILURE;
+    if (!init_state(opts, &state)) goto err;
+
+    if (!bam2fq_mainloop(state,opts)) goto err;
  
-    if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
+    if (!destroy_state(opts, state, &status)) goto err;
  
-    if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
+    status = EXIT_SUCCESS;
+ err:
      sam_global_args_free(&opts->ga);
      free_opts(opts);
  
diff --git a/samtools/bam_flags.c b/samtools/bam_flags.c

index 11a82b6eb377ba28c8dd6170fb60ac732b841db3..78312eee789db3136dae39de5609aaeac1924705 100644 (file)
--- a/samtools/bam_flags.c
+++ b/samtools/bam_flags.c
@@ -1,6 +1,6 @@
  /*  bam_flags.c -- flags subcommand.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -32,38 +32,54 @@ DEALINGS IN THE SOFTWARE.  */
  #include <unistd.h>
  #include <stdarg.h>
  #include <htslib/sam.h>
+#include "samtools.h"
  
-static void usage(void)
+static void usage(FILE *fp)
  {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "About: Convert between textual and numeric flag representation\n");
-    fprintf(stderr, "Usage: samtools flags INT|STR[,...]\n");
-    fprintf(stderr, "\n");
-    fprintf(stderr, "Flags:\n");
-    fprintf(stderr, "\t0x%x\tPAIRED        .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
-    fprintf(stderr, "\t0x%x\tPROPER_PAIR   .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
-    fprintf(stderr, "\t0x%x\tUNMAP         .. segment unmapped\n", BAM_FUNMAP);
-    fprintf(stderr, "\t0x%x\tMUNMAP        .. next segment in the template unmapped\n", BAM_FMUNMAP);
-    fprintf(stderr, "\t0x%x\tREVERSE       .. SEQ is reverse complemented\n", BAM_FREVERSE);
-    fprintf(stderr, "\t0x%x\tMREVERSE      .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
-    fprintf(stderr, "\t0x%x\tREAD1         .. the first segment in the template\n", BAM_FREAD1);
-    fprintf(stderr, "\t0x%x\tREAD2         .. the last segment in the template\n", BAM_FREAD2);
-    fprintf(stderr, "\t0x%x\tSECONDARY     .. secondary alignment\n", BAM_FSECONDARY);
-    fprintf(stderr, "\t0x%x\tQCFAIL        .. not passing quality controls\n", BAM_FQCFAIL);
-    fprintf(stderr, "\t0x%x\tDUP           .. PCR or optical duplicate\n", BAM_FDUP);
-    fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
-    fprintf(stderr, "\n");
+    static const struct { int bit; const char *desc; } *fl, flags[] = {
+        { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" },
+        { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" },
+        { BAM_FUNMAP, "segment unmapped" },
+        { BAM_FMUNMAP, "next segment in the template unmapped" },
+        { BAM_FREVERSE, "SEQ is reverse complemented" },
+        { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" },
+        { BAM_FREAD1, "the first segment in the template" },
+        { BAM_FREAD2, "the last segment in the template" },
+        { BAM_FSECONDARY, "secondary alignment" },
+        { BAM_FQCFAIL, "not passing quality controls or other filters" },
+        { BAM_FDUP, "PCR or optical duplicate" },
+        { BAM_FSUPPLEMENTARY, "supplementary alignment" },
+        { 0, NULL }
+    };
+
+    fprintf(fp,
+"About: Convert between textual and numeric flag representation\n"
+"Usage: samtools flags FLAGS...\n"
+"\n"
+"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n"
+"a combination of the following numeric flag values, or a comma-separated string\n"
+"NAME,...,NAME representing a combination of the following flag names:\n"
+"\n");
+    for (fl = flags; fl->desc; fl++) {
+        char *name = bam_flag2str(fl->bit);
+        fprintf(fp, "%#6x %5d  %-15s%s\n", fl->bit, fl->bit, name, fl->desc);
+        free(name);
+    }
  }
  
  
  int main_flags(int argc, char *argv[])
  {
-    if ( argc!=2 ) usage();
-    else
+    if ( argc < 2 ) { usage(stdout); return 0; }
+
+    int i;
+    for (i = 1; i < argc; i++)
      {
-        int mask = bam_str2flag(argv[1]);
-        if ( mask<0 ) { fprintf(stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
-        printf("0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
+        int mask = bam_str2flag(argv[i]);
+        if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(stderr); return 1; }
+        char *str = bam_flag2str(mask);
+        printf("0x%x\t%d\t%s\n", mask, mask, str);
+        free(str);
      }
      return 0;
  }
diff --git a/samtools/bam_flags.c.pysam.c b/samtools/bam_flags.c.pysam.c

index 9c6424f63b0685ddf09b90807d4bb916eff3b6f4..b3a9d29d4770b4d8bfa0c3f4f5cf906234dfc66a 100644 (file)
--- a/samtools/bam_flags.c.pysam.c
+++ b/samtools/bam_flags.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_flags.c -- flags subcommand.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -34,38 +34,54 @@ DEALINGS IN THE SOFTWARE.  */
  #include <unistd.h>
  #include <stdarg.h>
  #include <htslib/sam.h>
+#include "samtools.h"
  
-static void usage(void)
+static void usage(FILE *fp) // print the "samtools flags" help text to fp
  {
-    fprintf(samtools_stderr, "\n");
-    fprintf(samtools_stderr, "About: Convert between textual and numeric flag representation\n");
-    fprintf(samtools_stderr, "Usage: samtools flags INT|STR[,...]\n");
-    fprintf(samtools_stderr, "\n");
-    fprintf(samtools_stderr, "Flags:\n");
-    fprintf(samtools_stderr, "\t0x%x\tPAIRED        .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED);
-    fprintf(samtools_stderr, "\t0x%x\tPROPER_PAIR   .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR);
-    fprintf(samtools_stderr, "\t0x%x\tUNMAP         .. segment unmapped\n", BAM_FUNMAP);
-    fprintf(samtools_stderr, "\t0x%x\tMUNMAP        .. next segment in the template unmapped\n", BAM_FMUNMAP);
-    fprintf(samtools_stderr, "\t0x%x\tREVERSE       .. SEQ is reverse complemented\n", BAM_FREVERSE);
-    fprintf(samtools_stderr, "\t0x%x\tMREVERSE      .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE);
-    fprintf(samtools_stderr, "\t0x%x\tREAD1         .. the first segment in the template\n", BAM_FREAD1);
-    fprintf(samtools_stderr, "\t0x%x\tREAD2         .. the last segment in the template\n", BAM_FREAD2);
-    fprintf(samtools_stderr, "\t0x%x\tSECONDARY     .. secondary alignment\n", BAM_FSECONDARY);
-    fprintf(samtools_stderr, "\t0x%x\tQCFAIL        .. not passing quality controls\n", BAM_FQCFAIL);
-    fprintf(samtools_stderr, "\t0x%x\tDUP           .. PCR or optical duplicate\n", BAM_FDUP);
-    fprintf(samtools_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY);
-    fprintf(samtools_stderr, "\n");
+    static const struct { int bit; const char *desc; } *fl, flags[] = { // listing table; the entry with a NULL desc ends it
+        { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" },
+        { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" },
+        { BAM_FUNMAP, "segment unmapped" },
+        { BAM_FMUNMAP, "next segment in the template unmapped" },
+        { BAM_FREVERSE, "SEQ is reverse complemented" },
+        { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" },
+        { BAM_FREAD1, "the first segment in the template" },
+        { BAM_FREAD2, "the last segment in the template" },
+        { BAM_FSECONDARY, "secondary alignment" },
+        { BAM_FQCFAIL, "not passing quality controls or other filters" },
+        { BAM_FDUP, "PCR or optical duplicate" },
+        { BAM_FSUPPLEMENTARY, "supplementary alignment" },
+        { 0, NULL }
+    };
+
+    fprintf(fp,
+"About: Convert between textual and numeric flag representation\n"
+"Usage: samtools flags FLAGS...\n"
+"\n"
+"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n"
+"a combination of the following numeric flag values, or a comma-separated string\n"
+"NAME,...,NAME representing a combination of the following flag names:\n"
+"\n");
+    for (fl = flags; fl->desc; fl++) { // one row per table entry
+        char *name = bam_flag2str(fl->bit); // freed after the row is printed
+        fprintf(fp, "%#6x %5d  %-15s%s\n", fl->bit, fl->bit, name, fl->desc); // hex value, decimal value, name, description
+        free(name);
+    }
  }
  
  
  int main_flags(int argc, char *argv[])
  {
-    if ( argc!=2 ) usage();
-    else
+    if ( argc < 2 ) { usage(samtools_stdout); return 0; } // no FLAGS argument: help goes to samtools_stdout, status 0
+
+    int i;
+    for (i = 1; i < argc; i++) // one output line per FLAGS argument
      {
-        int mask = bam_str2flag(argv[1]);
-        if ( mask<0 ) { fprintf(samtools_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; }
-        fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask));
+        int mask = bam_str2flag(argv[i]);
+        if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(samtools_stderr); return 1; }
+        char *str = bam_flag2str(mask); // released with free() once printed
+        fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, str);
+        free(str);
      }
      return 0;
  }
diff --git a/samtools/bam_import.c b/samtools/bam_import.c

new file mode 100644 (file)

index 0000000..daf6b17
--- /dev/null
+++ b/samtools/bam_import.c
@@ -0,0 +1,487 @@
+/* bam_import -- Import of FASTQ files.
+ *
+ *   samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq
+ *   samtools import a_1.fq a_2.fq
+ *   samtools import a_interleaved.fq
+ *
+ * Copyright (C) 2020 Genome Research Ltd.
+ *
+ * Author: James Bonfield <jkb@sanger.ac.uk>
+ */
+
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notices and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+// TODO: Store other non-aux comments; in new sam tag?
+
+#include <config.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
+
+#include "samtools.h"
+#include "sam_opts.h"
+
+static int usage(FILE *fp, int exit_status) { // Write the "samtools import" help text to fp; returns exit_status unchanged
+    fprintf(fp, "Usage: samtools import [options] [file.fastq ...]\n");
+    fprintf(fp, "\n");
+    fprintf(fp, "Options:\n");
+    fprintf(fp, "  -s FILE      Read paired-ended data from single FILE\n");
+    fprintf(fp, "  -0 FILE      Read single-ended data from FILE\n");
+    fprintf(fp, "  -1 FILE      Read-1 from FILE\n");
+    fprintf(fp, "  -2 FILE      Read-2 from FILE\n");
+    fprintf(fp, "  --i1 FILE    Index-1 from FILE\n");
+    fprintf(fp, "  --i2 FILE    Index-2 from FILE\n");
+    fprintf(fp, "  -i           Parse CASAVA identifier\n");
+    fprintf(fp, "  --barcode-tag TAG\n");
+    fprintf(fp, "               Tag to use with barcode sequences [BC]\n");
+    fprintf(fp, "  --quality-tag TAG\n");
+    fprintf(fp, "               Tag to use with barcode qualities [QT]\n");
+    fprintf(fp, "  -r STRING    Build up a complete @RG line\n");
+    fprintf(fp, "  -R STRING    Add a simple RG line of \"@RG\\tID:STRING\"\n");
+    fprintf(fp, "  -T TAGLIST   Parse tags in SAM format; list of '*' for all\n");
+    fprintf(fp, "  -o FILE      Output to FILE instead of stdout\n");
+    fprintf(fp, "  -u           Uncompressed output\n");
+    fprintf(fp, "  --order TAG  Store Nth record count in TAG\n");
+    fprintf(fp, "\n");
+    sam_global_opt_help(fp, "-.O.-@--"); // help for the shared global options (helper not defined in this file)
+
+    fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n");
+    fprintf(fp, "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n");
+
+    return exit_status; // lets callers write "return usage(fp, status);"
+}
+
+// Order matters here: files are opened and read in enum order, so index
+// reads are seen before the main sequences and each record is emitted fully annotated.
+enum fileno {
+    FQ_I1, FQ_I2, // index seqs for R1 and R2
+    FQ_R0,        // single file and unpaired data (single-ended tech).
+    FQ_R1, FQ_R2, // separate read1 and read2 files
+    FQ_SINGLE,    // single file, but with read1 and/or read2 present.
+    FQ_END        // number of input slots; sizes the per-file arrays
+};
+
+typedef struct { // Settings for one "samtools import" run, filled in by main_import()
+    sam_global_args ga; // shared global options: ga.in / ga.out formats, ga.nthreads
+    int no_pg;          // set by --no-PG
+    char *fn[FQ_END], *fn_out; // input filenames indexed by enum fileno (NULL = unused); output filename
+    int idx_both;      // add index to READ2 too, not just READ1
+    int casava;        // non-zero: enable FASTQ_OPT_CASAVA on every input
+    char *barcode_seq; // aux tag for index sequences; also passed as FASTQ_OPT_BARCODE
+    char *barcode_qual; // aux tag for index qualities
+    char *aux;         // tag list for FASTQ_OPT_AUX; "*" or "" is passed as NULL
+    char *rg;          // read-group ID from -R
+    char *rg_line;     // @RG header text built from -r
+    char *order;       // aux tag receiving the running record counter
+    int compress_level; // -1 = not set; otherwise appended as a digit to the output mode
+    htsThreadPool p;   // thread pool shared by all input and output files
+} opts_t;
+
+// Append the sequence and quality of BAM record b to s and q, building BC:Z and QT:Z style
+// aux strings ('-' joins sequences, ' ' joins qualities). Returns 0, or -1 if a resize fails.
+static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) {
+    char *sp, *qp;
+    if (ks_resize(s, s->l + b->core.l_qseq+1 +1) < 0) // room for separator, bases and NUL
+        return -1;
+    if (ks_resize(q, q->l + b->core.l_qseq+1 +1) < 0)
+        return -1;
+
+    sp = s->s + s->l - (s->l > 0); // a non-empty string's length counts its NUL: step back onto it
+    qp = q->s + q->l - (q->l > 0);
+
+    if (s->l)
+        *sp++ = '-'; // separator between successive index sequences
+
+    if (q->l)
+        *qp++ = ' '; // separator between successive quality strings
+
+    int i;
+    uint8_t *seq = bam_get_seq(b);
+    uint8_t *qual = bam_get_qual(b);
+    for (i = 0; i < b->core.l_qseq; i++) {
+        *sp++ = seq_nt16_str[bam_seqi(seq, i)]; // packed base code -> letter
+        *qp++ = qual[i] + '!'; // quality value offset by '!' to make printable text
+    }
+    *sp++ = 0;
+    *qp++ = 0;
+
+    s->l = sp - s->s; // length deliberately includes the terminating NUL
+    q->l = qp - q->s;
+
+    return 0;
+}
+
+static int import_fastq(int argc, char **argv, opts_t *opts) {
+    int i, n, ret = 0;
+    samFile *fp_in[FQ_END] = {NULL};
+    bam1_t *b = bam_init1();
+    int ids[FQ_END];
+    samFile *fp_out = NULL;
+    sam_hdr_t *hdr_out = NULL;
+    kstring_t index_str = {0,0};
+    kstring_t read_str = {0,0};
+    char *rg = opts->rg;
+    kstring_t rg_line = {0,0};
+    uint64_t read_num = 0;
+    kstring_t idx_seq  = {0};
+    kstring_t idx_qual = {0};
+
+    // Any additional arguments are assumed to be r1 r2, as a
+    // short cut. We support reading index tags out of those too (eg
+    // Illumina CASAVA format), but if we do that we lack the barcode
+    // quality string.
+    //
+    // We also consider a read name ending in /1 or /2 to be a single
+    // file containing interleaved fastq records for both ends.
+    // These will be labeled as fn[FQ_R1] but adjusted during reading.
+    if (argc == 1)
+        opts->fn[FQ_SINGLE] = argv[0];
+    else
+        for (i = 0; i < 4; i++)
+            if (argc > i)
+                opts->fn[FQ_R1+i] = argv[i];
+
+    // Open all files
+    for (i = n = 0; i < FQ_END; i++) {
+        if (!opts->fn[i])
+            continue;
+        fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in);
+        if (!fp_in[i]) {
+            perror(opts->fn[i]);
+            ret = -1;
+            goto err;
+        }
+        if (opts->p.pool)
+            hts_set_thread_pool(fp_in[i], &opts->p);
+        ids[n++] = i;
+
+        if (opts->casava)
+            hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
+        if (opts->barcode_seq) // for auto-CASAVA parsing
+            hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq);
+        if (opts->aux)
+            hts_set_opt(fp_in[i], FASTQ_OPT_AUX,
+                        *opts->aux == '*' || *opts->aux == '\0'
+                        ? NULL : opts->aux);
+
+        switch (i) {
+        case FQ_I1:
+            kputs("--i1 I1.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+        case FQ_I2:
+            kputs("--i2 I2.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+
+        case FQ_R0:
+            kputs("-0 unpaired.fastq ", &read_str);
+            break;
+
+        case FQ_R1:
+            kputs("-1 R1.fastq ", &read_str);
+            break;
+
+        case FQ_R2:
+            kputs("-2 R2.fastq ", &read_str);
+            break;
+
+        case FQ_SINGLE:
+            kputs("-N -o paired.fastq ", &read_str);
+            break;
+
+        default:
+            ks_clear(&read_str); // not reversible
+            kputs("", &read_str);
+        }
+    }
+    if (n == 0) {
+        bam_destroy1(b);
+        return usage(stdout, EXIT_SUCCESS);
+    }
+
+    char out_mode[10] = {'w', 0, 0};
+    if (opts->compress_level != -1)
+        out_mode[1] = '0' + opts->compress_level;
+    sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL);
+    fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out);
+    if (!fp_out) {
+        perror(opts->fn_out);
+        goto err;
+    }
+    if (opts->p.pool)
+        hts_set_thread_pool(fp_out, &opts->p);
+
+    // Create header
+    if (ks_len(&read_str)) {
+        char CO[2100];
+        if (ks_len(&index_str))
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s "
+                    "--index-format=\"%s\"\n",
+                    ks_str(&read_str), ks_str(&index_str));
+        else
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s\n",
+                    ks_str(&read_str));
+
+        hdr_out = sam_hdr_parse(strlen(CO), CO);
+    } else {
+        hdr_out = sam_hdr_init();
+    }
+
+    // Read group
+    if (opts->rg_line) {
+        if (*opts->rg_line != '@')
+            ksprintf(&rg_line, "@RG\t%s", opts->rg_line);
+        else
+            kputs(opts->rg_line, &rg_line);
+    } else if (opts->rg) {
+        ksprintf(&rg_line, "@RG\tID:%s", opts->rg);
+    }
+
+    if (ks_len(&rg_line)) {
+        if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0)
+            goto err;
+        rg = strstr(ks_str(&rg_line), "\tID:");
+        if (!rg) {
+            fprintf(stderr, "\"-r RG-LINE\" option contained no ID field\n");
+            goto err;
+        }
+        rg += 4;
+
+        i = 0;
+        while (rg[i] != '\t' && rg[i] != '\0')
+            i++;
+        rg[i] = 0;
+    }
+
+    if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0)
+        goto err;
+
+
+    // Interleave / combine from n files (ids[0..n-1]).
+    int res;
+    int eof = 0;
+    do {
+        idx_seq.l = idx_qual.l = 0;
+        for (i = 0; i < n; i++) {
+            if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) {
+                if (res == -1) {
+                    eof++;
+                    continue;
+                } else
+                    break;
+            }
+
+            // index
+            if (ids[i] == FQ_I1 || ids[i] == FQ_I2) {
+                if (append_index(&idx_seq, &idx_qual, b) < 0) {
+                    res = -1;
+                    break;
+                }
+                continue;
+            }
+
+            // full read
+            if (idx_seq.l) {
+                if (opts->idx_both || ids[i] == FQ_SINGLE ||
+                    ids[i] == FQ_R0 || ids[i] == FQ_R1) {
+                    if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l,
+                                       (uint8_t *)idx_seq.s) ||
+                        bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l,
+                                       (uint8_t *)idx_qual.s)) {
+                        res = -1;
+                        break;
+                    }
+                }
+            }
+
+            switch(ids[i]) {
+            case FQ_R0:
+                // unpaired; no flags to declare
+                break;
+            case FQ_SINGLE:
+                // paired (but don't know if R1 or R2) or unpaired.
+                // We rely on the /1 and /2 read suffix parsing in htslib
+                // to distinguish the two cases, or CASAVA tags if
+                // explicitly enabled.
+                break;
+            case FQ_R1:
+                if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0)
+                    b->core.flag |= BAM_FREAD1;
+                b->core.flag |= BAM_FPAIRED;
+                if (i+1 < n && ids[i+1] == FQ_R2)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            case FQ_R2:
+                b->core.flag |= BAM_FPAIRED | BAM_FREAD2;
+                if (i > 0 && ids[i-1] == FQ_R1)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            }
+
+            if (rg) {
+                if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1,
+                                   (uint8_t *)rg) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            if (opts->order) {
+                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            res = sam_write1(fp_out, hdr_out, b);
+        }
+    } while (res >= 0);
+
+    if (res != -1) {
+        print_error("import", "truncated file. Aborting");
+        ret = res;
+        goto err;
+    }
+
+    if (eof != n) {
+        print_error("import", "input files with differing number of records");
+        ret = -1;
+        goto err;
+    }
+
+    // Close and return
+    ret = 0;
+err:
+    bam_destroy1(b);
+    sam_hdr_destroy(hdr_out);
+    ks_free(&rg_line);
+    ks_free(&index_str);
+    ks_free(&read_str);
+    if (fp_out) {
+        if (sam_close(fp_out) < 0) {
+            perror(opts->fn_out);
+            ret |= -1;
+        }
+    }
+    for (i = 0; i < FQ_END; i++) {
+        if (fp_in[i] && sam_close(fp_in[i]) < 0) {
+            perror(opts->fn[i]);
+            ret |= -1;
+        }
+    }
+    ks_free(&idx_seq);
+    ks_free(&idx_qual);
+
+    return ret;
+}
+
+int main_import(int argc, char *argv[]) { // Entry point for "samtools import": parse options, then run import_fastq()
+    int c;
+    opts_t opts = {
+        .no_pg = 0,
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .fn = {NULL},
+        .fn_out = "-",
+        .casava = 0,
+        .barcode_seq = "BC",
+        .barcode_qual = "QT",
+        .aux = NULL,
+        .rg = NULL,
+        .rg_line = NULL,
+        .order = NULL,
+        .compress_level = -1, // -1: no explicit level is added to the output mode
+    };
+    kstring_t rg = {0}; // accumulates all -r fragments into one tab-separated @RG line
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'),
+        {"no-PG", no_argument, NULL, 9},
+        {"i1", required_argument, NULL, 1},
+        {"i2", required_argument, NULL, 2},
+        {"r1", required_argument, NULL, '1'},
+        {"r2", required_argument, NULL, '2'},
+        {"rg", required_argument, NULL, 'R'},
+        {"rg-line", required_argument, NULL, 'r'},
+        {"order", required_argument, NULL, 3},
+        {"barcode-tag", required_argument, NULL, 4},
+        {"quality-tag", required_argument, NULL, 5},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'b': opts.idx_both = 1; break;
+        case '0': opts.fn[FQ_R0] = optarg; break;
+        case '1': opts.fn[FQ_R1] = optarg; break;
+        case '2': opts.fn[FQ_R2] = optarg; break;
+        case  1:  opts.fn[FQ_I1] = optarg; break;
+        case  2:  opts.fn[FQ_I2] = optarg; break;
+        case 's': opts.fn[FQ_SINGLE] = optarg; break;
+        case 'o': opts.fn_out = optarg; break;
+        case 'i': opts.casava = 1; break;
+        case  4:  opts.barcode_seq = optarg; break;
+        case  5:  opts.barcode_qual = optarg; break;
+        case 'T': opts.aux = optarg; break;
+        case 'u': opts.compress_level = 0; break;
+        case 'R': opts.rg = optarg; break;
+        case 'r':
+            if (*optarg != '@' && ks_len(&rg) == 0) // first fragment without a leading '@': start with "@RG"
+                kputs("@RG", &rg);
+            if (ks_len(&rg)) // tab-separate this fragment from what is already there
+                kputc_('\t', &rg);
+            kputs(optarg, &rg);
+            opts.rg_line = rg.s; // refreshed after every append
+            break;
+
+        case 9: opts.no_pg = 1; break;
+        case 3: opts.order = optarg; break;
+
+        case 'h': return usage(stdout, EXIT_SUCCESS); // NOTE(review): this and the two usage returns below skip freeing rg.s
+        case '?': return usage(stderr, EXIT_FAILURE);
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0)
+                return usage(stderr, EXIT_FAILURE);
+            break;
+        }
+    }
+
+    if (opts.ga.nthreads > 0) {
+        if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) {
+            fprintf(stderr, "Failed to create thread pool\n");
+            if (rg.s)
+                free(rg.s);
+            return -1;;
+        }
+    }
+
+    int ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0; // positional args are FASTQ files; any non-zero result becomes 1
+
+    if (rg.s)
+        free(rg.s);
+
+    if (opts.p.pool)
+        hts_tpool_destroy(opts.p.pool);
+
+    return ret;
+}
diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c

new file mode 100644 (file)

index 0000000..1307ac6
--- /dev/null
+++ b/samtools/bam_import.c.pysam.c
@@ -0,0 +1,489 @@
+#include "samtools.pysam.h"
+
+/* bam_import -- Import of FASTQ files.
+ *
+ *   samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq
+ *   samtools import a_1.fq a_2.fq
+ *   samtools import a_interleaved.fq
+ *
+ * Copyright (C) 2020 Genome Research Ltd.
+ *
+ * Author: James Bonfield <jkb@sanger.ac.uk>
+ */
+
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notices and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+// TODO: Store other non-aux comments; in new sam tag?
+
+#include <config.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
+
+#include "samtools.h"
+#include "sam_opts.h"
+
+static int usage(FILE *fp, int exit_status) { // Write the "samtools import" help text to fp; returns exit_status unchanged
+    fprintf(fp, "Usage: samtools import [options] [file.fastq ...]\n");
+    fprintf(fp, "\n");
+    fprintf(fp, "Options:\n");
+    fprintf(fp, "  -s FILE      Read paired-ended data from single FILE\n");
+    fprintf(fp, "  -0 FILE      Read single-ended data from FILE\n");
+    fprintf(fp, "  -1 FILE      Read-1 from FILE\n");
+    fprintf(fp, "  -2 FILE      Read-2 from FILE\n");
+    fprintf(fp, "  --i1 FILE    Index-1 from FILE\n");
+    fprintf(fp, "  --i2 FILE    Index-2 from FILE\n");
+    fprintf(fp, "  -i           Parse CASAVA identifier\n");
+    fprintf(fp, "  --barcode-tag TAG\n");
+    fprintf(fp, "               Tag to use with barcode sequences [BC]\n");
+    fprintf(fp, "  --quality-tag TAG\n");
+    fprintf(fp, "               Tag to use with barcode qualities [QT]\n");
+    fprintf(fp, "  -r STRING    Build up a complete @RG line\n");
+    fprintf(fp, "  -R STRING    Add a simple RG line of \"@RG\\tID:STRING\"\n");
+    fprintf(fp, "  -T TAGLIST   Parse tags in SAM format; list of '*' for all\n");
+    fprintf(fp, "  -o FILE      Output to FILE instead of stdout\n"); // user-facing word "stdout", not the samtools_stdout stream
+    fprintf(fp, "  -u           Uncompressed output\n");
+    fprintf(fp, "  --order TAG  Store Nth record count in TAG\n");
+    fprintf(fp, "\n");
+    sam_global_opt_help(fp, "-.O.-@--"); // help for the shared global options (helper not defined in this file)
+
+    fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n");
+    fprintf(fp, "file contents, and a pair of fastq files as \"-1 FILE1 -2 FILE2\".\n");
+
+    return exit_status; // lets callers write "return usage(fp, status);"
+}
+
+// Order matters here: files are opened and read in enum order, so index
+// reads are seen before the main sequences and each record is emitted fully annotated.
+enum fileno {
+    FQ_I1, FQ_I2, // index seqs for R1 and R2
+    FQ_R0,        // single file and unpaired data (single-ended tech).
+    FQ_R1, FQ_R2, // separate read1 and read2 files
+    FQ_SINGLE,    // single file, but with read1 and/or read2 present.
+    FQ_END        // number of input slots; sizes the per-file arrays
+};
+
+typedef struct { // Settings for one "samtools import" run
+    sam_global_args ga; // shared global options: ga.in / ga.out formats are used when opening files
+    int no_pg;          // corresponds to the --no-PG long option
+    char *fn[FQ_END], *fn_out; // input filenames indexed by enum fileno (NULL = unused); output filename
+    int idx_both;      // add index to READ2 too, not just READ1
+    int casava;        // non-zero: enable FASTQ_OPT_CASAVA on every input
+    char *barcode_seq; // aux tag for index sequences; also passed as FASTQ_OPT_BARCODE
+    char *barcode_qual; // aux tag for index qualities
+    char *aux;         // tag list for FASTQ_OPT_AUX; "*" or "" is passed as NULL
+    char *rg;          // read-group ID, used when rg_line is not set
+    char *rg_line;     // @RG header text; "@RG\t" is prefixed if it does not start with '@'
+    char *order;       // aux tag receiving the running record counter
+    int compress_level; // -1 = not set; otherwise appended as a digit to the output mode
+    htsThreadPool p;   // thread pool shared by all input and output files
+} opts_t;
+
+// Append the sequence and quality of BAM record b to s and q, building BC:Z and QT:Z style
+// aux strings ('-' joins sequences, ' ' joins qualities). Returns 0, or -1 if a resize fails.
+static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) {
+    char *sp, *qp;
+    if (ks_resize(s, s->l + b->core.l_qseq+1 +1) < 0) // room for separator, bases and NUL
+        return -1;
+    if (ks_resize(q, q->l + b->core.l_qseq+1 +1) < 0)
+        return -1;
+
+    sp = s->s + s->l - (s->l > 0); // a non-empty string's length counts its NUL: step back onto it
+    qp = q->s + q->l - (q->l > 0);
+
+    if (s->l)
+        *sp++ = '-'; // separator between successive index sequences
+
+    if (q->l)
+        *qp++ = ' '; // separator between successive quality strings
+
+    int i;
+    uint8_t *seq = bam_get_seq(b);
+    uint8_t *qual = bam_get_qual(b);
+    for (i = 0; i < b->core.l_qseq; i++) {
+        *sp++ = seq_nt16_str[bam_seqi(seq, i)]; // packed base code -> letter
+        *qp++ = qual[i] + '!'; // quality value offset by '!' to make printable text
+    }
+    *sp++ = 0;
+    *qp++ = 0;
+
+    s->l = sp - s->s; // length deliberately includes the terminating NUL
+    q->l = qp - q->s;
+
+    return 0;
+}
+
+static int import_fastq(int argc, char **argv, opts_t *opts) {
+    int i, n, ret = 0;
+    samFile *fp_in[FQ_END] = {NULL};
+    bam1_t *b = bam_init1();
+    int ids[FQ_END];
+    samFile *fp_out = NULL;
+    sam_hdr_t *hdr_out = NULL;
+    kstring_t index_str = {0,0};
+    kstring_t read_str = {0,0};
+    char *rg = opts->rg;
+    kstring_t rg_line = {0,0};
+    uint64_t read_num = 0;
+    kstring_t idx_seq  = {0};
+    kstring_t idx_qual = {0};
+
+    // Any additional arguments are assumed to be r1 r2, as a
+    // short cut. We support reading index tags out of those too (eg
+    // Illumina CASAVA format), but if we do that we lack the barcode
+    // quality string.
+    //
+    // We also consider a read name ending in /1 or /2 to be a single
+    // file containing interleaved fastq records for both ends.
+    // These will be labeled as fn[FQ_R1] but adjusted during reading.
+    if (argc == 1)
+        opts->fn[FQ_SINGLE] = argv[0];
+    else
+        for (i = 0; i < 4; i++)
+            if (argc > i)
+                opts->fn[FQ_R1+i] = argv[i];
+
+    // Open all files
+    for (i = n = 0; i < FQ_END; i++) {
+        if (!opts->fn[i])
+            continue;
+        fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in);
+        if (!fp_in[i]) {
+            perror(opts->fn[i]);
+            ret = -1;
+            goto err;
+        }
+        if (opts->p.pool)
+            hts_set_thread_pool(fp_in[i], &opts->p);
+        ids[n++] = i;
+
+        if (opts->casava)
+            hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
+        if (opts->barcode_seq) // for auto-CASAVA parsing
+            hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq);
+        if (opts->aux)
+            hts_set_opt(fp_in[i], FASTQ_OPT_AUX,
+                        *opts->aux == '*' || *opts->aux == '\0'
+                        ? NULL : opts->aux);
+
+        switch (i) {
+        case FQ_I1:
+            kputs("--i1 I1.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+        case FQ_I2:
+            kputs("--i2 I2.fastq ", &read_str);
+            kputs("i*", &index_str);
+            break;
+
+        case FQ_R0:
+            kputs("-0 unpaired.fastq ", &read_str);
+            break;
+
+        case FQ_R1:
+            kputs("-1 R1.fastq ", &read_str);
+            break;
+
+        case FQ_R2:
+            kputs("-2 R2.fastq ", &read_str);
+            break;
+
+        case FQ_SINGLE:
+            kputs("-N -o paired.fastq ", &read_str);
+            break;
+
+        default:
+            ks_clear(&read_str); // not reversible
+            kputs("", &read_str);
+        }
+    }
+    if (n == 0) {
+        bam_destroy1(b);
+        return usage(samtools_stdout, EXIT_SUCCESS);
+    }
+
+    char out_mode[10] = {'w', 0, 0};
+    if (opts->compress_level != -1)
+        out_mode[1] = '0' + opts->compress_level;
+    sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL);
+    fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out);
+    if (!fp_out) {
+        perror(opts->fn_out);
+        goto err;
+    }
+    if (opts->p.pool)
+        hts_set_thread_pool(fp_out, &opts->p);
+
+    // Create header
+    if (ks_len(&read_str)) {
+        char CO[2100];
+        if (ks_len(&index_str))
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s "
+                    "--index-format=\"%s\"\n",
+                    ks_str(&read_str), ks_str(&index_str));
+        else
+            snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s\n",
+                    ks_str(&read_str));
+
+        hdr_out = sam_hdr_parse(strlen(CO), CO);
+    } else {
+        hdr_out = sam_hdr_init();
+    }
+
+    // Read group
+    if (opts->rg_line) {
+        if (*opts->rg_line != '@')
+            ksprintf(&rg_line, "@RG\t%s", opts->rg_line);
+        else
+            kputs(opts->rg_line, &rg_line);
+    } else if (opts->rg) {
+        ksprintf(&rg_line, "@RG\tID:%s", opts->rg);
+    }
+
+    if (ks_len(&rg_line)) {
+        if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0)
+            goto err;
+        rg = strstr(ks_str(&rg_line), "\tID:");
+        if (!rg) {
+            fprintf(samtools_stderr, "\"-r RG-LINE\" option contained no ID field\n");
+            goto err;
+        }
+        rg += 4;
+
+        i = 0;
+        while (rg[i] != '\t' && rg[i] != '\0')
+            i++;
+        rg[i] = 0;
+    }
+
+    if ((ret = sam_hdr_write(fp_out, hdr_out)) < 0)
+        goto err;
+
+
+    // Interleave / combine from n files (ids[0..n-1]).
+    int res;
+    int eof = 0;
+    do {
+        idx_seq.l = idx_qual.l = 0;
+        for (i = 0; i < n; i++) {
+            if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) {
+                if (res == -1) {
+                    eof++;
+                    continue;
+                } else
+                    break;
+            }
+
+            // index
+            if (ids[i] == FQ_I1 || ids[i] == FQ_I2) {
+                if (append_index(&idx_seq, &idx_qual, b) < 0) {
+                    res = -1;
+                    break;
+                }
+                continue;
+            }
+
+            // full read
+            if (idx_seq.l) {
+                if (opts->idx_both || ids[i] == FQ_SINGLE ||
+                    ids[i] == FQ_R0 || ids[i] == FQ_R1) {
+                    if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l,
+                                       (uint8_t *)idx_seq.s) ||
+                        bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l,
+                                       (uint8_t *)idx_qual.s)) {
+                        res = -1;
+                        break;
+                    }
+                }
+            }
+
+            switch(ids[i]) {
+            case FQ_R0:
+                // unpaired; no flags to declare
+                break;
+            case FQ_SINGLE:
+                // paired (but don't know if R1 or R2) or unpaired.
+                // We rely on the /1 and /2 read suffix parsing in htslib
+                // to distinguish the two cases, or CASAVA tags if
+                // explicitly enabled.
+                break;
+            case FQ_R1:
+                if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0)
+                    b->core.flag |= BAM_FREAD1;
+                b->core.flag |= BAM_FPAIRED;
+                if (i+1 < n && ids[i+1] == FQ_R2)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            case FQ_R2:
+                b->core.flag |= BAM_FPAIRED | BAM_FREAD2;
+                if (i > 0 && ids[i-1] == FQ_R1)
+                    b->core.flag |= BAM_FMUNMAP;
+                break;
+            }
+
+            if (rg) {
+                if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1,
+                                   (uint8_t *)rg) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            if (opts->order) {
+                if (bam_aux_update_int(b, opts->order, read_num++) < 0) {
+                    ret = -1;
+                    goto err;
+                }
+            }
+
+            res = sam_write1(fp_out, hdr_out, b);
+        }
+    } while (res >= 0);
+
+    if (res != -1) {
+        print_error("import", "truncated file. Aborting");
+        ret = res;
+        goto err;
+    }
+
+    if (eof != n) {
+        print_error("import", "input files with differing number of records");
+        ret = -1;
+        goto err;
+    }
+
+    // Close and return
+    ret = 0;
+err:
+    bam_destroy1(b);
+    sam_hdr_destroy(hdr_out);
+    ks_free(&rg_line);
+    ks_free(&index_str);
+    ks_free(&read_str);
+    if (fp_out) {
+        if (sam_close(fp_out) < 0) {
+            perror(opts->fn_out);
+            ret |= -1;
+        }
+    }
+    for (i = 0; i < FQ_END; i++) {
+        if (fp_in[i] && sam_close(fp_in[i]) < 0) {
+            perror(opts->fn[i]);
+            ret |= -1;
+        }
+    }
+    ks_free(&idx_seq);
+    ks_free(&idx_qual);
+
+    return ret;
+}
+
+/* Entry point for "samtools import".
+
+   Parses the command line into an opts_t, creates a thread pool when more
+   than zero threads were requested, and hands the remaining (non-option)
+   arguments to import_fastq().
+
+   Returns 0 on success, 1 if import_fastq() reported a failure, -1 if the
+   thread pool could not be created, or the value returned by usage() for
+   -h, an unrecognised option or a rejected global option.
+
+   Every exit goes through the cleanup label so that the @RG header text
+   accumulated from -r options and the thread pool are always released. */
+int main_import(int argc, char *argv[]) {
+    int c, ret;
+    opts_t opts = {
+        .no_pg = 0,
+        .ga = SAM_GLOBAL_ARGS_INIT,
+        .fn = {NULL},
+        .fn_out = "-",
+        .casava = 0,
+        .barcode_seq = "BC",
+        .barcode_qual = "QT",
+        .aux = NULL,
+        .rg = NULL,
+        .rg_line = NULL,
+        .order = NULL,
+        .compress_level = -1,
+    };
+    kstring_t rg = {0};
+
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'),
+        {"no-PG", no_argument, NULL, 9},
+        {"i1", required_argument, NULL, 1},
+        {"i2", required_argument, NULL, 2},
+        {"r1", required_argument, NULL, '1'},
+        {"r2", required_argument, NULL, '2'},
+        {"rg", required_argument, NULL, 'R'},
+        {"rg-line", required_argument, NULL, 'r'},
+        {"order", required_argument, NULL, 3},
+        {"barcode-tag", required_argument, NULL, 4},
+        {"quality-tag", required_argument, NULL, 5},
+        { NULL, 0, NULL, 0 }
+    };
+
+    while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'b': opts.idx_both = 1; break;
+        case '0': opts.fn[FQ_R0] = optarg; break;
+        case '1': opts.fn[FQ_R1] = optarg; break;
+        case '2': opts.fn[FQ_R2] = optarg; break;
+        case  1:  opts.fn[FQ_I1] = optarg; break;
+        case  2:  opts.fn[FQ_I2] = optarg; break;
+        case 's': opts.fn[FQ_SINGLE] = optarg; break;
+        case 'o': opts.fn_out = optarg; break;
+        case 'i': opts.casava = 1; break;
+        case  4:  opts.barcode_seq = optarg; break;
+        case  5:  opts.barcode_qual = optarg; break;
+        case 'T': opts.aux = optarg; break;
+        case 'u': opts.compress_level = 0; break;
+        case 'R': opts.rg = optarg; break;
+        case 'r':
+            // Repeated -r options are joined with tabs into one header line;
+            // "@RG" is prepended unless the first one already starts with '@'.
+            if (*optarg != '@' && ks_len(&rg) == 0)
+                kputs("@RG", &rg);
+            if (ks_len(&rg))
+                kputc_('\t', &rg);
+            kputs(optarg, &rg);
+            opts.rg_line = rg.s;
+            break;
+
+        case 9: opts.no_pg = 1; break;
+        case 3: opts.order = optarg; break;
+
+        case 'h':
+            ret = usage(samtools_stdout, EXIT_SUCCESS);
+            goto cleanup;
+        case '?':
+            ret = usage(samtools_stderr, EXIT_FAILURE);
+            goto cleanup;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) {
+                ret = usage(samtools_stderr, EXIT_FAILURE);
+                goto cleanup;
+            }
+            break;
+        }
+    }
+
+    if (opts.ga.nthreads > 0) {
+        if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) {
+            fprintf(samtools_stderr, "Failed to create thread pool\n");
+            ret = -1;
+            goto cleanup;
+        }
+    }
+
+    ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0;
+
+cleanup:
+    // free(NULL) is a no-op, so this is safe when no -r option was given.
+    free(rg.s);
+
+    // opts.p is zero-initialised above, so pool is NULL on the early exits.
+    if (opts.p.pool)
+        hts_tpool_destroy(opts.p.pool);
+
+    return ret;
+}
diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c

index 5399da714cecd5c4a6cb00a955a7d47ab1d41f8d..7b2ee3e522849db9162d9f5379190c821dbc38a6 100644 (file)
--- a/samtools/bam_index.c.pysam.c
+++ b/samtools/bam_index.c.pysam.c
@@ -170,7 +170,7 @@ static void usage_exit(FILE *fp, int exit_status)
  {
      fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n");
      sam_global_opt_help(fp, "-.---@-.");
-    exit(exit_status);
+    samtools_exit(exit_status);
  }
  
  int bam_idxstats(int argc, char *argv[])
diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c

index 1619b5b32f37fd2c30834b3de665feece59ca9db..2da184f788913da68f2cfedadd4299c09e3f808b 100644 (file)
--- a/samtools/bam_markdup.c
+++ b/samtools/bam_markdup.c
@@ -1,7 +1,7 @@
  /*  bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
                       through fixmates with the mate scoring option on.
  
-    Copyright (C) 2017-2019 Genome Research Ltd.
+    Copyright (C) 2017-2020 Genome Research Ltd.
  
      Author: Andrew Whitwham <aw7@sanger.ac.uk>
  
@@ -62,6 +62,7 @@ typedef struct {
      int mode;
      int write_index;
      int include_fails;
+    int check_chain;
      char *stats_file;
      char *arg_list;
      char *out_fn;
@@ -83,6 +84,7 @@ typedef struct read_queue_s {
      bam1_t *b;
      struct read_queue_s *duplicate;
      hts_pos_t pos;
+    int dup_checked;
  } read_queue_t;
  
  typedef struct {
@@ -94,8 +96,23 @@ typedef struct {
      char type;
  } dup_map_t;
  
+/* Per-duplicate record built while walking a duplicate chain, holding the
+   values needed to compare duplicates against each other. */
+typedef struct {
+    bam1_t *b;              // the duplicate read this entry describes
+    int64_t score;          // calc_score() result for the read
+    int64_t mate_score;     // get_mate_score() result, filled in for paired reads
+    long x;                 // x coordinate parsed from the read name (-1 until set)
+    long y;                 // y coordinate parsed from the read name (-1 until set)
+    int opt;                // non-zero once the read is known to be an optical duplicate
+    int xpos;               // offset of the x coordinate within the read name
+} check_t;
  
  
+/* Growable array of check_t entries, reused from one duplicate chain to the next. */
+typedef struct {
+    check_t *c;         // entry storage, enlarged with realloc() when full
+    size_t size;        // number of entries allocated
+    size_t length;      // number of entries currently in use
+} check_list_t;
+
  static khint32_t do_hash(unsigned char *key, khint32_t len);
  
  static khint_t hash_key(key_data_t key) {
@@ -665,6 +682,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n
  }
  
  
+/* Get the position of the coordinates from the read name. */
  static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
      int sep = 0;
      int pos = 0;
@@ -693,6 +711,66 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp
      return sep;
  }
  
+
+/* Read the x and y coordinates out of an Illumina style read name.
+
+   On success the coordinates are stored in *x_coord and *y_coord, the offset
+   of the x coordinate within the name is stored in *xpos_out, and 0 is
+   returned.  On failure 1 is returned and none of the outputs are written;
+   *warnings is incremented and a message is printed for as long as the
+   count has not gone past BMD_WARNING_MAX. */
+static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
+    int ret = 1;    // failure until both coordinates have been read
+    int seps, xpos = 0, ypos = 0;
+    long x = 0, y = 0;
+    char *end;
+
+    seps = get_coordinate_positions(name, &xpos, &ypos);
+
+    /* The most current Illumina read format at time of writing is:
+       @machine:run:flowcell:lane:tile:x:y:UMI or
+       @machine:run:flowcell:lane:tile:x:y
+
+       Counting the separating colons gives us a quick format check.
+       Older name formats have fewer elements.
+    */
+
+    if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
+        (*warnings)++;
+
+        if (*warnings <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
+        }
+
+        return ret;
+    }
+
+    x = strtol(name + xpos, &end, 10);
+
+    // strtol() leaves end at the start of the text when it converted nothing
+    if ((name + xpos) == end) {
+        (*warnings)++;
+
+        if (*warnings <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+        }
+
+        return ret;
+    }
+
+    y = strtol(name + ypos, &end, 10);
+
+    if ((name + ypos) == end) {
+        (*warnings)++;
+
+        if (*warnings <= BMD_WARNING_MAX) {
+            fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
+        }
+
+        return ret;
+    }
+
+    // only publish the results once everything has been read successfully
+    *x_coord = x;
+    *y_coord = y;
+    *xpos_out = xpos;
+    ret = 0;
+
+    return ret;
+}
+
+
  /* Using the coordinates from the Illumina read name, see whether the duplicated read is
     close enough (set by max_dist) to the original to be counted as optical.*/
  
@@ -806,6 +884,59 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn
  }
  
  
+/* Using the coordinates from the Illumina read name, see whether the duplicated read is
+   close enough (set by max_dist) to the original to be counted as optical.
+
+   This function needs the values from the first read to be already calculated:
+   name is the original's read name, oxpos the offset of its x coordinate within
+   that name, and ox / oy its coordinates.
+
+   The duplicate's own coordinates and x offset are stored in c for later use, and
+   c->opt is set when the read turns out to be optical (it is never cleared here).
+
+   Returns 1 if dup is an optical duplicate of the original, otherwise 0.  0 is also
+   returned, with c left untouched, when the duplicate's name cannot be read. */
+
+static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
+    int ret = 0;
+    char *duplicate;
+    int dxpos = 0;
+    long dx, dy;
+
+    duplicate = bam_get_qname(dup);
+
+    if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) {
+        return ret;
+    }
+
+    // only the first oxpos - 1 characters of the two names are compared
+    if (strncmp(name, duplicate, oxpos - 1) == 0) {
+        // the initial parts match, look at the numbers
+        long xdiff, ydiff;
+
+        if (ox > dx) {
+            xdiff = ox - dx;
+        } else {
+            xdiff = dx - ox;
+        }
+
+        if (xdiff <= max_dist) {
+            // still might be optical
+
+            if (oy > dy) {
+                ydiff = oy - dy;
+            } else {
+                ydiff = dy - oy;
+            }
+
+            if (ydiff <= max_dist) ret = 1;
+        }
+    }
+
+    // recorded whether or not the read was optical
+    c->x = dx;
+    c->y = dy;
+    c->xpos = dxpos;
+
+    if (ret) {
+        c->opt = ret;
+    }
+
+    return ret;
+}
+
+
+/* Mark the read as a duplicate and update the duplicate hash (if needed) */
  static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
                             long *optical, long *warn) {
      char dup_type = 0;
@@ -814,7 +945,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
      dup->core.flag |= BAM_FDUP;
  
      if (param->tag) {
-        if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) {
+        if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
              fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
              return -1;
          }
@@ -822,12 +953,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
  
      if (param->opt_dist) { // mark optical duplicates
          if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
-            bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ");
+            bam_aux_update_str(dup, "dt", 3, "SQ");
              dup_type = 'O';
              (*optical)++;
          } else {
              // not an optical duplicate
-            bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB");
+            bam_aux_update_str(dup, "dt", 3, "LB");
          }
      }
  
@@ -853,17 +984,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
  }
  
  
+/* If the duplicate type has changed to optical then retag the read and update the duplicate hash. */
  static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
      int ret = 0;
-    uint8_t *data;
  
-    // remove any existing dt tag
-    if ((data = bam_aux_get(b, "dt")) != NULL) {
-        bam_aux_del(b, data);
-    }
-
-    if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) {
-        fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n");
+    if (bam_aux_update_str(b, "dt", 3, "SQ")) {
+        fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n");
          ret = -1;
      }
  
@@ -897,23 +1023,54 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash
  }
  
  
+/* Check all duplicates of the highest quality read (the "original") for consistency.  Also
+   pre-calculate any values for use in check_duplicate_chain later.
+   Returns 0 on success, >0 on coordinate reading error (program can continue) or
+   <0 on an error (program should not continue). */
+static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
+             check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
  
-/*
-    Where there is more than one duplicate go down the list and check for optical duplicates and change
-    do tags (where used) to point to original (non-duplicate) read.
-*/
-static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
-             long *warn, long *optical_single, long *optical_pair) {
      int ret = 0;
-    read_queue_t *current = ori->duplicate;
      char *ori_name = bam_get_qname(ori->b);
-    int have_original = !(ori->b->core.flag & BAM_FDUP);
-    int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP);
+    read_queue_t *current = ori->duplicate;
+    int xpos;
+    long x, y;
+
+    if (param->opt_dist) {
+        if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
+            return ret;
+        }
+    }
+
+    list->length = 0;
  
      while (current) {
-        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+        check_t *c;
+
+        if (list->length >= list->size) {
+            check_t *tmp;
+
+            list->size *= 2;
+
+            if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
+                fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n");
+                return -1;
+            }
+
+            list->c = tmp;
+        }
+
+        c = &list->c[list->length];
  
-        if (param->tag && have_original) {
+        c->b = current->b;
+        c->x = -1;
+        c->y = -1;
+        c->opt = 0;
+        c->score = 0;
+        c->mate_score = 0;
+        current->dup_checked = 1;
+
+        if (param->tag) {
              uint8_t *data;
  
              // at this stage all duplicates should have a do tag
@@ -923,10 +1080,8 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has
  
                  if (old_name) {
                      if (strcmp(old_name, ori_name) != 0) {
-                        bam_aux_del(current->b, data);
-
-                        if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) {
-                            fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
+                        if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
+                            fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n");
                              ret =  -1;
                              break;
                          }
@@ -940,118 +1095,226 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has
          }
  
          if (param->opt_dist) {
-            int is_cur_opt = 0, is_ori_opt = 0;
              uint8_t *data;
              char *dup_type;
+            int is_opt = 0;
+            int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
  
-            if ((data = bam_aux_get(ori->b, "dt"))) {
+            if ((data = bam_aux_get(current->b, "dt"))) {
                  if ((dup_type = bam_aux2Z(data))) {
                      if (strcmp(dup_type, "SQ") == 0) {
-                        is_ori_opt = 1;
+                        c->opt = 1;
                      }
                  }
              }
  
-            if ((data = bam_aux_get(current->b, "dt"))) {
-                if ((dup_type = bam_aux2Z(data))) {
-                    if (strcmp(dup_type, "SQ") == 0) {
-                        is_cur_opt = 1;
-                    }
+            // need to run this to get the duplicates x and y scores
+            is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
+
+            if (!c->opt && is_opt) {
+                if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+                    ret = -1;
+                    break;
                  }
+
+                c->opt = 1;
              }
  
-            if (!(is_ori_opt && is_cur_opt)) {
-                // if both are already optical duplicates there is no need to check again, otherwise...
+            c->score = calc_score(current->b);
  
-                if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) {
-                    // find out which one is the duplicate
-                    int is_cur_dup = 0;
+            if (current_paired) {
+                if ((c->mate_score = get_mate_score(current->b)) == -1) {
+                     fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+                     ret = -1;
+                     break;
+                }
+            }
+        }
  
-                    if (have_original) {
-                        // compared against an original, this is a dup.
-                        is_cur_dup = 1;
-                    } else if (ori_paired != current_paired) {
-                        if (!current_paired) {
-                            // current is single vs pair, this is a dup.
-                            is_cur_dup = 1;
-                        }
-                    } else {
-                        // do it by scores
-                        int64_t ori_score, curr_score;
+        current = current->duplicate;
+        list->length++;
+    }
  
-                        if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) {
-                            if (ori->b->core.flag & BAM_FQCFAIL) {
-                                ori_score  = 0;
-                                curr_score = 1;
-                            } else {
-                                ori_score  = 1;
-                                curr_score = 0;
-                            }
-                        } else {
-                            ori_score  = calc_score(ori->b);
-                            curr_score = calc_score(current->b);
-
-                            if (current_paired) {
-                                // they are pairs so add mate scores.
-                                int64_t mate_tmp;
-
-                                if ((mate_tmp = get_mate_score(ori->b)) == -1) {
-                                    fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
-                                    ret = -1;
-                                    break;
-                                } else {
-                                    ori_score += mate_tmp;
-                                }
+    return ret;
+}
  
-                                if ((mate_tmp = get_mate_score(current->b)) == -1) {
-                                    fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
-                                    ret = -1;
-                                    break;
-                                } else {
-                                    curr_score += mate_tmp;
-                                }
-                            }
-                        }
  
-                        if (ori_score == curr_score) {
-                            if (strcmp(bam_get_qname(current->b), ori_name) < 0) {
-                                curr_score++;
-                            } else {
-                                curr_score--;
-                            }
-                        }
+/* qsort() comparator ordering check_t entries by ascending x coordinate.
+   Returns a negative, zero or positive value as a sorts before, equal to
+   or after b. */
+static int xcoord_sort(const void *a, const void *b) {
+    const check_t *ac = (const check_t *) a;
+    const check_t *bc = (const check_t *) b;
  
-                        if (ori_score > curr_score) {
-                            is_cur_dup = 1;
-                        }
+    // Compare instead of subtracting: the coordinates are longs, so their
+    // difference can overflow or lose its sign when narrowed to int.
+    return (ac->x > bc->x) - (ac->x < bc->x);
+}
+
+
+/* Check all the duplicates against each other to see if they are optical duplicates. */
+static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
+             long *warn, long *optical_single, long *optical_pair) {
+    int ret = 0;
+    size_t curr = 0;
+
+    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+
+    while (curr < list->length - 1) {
+        check_t *current = &list->c[curr];
+        size_t count = curr;
+        char *cur_name = bam_get_qname(current->b);
+        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+
+        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
+            // while close enough along the x coordinate
+            check_t *chk = &list->c[count];
+
+            if (current->opt && chk->opt)
+                continue;
+
+            // if both are already optical duplicates there is no need to check again, otherwise...
+
+            long ydiff;
+
+            if (current->y > chk->y) {
+                ydiff = current->y - chk->y;
+            } else {
+                ydiff = chk->y - current->y;
+            }
+
+            if (ydiff > param->opt_dist)
+                continue;
+
+            // the numbers are right, check the names
+            if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0)
+                continue;
+
+            // optical duplicates
+            int chk_dup = 0;
+            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+
+            if (current_paired != chk_paired) {
+                if (!chk_paired) {
+                    // chk is single vs pair, this is a dup.
+                    chk_dup = 1;
+                }
+            } else {
+                // do it by scores
+                int64_t cur_score, chk_score;
+
+                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                    if (current->b->core.flag & BAM_FQCFAIL) {
+                        cur_score = 0;
+                        chk_score = 1;
+                    } else {
+                        cur_score = 1;
+                        chk_score = 0;
                      }
+                } else {
+                    cur_score = current->score;
+                    chk_score = chk->score;
  
-                    if (is_cur_dup) {
-                        // the current is the optical duplicate
-                        if (!is_cur_opt) { // only change if not already an optical duplicate
-                            if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                    if (current_paired) {
+                        // they are pairs so add mate scores.
+                        chk_score += chk->mate_score;
+                        cur_score += current->mate_score;
+                    }
+                }
+
+                if (cur_score == chk_score) {
+                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
+                        chk_score++;
                      } else {
-                        if (!is_ori_opt) {
-                            if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                        chk_score--;
                      }
                  }
+
+                if (cur_score > chk_score) {
+                    chk_dup = 1;
+                }
+            }
+
+            if (chk_dup) {
+                // the duplicate is the optical duplicate
+                if (!chk->opt) { // only change if not already an optical duplicate
+                    if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    chk->opt = 1;
+                }
+            } else {
+                if (!current->opt) {
+                    if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    current->opt = 1;
+                }
              }
          }
  
-        current = current->duplicate;
+        curr++;
+    }
+
+ fail:
+    return ret;
+}
+
+
+/* Where there is more than one duplicate go down the list and check for optical duplicates and change
+   do tags (where used) to point to original (non-duplicate) read.
+
+   Walks read_buffer from the front.  With check_range set, the walk stops at the first read that
+   is on prev_tid and starts within param->max_length of prev_coord; otherwise it stops at the
+   first entry that has no read name.  Each read that is not flagged as a duplicate but has a
+   duplicate chain attached has that chain checked and then detached.
+
+   Returns 0 on success or -1 on error.  A read name whose coordinates cannot be decoded is not
+   treated as an error; that chain is simply dropped. */
+static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+                                const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
+                                long *optical_pair, const int check_range) {
+    int ret = 0;
+    kliter_t(read_queue) *rq;
+
+    rq = kl_begin(read_buffer);
+
+    while (rq != kl_end(read_buffer)) {
+        read_queue_t *in_read = &kl_val(rq);
+
+        if (check_range) {
+            /* Just check against the moving window of reads based on coordinates and max read length. */
+            if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
+                break;
+            }
+        } else {
+            // this is the last set of results and the end entry will be blank
+            if (!bam_get_qname(in_read->b)) {
+                break;
+            }
+        }
+
+        if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
+
+            // check against the original for tagging and optical duplication
+            if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
+                if (ret < 0) { // real error
+                    ret = -1;
+                    break;
+                } else { // coordinate decoding error
+                    ret = 0;
+                    in_read->duplicate = NULL;
+                    // rq is deliberately not advanced: with the chain detached the next
+                    // pass fails the head-of-chain test above and moves on via kl_next()
+                    continue;
+                }
+            }
+
+            // check the rest of the duplicates against each other for optical duplication
+            if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
+                ret = -1;
+                break;
+            }
+
+            // chain has been dealt with, detach it so it is not processed again
+            in_read->duplicate = NULL;
+        }
+
+        rq = kl_next(rq);
      }
  
      return ret;
  }
  
+
  /*
    Function to use when estimating library size.
  
@@ -1080,30 +1343,29 @@ static inline double coverage_equation(double x, double c, double n) {
  
  
  /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
-static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) {
+static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
      unsigned long estimated_size = 0;
+    unsigned long non_optical_pairs = (paired_reads - optical) / 2;
+    unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
+    unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
  
-    read_pairs /= 2;
-    duplicate_pairs /= 2;
-
-    if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) {
-        unsigned long unique_pairs = read_pairs - duplicate_pairs;
+    if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
          double m = 1;
          double M = 100;
          int i;
  
-        if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) {
+        if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
              fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n");
              return  estimated_size;
          }
  
-        while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) {
+        while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
              M *= 10;
          }
  
          for (i = 0; i < 40; i++) {
              double r = (m + M) / 2;
-            double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs);
+            double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
  
              if (u > 0) {
                  m = r;
@@ -1119,7 +1381,7 @@ static unsigned long estimate_library_size(unsigned long read_pairs, unsigned lo
          fprintf(stderr, "[markdup] warning: unable to calculate estimated library size."
                          " Read pairs %ld should be greater than duplicate pairs %ld,"
                          " which should both be non zero.\n",
-                        read_pairs, duplicate_pairs);
+                        non_optical_pairs, duplicate_pairs);
      }
  
      return estimated_size;
@@ -1153,6 +1415,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      tmp_file_t temp;
      char *idx_fn = NULL;
      int exclude = 0;
+    check_list_t dup_list = {NULL, 0, 0};
  
      if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
          fprintf(stderr, "[markdup] out of memory\n");
@@ -1213,10 +1476,24 @@ static int bam_mark_duplicates(md_param_t *param) {
          goto fail;
      }
  
+    if (param->check_chain && !(param->tag || param->opt_dist))
+        param->check_chain = 0;
+
+    if (param->check_chain) {
+        dup_list.size = 128;
+        dup_list.c = NULL;
+
+        if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
+            fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+            goto fail;
+        }
+    }
+
      reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
      np_duplicate = np_opt_duplicate = 0;
  
      while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
+        int dup_checked = 0;
  
          // do some basic coordinate order checks
          if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
@@ -1231,6 +1508,8 @@ static int bam_mark_duplicates(md_param_t *param) {
          prev_tid   =  in_read->b->core.tid;
          in_read->pair_key.single   = 1;
          in_read->single_key.single = 0;
+        in_read->duplicate = NULL;
+        in_read->dup_checked = 0;
  
          reading++;
  
@@ -1257,7 +1536,7 @@ static int bam_mark_duplicates(md_param_t *param) {
          // read must not be secondary, supplementary, unmapped or (possibly) failed QC
          if (!(in_read->b->core.flag & exclude)) {
              examined++;
-            in_read->duplicate = NULL;
+
  
              // look at the pairs first
              if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
@@ -1300,17 +1579,15 @@ static int bam_mark_duplicates(md_param_t *param) {
                         // scores more than one read of the pair
                          bam1_t *dup = bp->p->b;
  
-                        in_read->duplicate = bp->p;
+                        if (param->check_chain)
+                            in_read->duplicate = bp->p;
+
                          bp->p = in_read;
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
                              goto fail;
  
                          single_dup++;
-
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-
                      }
                  } else {
                      fprintf(stderr, "[markdup] error: single hashing failure.\n");
@@ -1327,8 +1604,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                      in_read->pair_key = pair_key;
                  } else if (ret == 0) {
                      int64_t old_score, new_score, tie_add = 0;
-                    bam1_t *dup;
-                    int check_chain = 0;
+                    bam1_t *dup = NULL;
  
                      bp = &kh_val(pair_hash, k);
  
@@ -1369,29 +1645,48 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (new_score + tie_add > old_score) { // swap reads
                          dup = bp->p->b;
-                        in_read->duplicate = bp->p;
+
+                        if (param->check_chain) {
+
+                            if (in_read->duplicate) {
+                                read_queue_t *current = in_read->duplicate;
+
+                                while (current->duplicate) {
+                                    current = current->duplicate;
+                                }
+
+                                current->duplicate = bp->p;
+                            } else {
+                                in_read->duplicate = bp->p;
+                            }
+                        }
+
                          bp->p = in_read;
                      } else {
-                        if (bp->p->duplicate) {
-                            in_read->duplicate = bp->p->duplicate;
-                            check_chain = 1;
+                        if (param->check_chain) {
+                            if (bp->p->duplicate) {
+                                if (in_read->duplicate) {
+                                    read_queue_t *current = bp->p->duplicate;
+
+                                    while (current->duplicate) {
+                                        current = current->duplicate;
+                                    }
+
+                                    current->duplicate = in_read->duplicate;
+                                }
+
+                                in_read->duplicate = bp->p->duplicate;
+                            }
+
+                            bp->p->duplicate = in_read;
                          }
  
-                        bp->p->duplicate = in_read;
                          dup = in_read->b;
                      }
  
                      if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
                          goto fail;
  
-                    if (check_chain) {
-                        if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-                    }
-
-                    if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                        goto fail;
-
                      duplicate++;
                  } else {
                      fprintf(stderr, "[markdup] error: pair hashing failure.\n");
@@ -1401,7 +1696,6 @@ static int bam_mark_duplicates(md_param_t *param) {
                  int ret;
                  key_data_t single_key;
                  in_hash_t *bp;
-                int check_chain = 0;
  
                  make_single_key(&single_key, in_read->b);
  
@@ -1420,29 +1714,20 @@ static int bam_mark_duplicates(md_param_t *param) {
                      if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
                          // if matched against one of a pair just mark as duplicate
  
-                        if (bp->p->duplicate) {
-                            in_read->duplicate = bp->p->duplicate;
-                            check_chain = 1;
-                        }
-
-                        bp->p->duplicate = in_read;
-
-                        if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
-                            goto fail;
+                        if (param->check_chain) {
+                            if (bp->p->duplicate) {
+                                in_read->duplicate = bp->p->duplicate;
+                            }
  
-                        if (check_chain) {
-                            // check the new duplicate entry in the chain
-                            if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                                    goto fail;
+                            bp->p->duplicate = in_read;
                          }
  
-                        // check against the new original
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
+                        if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
                              goto fail;
  
                      } else {
                          int64_t old_score, new_score;
-                        bam1_t *dup;
+                        bam1_t *dup = NULL;
  
                          old_score = calc_score(bp->p->b);
                          new_score = calc_score(in_read->b);
@@ -1451,32 +1736,26 @@ static int bam_mark_duplicates(md_param_t *param) {
                          // to the single hash and mark the other as duplicate
                          if (new_score > old_score) { // swap reads
                              dup = bp->p->b;
-                            in_read->duplicate = bp->p;
+
+                            if (param->check_chain)
+                                in_read->duplicate = bp->p;
+
                              bp->p = in_read;
                          } else {
-                            if (bp->p->duplicate) {
-                                in_read->duplicate = bp->p->duplicate;
-                                check_chain = 1;
+                            if (param->check_chain) {
+                                if (bp->p->duplicate) {
+                                    in_read->duplicate = bp->p->duplicate;
+                                }
+
+                                bp->p->duplicate = in_read;
                              }
  
-                            bp->p->duplicate = in_read;
                              dup = in_read->b;
                          }
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
                              goto fail;
-
-
-                        if (check_chain) {
-                            if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                                goto fail;
-                        }
-
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-
-
-                        }
+                    }
  
                      single_dup++;
                  } else {
@@ -1500,6 +1779,22 @@ static int bam_mark_duplicates(md_param_t *param) {
                  break;
              }
  
+            if (!dup_checked && param->check_chain) {
+                // check for multiple optical duplicates of the same original read
+
+                if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
+                    fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
+                    goto fail;
+                }
+
+                dup_checked = 1;
+            }
+
+
+            if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
+                break;
+            }
+
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
@@ -1550,6 +1845,14 @@ static int bam_mark_duplicates(md_param_t *param) {
          goto fail;
      }
  
+    // one last check
+    if (param->tag || param->opt_dist) {
+        if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
+            fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
+            goto fail;
+        }
+    }
+
      // write out the end of the list
      rq = kl_begin(read_buffer);
      while (rq != kl_end(read_buffer)) {
@@ -1606,7 +1909,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                      np_duplicate++;
  
                      if (param->tag && kh_val(dup_hash, k).name) {
-                        if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) {
+                        if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
                              fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
                              goto fail;
                          }
@@ -1614,10 +1917,10 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (param->opt_dist) {
                          if (kh_val(dup_hash, k).type) {
-                            bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ");
+                            bam_aux_update_str(b, "dt", 3, "SQ");
                              np_opt_duplicate++;
                          } else {
-                            bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB");
+                            bam_aux_update_str(b, "dt", 3, "LB");
                          }
                      }
                  }
@@ -1669,7 +1972,7 @@ static int bam_mark_duplicates(md_param_t *param) {
              fp = stderr;
          }
  
-        els = estimate_library_size(pair, duplicate - optical);
+        els = estimate_library_size(pair, duplicate, optical);
  
          fprintf(fp,
                  "COMMAND: %s\n"
@@ -1703,6 +2006,9 @@ static int bam_mark_duplicates(md_param_t *param) {
          }
      }
  
+    if (param->check_chain && (param->tag || param->opt_dist))
+        free(dup_list.c);
+
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
      kl_destroy(read_queue, read_buffer);
@@ -1723,6 +2029,9 @@ static int bam_mark_duplicates(md_param_t *param) {
      }
      kh_destroy(duplicates, dup_hash);
  
+    if (param->check_chain && (param->tag || param->opt_dist))
+        free(dup_list.c);
+
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
      sam_hdr_destroy(header);
@@ -1745,8 +2054,11 @@ static int markdup_usage(void) {
      fprintf(stderr, "  -m --mode TYPE   Duplicate decision method for paired reads.\n"
                      "                   TYPE = t measure positions based on template start/end (default).\n"
                      "                          s measure positions based on sequence start.\n");
+    fprintf(stderr, "  -n               Reduce optical duplicate accuracy (faster results with many duplicates).\n");
+    fprintf(stderr, "  -u               Output uncompressed data\n");
      fprintf(stderr, "  --include-fails  Include quality check failed reads.\n");
      fprintf(stderr, "  --no-PG          Do not add a PG line\n");
+    fprintf(stderr, "  --no-multi-dup   Reduced duplicates of duplicates checking.\n");
      fprintf(stderr, "  -t               Mark primary duplicates with the name of the original in a \'do\' tag."
                                    " Mainly for information and debugging.\n");
  
@@ -1761,23 +2073,24 @@ static int markdup_usage(void) {
  
  int bam_markdup(int argc, char **argv) {
      int c, ret;
-    char wmode[3] = {'w', 'b', 0};
+    char wmode[4] = {'w', 'b', 0, 0};
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      htsThreadPool p = {NULL, 0};
      kstring_t tmpprefix = {0, 0, NULL};
      struct stat st;
      unsigned int t;
-    md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL};
+    md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
          {"include-fails", no_argument, NULL, 1001},
          {"no-PG", no_argument, NULL, 1002},
          {"mode", required_argument, NULL, 'm'},
+        {"no-multi-dup", no_argument, NULL, 1003},
          {NULL, 0, NULL, 0}
      };
  
-    while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
          switch (c) {
              case 'r': param.remove_dups = 1; break;
              case 'l': param.max_length = atoi(optarg); break;
@@ -1799,8 +2112,10 @@ int bam_markdup(int argc, char **argv) {
                  }
  
                  break;
+            case 'u': wmode[2] = '0'; break;
              case 1001: param.include_fails = 1; break;
              case 1002: param.no_pg = 1; break;
+            case 1003: param.check_chain = 0; break;
              default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
              /* else fall-through */
              case '?': return markdup_usage();
diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c

index bcb92430021f39b8cdb95ec355ecb6c313cf4422..7132687dc50011694a6cfc216dc3a8c6d30f9ca2 100644 (file)
--- a/samtools/bam_markdup.c.pysam.c
+++ b/samtools/bam_markdup.c.pysam.c
@@ -3,7 +3,7 @@
  /*  bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
                       through fixmates with the mate scoring option on.
  
-    Copyright (C) 2017-2019 Genome Research Ltd.
+    Copyright (C) 2017-2020 Genome Research Ltd.
  
      Author: Andrew Whitwham <aw7@sanger.ac.uk>
  
@@ -64,6 +64,7 @@ typedef struct {
      int mode;
      int write_index;
      int include_fails;
+    int check_chain;
      char *stats_file;
      char *arg_list;
      char *out_fn;
@@ -85,6 +86,7 @@ typedef struct read_queue_s {
      bam1_t *b;
      struct read_queue_s *duplicate;
      hts_pos_t pos;
+    int dup_checked;
  } read_queue_t;
  
  typedef struct {
@@ -96,8 +98,23 @@ typedef struct {
      char type;
  } dup_map_t;
  
+/* Values cached for one read of a duplicate chain so that the duplicates can
+   later be compared against each other. */
+typedef struct {
+    bam1_t *b;           // the duplicate read
+    int64_t score;       // calc_score() of the read, 0 until calculated
+    int64_t mate_score;  // get_mate_score() of the read, 0 unless calculated for a paired read
+    long x;              // x coordinate taken from the read name, -1 until read
+    long y;              // y coordinate taken from the read name, -1 until read
+    int opt;             // non-zero once the read is known to be tagged as an optical duplicate
+    int xpos;            // offset of the x coordinate within the read name
+} check_t;
  
  
+/* Growable array of check_t entries. */
+typedef struct {
+    check_t *c;     // the entries
+    size_t size;    // number of entries allocated for c
+    size_t length;  // number of entries currently in use
+} check_list_t;
+
  static khint32_t do_hash(unsigned char *key, khint32_t len);
  
  static khint_t hash_key(key_data_t key) {
@@ -667,6 +684,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n
  }
  
  
+/* Get the position of the coordinates from the read name. */
  static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) {
      int sep = 0;
      int pos = 0;
@@ -695,6 +713,66 @@ static inline int get_coordinate_positions(const char *qname, int *xpos, int *yp
      return sep;
  }
  
+
+/* Extract the x and y coordinates from an Illumina style read name.
+   On success the coordinates are stored in x_coord and y_coord, the offset of
+   the x coordinate within the name is stored in xpos_out and 0 is returned.
+   If the name cannot be deciphered the warnings count is incremented (a message
+   is printed while that count is no more than BMD_WARNING_MAX), the outputs are
+   left untouched and 1 is returned. */
+static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) {
+    int x_offset = 0, y_offset = 0;
+    long x_val, y_val;
+    char *stop;
+    int colons = get_coordinate_positions(name, &x_offset, &y_offset);
+
+    /* Illumina read names currently look like
+       @machine:run:flowcell:lane:tile:x:y:UMI or
+       @machine:run:flowcell:lane:tile:x:y
+       and older formats have fewer elements, so the number of separating
+       colons serves as a quick format check. */
+    switch (colons) {
+        case 3:
+        case 4:
+        case 6:
+        case 7:
+            break;
+
+        default:
+            if (++(*warnings) <= BMD_WARNING_MAX) {
+                fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name);
+            }
+
+            return 1;
+    }
+
+    x_val = strtol(name + x_offset, &stop, 10);
+
+    if (stop == name + x_offset) { // strtol consumed nothing
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+        }
+
+        return 1;
+    }
+
+    y_val = strtol(name + y_offset, &stop, 10);
+
+    if (stop == name + y_offset) { // strtol consumed nothing
+        if (++(*warnings) <= BMD_WARNING_MAX) {
+            fprintf(samtools_stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
+        }
+
+        return 1;
+    }
+
+    // only write the outputs once everything has been read successfully
+    *xpos_out = x_offset;
+    *x_coord = x_val;
+    *y_coord = y_val;
+
+    return 0;
+}
+
+
  /* Using the coordinates from the Illumina read name, see whether the duplicated read is
     close enough (set by max_dist) to the original to be counted as optical.*/
  
@@ -808,6 +886,59 @@ static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warn
  }
  
  
+/* Decide whether dup is an optical duplicate of an original read whose name
+   has already been processed: name is the original's read name, oxpos the
+   offset of its x coordinate within that name and ox, oy its coordinates.
+   The reads are optical duplicates when the first oxpos - 1 characters of both
+   names match and both coordinates are within max_dist of each other.
+
+   When the coordinates of dup can be read they are stored in c (x, y and xpos)
+   and c->opt is set if dup is an optical duplicate.
+   Returns 1 for an optical duplicate, otherwise 0 (also when the name of dup
+   cannot be deciphered, in which case c is left untouched). */
+
+static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) {
+    const char *dup_name = bam_get_qname(dup);
+    int dup_xpos = 0;
+    long dup_x, dup_y;
+    int is_optical = 0;
+
+    if (get_coordinates(dup_name, &dup_xpos, &dup_x, &dup_y, warnings) != 0) {
+        return 0;
+    }
+
+    // the leading parts of the names have to match before the numbers matter
+    if (strncmp(name, dup_name, oxpos - 1) == 0) {
+        const long x_gap = (ox > dup_x) ? ox - dup_x : dup_x - ox;
+
+        if (x_gap <= max_dist) {
+            // close enough on x, so y decides
+            const long y_gap = (oy > dup_y) ? oy - dup_y : dup_y - oy;
+
+            is_optical = (y_gap <= max_dist);
+        }
+    }
+
+    c->x = dup_x;
+    c->y = dup_y;
+    c->xpos = dup_xpos;
+
+    if (is_optical) {
+        c->opt = 1;
+    }
+
+    return is_optical;
+}
+
+
+/* Mark the read as a duplicate and update the duplicate hash (if needed) */
  static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
                             long *optical, long *warn) {
      char dup_type = 0;
@@ -816,7 +947,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
      dup->core.flag |= BAM_FDUP;
  
      if (param->tag) {
-        if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) {
+        if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
              fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n");
              return -1;
          }
@@ -824,12 +955,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
  
      if (param->opt_dist) { // mark optical duplicates
          if (optical_duplicate(ori, dup, param->opt_dist, warn)) {
-            bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ");
+            bam_aux_update_str(dup, "dt", 3, "SQ");
              dup_type = 'O';
              (*optical)++;
          } else {
              // not an optical duplicate
-            bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB");
+            bam_aux_update_str(dup, "dt", 3, "LB");
          }
      }
  
@@ -855,17 +986,12 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam
  }
  
  
+/* If the duplicate type has changed to optical then retag the read and update the duplicate hash. */
  static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
      int ret = 0;
-    uint8_t *data;
  
-    // remove any existing dt tag
-    if ((data = bam_aux_get(b, "dt")) != NULL) {
-        bam_aux_del(b, data);
-    }
-
-    if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) {
-        fprintf(samtools_stderr, "[markdup] error: unable to append 'dt' tag.\n");
+    if (bam_aux_update_str(b, "dt", 3, "SQ")) {
+        fprintf(samtools_stderr, "[markdup] error: unable to update 'dt' tag.\n");
          ret = -1;
      }
  
@@ -899,23 +1025,54 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash
  }
  
  
+/* Check all duplicates of the highest quality read (the "original") for consistency.  Also
+   pre-calculate any values for use in check_duplicate_chain later.
+   Returns 0 on success, >0 on coordinate reading error (program can continue) or
+   <0 on an error (program should not continue). */
+static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
+             check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
  
-/*
-    Where there is more than one duplicate go down the list and check for optical duplicates and change
-    do tags (where used) to point to original (non-duplicate) read.
-*/
-static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
-             long *warn, long *optical_single, long *optical_pair) {
      int ret = 0;
-    read_queue_t *current = ori->duplicate;
      char *ori_name = bam_get_qname(ori->b);
-    int have_original = !(ori->b->core.flag & BAM_FDUP);
-    int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP);
+    read_queue_t *current = ori->duplicate;
+    int xpos;
+    long x, y;
+
+    if (param->opt_dist) {
+        if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) {
+            return ret;
+        }
+    }
+
+    list->length = 0;
  
      while (current) {
-        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+        check_t *c;
+
+        if (list->length >= list->size) {
+            check_t *tmp;
+
+            list->size *= 2;
+
+            if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
+                fprintf(samtools_stderr, "[markdup] error: Unable to expand opt check list.\n");
+                return -1;
+            }
+
+            list->c = tmp;
+        }
+
+        c = &list->c[list->length];
  
-        if (param->tag && have_original) {
+        c->b = current->b;
+        c->x = -1;
+        c->y = -1;
+        c->opt = 0;
+        c->score = 0;
+        c->mate_score = 0;
+        current->dup_checked = 1;
+
+        if (param->tag) {
              uint8_t *data;
  
              // at this stage all duplicates should have a do tag
@@ -925,10 +1082,8 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has
  
                  if (old_name) {
                      if (strcmp(old_name, ori_name) != 0) {
-                        bam_aux_del(current->b, data);
-
-                        if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) {
-                            fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n");
+                        if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
+                            fprintf(samtools_stderr, "[markdup] error: unable to update 'do' tag.\n");
                              ret =  -1;
                              break;
                          }
@@ -942,118 +1097,226 @@ static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_has
          }
  
          if (param->opt_dist) {
-            int is_cur_opt = 0, is_ori_opt = 0;
              uint8_t *data;
              char *dup_type;
+            int is_opt = 0;
+            int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
  
-            if ((data = bam_aux_get(ori->b, "dt"))) {
+            if ((data = bam_aux_get(current->b, "dt"))) {
                  if ((dup_type = bam_aux2Z(data))) {
                      if (strcmp(dup_type, "SQ") == 0) {
-                        is_ori_opt = 1;
+                        c->opt = 1;
                      }
                  }
              }
  
-            if ((data = bam_aux_get(current->b, "dt"))) {
-                if ((dup_type = bam_aux2Z(data))) {
-                    if (strcmp(dup_type, "SQ") == 0) {
-                        is_cur_opt = 1;
-                    }
+            // need to run this to get the duplicates x and y scores
+            is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn);
+
+            if (!c->opt && is_opt) {
+                if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+                    ret = -1;
+                    break;
                  }
+
+                c->opt = 1;
              }
  
-            if (!(is_ori_opt && is_cur_opt)) {
-                // if both are already optical duplicates there is no need to check again, otherwise...
+            c->score = calc_score(current->b);
  
-                if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) {
-                    // find out which one is the duplicate
-                    int is_cur_dup = 0;
+            if (current_paired) {
+                if ((c->mate_score = get_mate_score(current->b)) == -1) {
+                     fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+                     ret = -1;
+                     break;
+                }
+            }
+        }
  
-                    if (have_original) {
-                        // compared against an original, this is a dup.
-                        is_cur_dup = 1;
-                    } else if (ori_paired != current_paired) {
-                        if (!current_paired) {
-                            // current is single vs pair, this is a dup.
-                            is_cur_dup = 1;
-                        }
-                    } else {
-                        // do it by scores
-                        int64_t ori_score, curr_score;
+        current = current->duplicate;
+        list->length++;
+    }
  
-                        if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) {
-                            if (ori->b->core.flag & BAM_FQCFAIL) {
-                                ori_score  = 0;
-                                curr_score = 1;
-                            } else {
-                                ori_score  = 1;
-                                curr_score = 0;
-                            }
-                        } else {
-                            ori_score  = calc_score(ori->b);
-                            curr_score = calc_score(current->b);
-
-                            if (current_paired) {
-                                // they are pairs so add mate scores.
-                                int64_t mate_tmp;
-
-                                if ((mate_tmp = get_mate_score(ori->b)) == -1) {
-                                    fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
-                                    ret = -1;
-                                    break;
-                                } else {
-                                    ori_score += mate_tmp;
-                                }
+    return ret;
+}
  
-                                if ((mate_tmp = get_mate_score(current->b)) == -1) {
-                                    fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
-                                    ret = -1;
-                                    break;
-                                } else {
-                                    curr_score += mate_tmp;
-                                }
-                            }
-                        }
  
-                        if (ori_score == curr_score) {
-                            if (strcmp(bam_get_qname(current->b), ori_name) < 0) {
-                                curr_score++;
-                            } else {
-                                curr_score--;
-                            }
-                        }
+/* qsort() comparator that orders check_t entries by ascending x coordinate.
+   The result comes from comparisons rather than a subtraction: the difference
+   of two longs does not always fit in the int that is returned, which could
+   give qsort an inconsistent ordering. */
+static int xcoord_sort(const void *a, const void *b) {
+    const check_t *ac = (const check_t *) a;
+    const check_t *bc = (const check_t *) b;
  
-                        if (ori_score > curr_score) {
-                            is_cur_dup = 1;
-                        }
+    return (ac->x > bc->x) - (ac->x < bc->x);
+}
+
+
+/* Check all the duplicates in list against each other to see if they are optical duplicates.
+   The list is sorted by x coordinate, then every entry is compared with the entries after it
+   whose x coordinate is no more than opt_dist further along.  When such a pair is also within
+   opt_dist on the y coordinate and the leading parts of the read names match, one read of the
+   pair is retagged as an optical duplicate (if it is not one already): a read without a mapped
+   mate loses to a paired read, otherwise a QC failed read loses, otherwise the read with the
+   lower score loses, with the read names deciding ties.
+   Returns 0 on success or -1 if a read could not be retagged.
+
+   NOTE(review): entries whose read name could not be deciphered appear to keep x = y = -1 and
+   may never have xpos set (see check_chain_against_original and optical_duplicate_partial);
+   confirm that two such entries can not be matched against each other here. */
+static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
+             long *warn, long *optical_single, long *optical_pair) {
+    int ret = 0;
+    size_t curr = 0;
+
+    qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort);
+
+    // length is unsigned so "length - 1" would wrap around for an empty list,
+    // comparing curr + 1 instead is safe for any length
+    while (curr + 1 < list->length) {
+        check_t *current = &list->c[curr];
+        size_t count = curr;
+        char *cur_name = bam_get_qname(current->b);
+        int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP);
+
+        while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) {
+            // while close enough along the x coordinate
+            check_t *chk = &list->c[count];
+
+            if (current->opt && chk->opt)
+                continue;
+
+            // if both are already optical duplicates there is no need to check again, otherwise...
+
+            long ydiff;
+
+            if (current->y > chk->y) {
+                ydiff = current->y - chk->y;
+            } else {
+                ydiff = chk->y - current->y;
+            }
+
+            if (ydiff > param->opt_dist)
+                continue;
+
+            // the numbers are right, check the names
+            if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0)
+                continue;
+
+            // optical duplicates
+            int chk_dup = 0;
+            int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP);
+
+            if (current_paired != chk_paired) {
+                if (!chk_paired) {
+                    // chk is single vs pair, this is a dup.
+                    chk_dup = 1;
+                }
+            } else {
+                // do it by scores
+                int64_t cur_score, chk_score;
+
+                if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) {
+                    if (current->b->core.flag & BAM_FQCFAIL) {
+                        cur_score = 0;
+                        chk_score = 1;
+                    } else {
+                        cur_score = 1;
+                        chk_score = 0;
                      }
+                } else {
+                    cur_score = current->score;
+                    chk_score = chk->score;
  
-                    if (is_cur_dup) {
-                        // the current is the optical duplicate
-                        if (!is_cur_opt) { // only change if not already an optical duplicate
-                            if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                    if (current_paired) {
+                        // they are pairs so add mate scores.
+                        chk_score += chk->mate_score;
+                        cur_score += current->mate_score;
+                    }
+                }
+
+                // equal scores, let the read names decide
+                if (cur_score == chk_score) {
+                    if (strcmp(bam_get_qname(chk->b), cur_name) < 0) {
+                        chk_score++;
                      } else {
-                        if (!is_ori_opt) {
-                            if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) {
-                                ret = -1;
-                                break;
-                            }
-                        }
+                        chk_score--;
                      }
                  }
+
+                if (cur_score > chk_score) {
+                    chk_dup = 1;
+                }
+            }
+
+            if (chk_dup) {
+                // the duplicate is the optical duplicate
+                if (!chk->opt) { // only change if not already an optical duplicate
+                    if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    chk->opt = 1;
+                }
+            } else {
+                if (!current->opt) {
+                    if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+                        ret = -1;
+                        goto fail;
+                    }
+
+                    current->opt = 1;
+                }
              }
          }
  
-        current = current->duplicate;
+        curr++;
+    }
+
+ fail:
+    return ret;
+}
+
+
+/* Where there is more than one duplicate go down the list and check for optical duplicates and change
+   do tags (where used) to point to original (non-duplicate) read. */
+static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+                                const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
+                                long *optical_pair, const int check_range) {
+    int ret = 0;
+    kliter_t(read_queue) *rq;
+
+    rq = kl_begin(read_buffer);
+
+    while (rq != kl_end(read_buffer)) {
+        read_queue_t *in_read = &kl_val(rq);
+
+        if (check_range) {
+            /* Just check against the moving window of reads based on coordinates and max read length. */
+            if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
+                break;
+            }
+        } else {
+            // this is the last set of results and the end entry will be blank
+            if (!bam_get_qname(in_read->b)) {
+                break;
+            }
+        }
+
+        if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
+
+            // check against the original for tagging and optical duplication
+            if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
+                if (ret < 0) { // real error
+                    ret = -1;
+                    break;
+                } else { // coordinate decoding error
+                    ret = 0;
+                    in_read->duplicate = NULL;
+                    continue;
+                }
+            }
+
+            // check the rest of the duplicates against each other for optical duplication
+            if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
+                ret = -1;
+                break;
+            }
+
+            in_read->duplicate = NULL;
+        }
+
+        rq = kl_next(rq);
      }
  
      return ret;
  }
  
+
  /*
    Function to use when estimating library size.
  
@@ -1082,30 +1345,29 @@ static inline double coverage_equation(double x, double c, double n) {
  
  
  /* estimate the library size, based on the Picard code in DuplicationMetrics.java*/
-static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) {
+static unsigned long estimate_library_size(unsigned long paired_reads, unsigned long paired_duplicate_reads, unsigned long optical) {
      unsigned long estimated_size = 0;
+    unsigned long non_optical_pairs = (paired_reads - optical) / 2;
+    unsigned long unique_pairs = (paired_reads - paired_duplicate_reads) / 2;
+    unsigned long duplicate_pairs = (paired_duplicate_reads - optical) / 2;
  
-    read_pairs /= 2;
-    duplicate_pairs /= 2;
-
-    if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) {
-        unsigned long unique_pairs = read_pairs - duplicate_pairs;
+    if ((non_optical_pairs && duplicate_pairs && unique_pairs) && (non_optical_pairs > duplicate_pairs)) {
          double m = 1;
          double M = 100;
          int i;
  
-        if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) {
+        if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
              fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n");
              return  estimated_size;
          }
  
-        while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) {
+        while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) > 0) {
              M *= 10;
          }
  
          for (i = 0; i < 40; i++) {
              double r = (m + M) / 2;
-            double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs);
+            double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs);
  
              if (u > 0) {
                  m = r;
@@ -1121,7 +1383,7 @@ static unsigned long estimate_library_size(unsigned long read_pairs, unsigned lo
          fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size."
                          " Read pairs %ld should be greater than duplicate pairs %ld,"
                          " which should both be non zero.\n",
-                        read_pairs, duplicate_pairs);
+                        non_optical_pairs, duplicate_pairs);
      }
  
      return estimated_size;
@@ -1155,6 +1417,7 @@ static int bam_mark_duplicates(md_param_t *param) {
      tmp_file_t temp;
      char *idx_fn = NULL;
      int exclude = 0;
+    check_list_t dup_list = {NULL, 0, 0};
  
      if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
          fprintf(samtools_stderr, "[markdup] out of memory\n");
@@ -1215,10 +1478,24 @@ static int bam_mark_duplicates(md_param_t *param) {
          goto fail;
      }
  
+    if (param->check_chain && !(param->tag || param->opt_dist))
+        param->check_chain = 0;
+
+    if (param->check_chain) {
+        dup_list.size = 128;
+        dup_list.c = NULL;
+
+        if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
+            fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+            goto fail;
+        }
+    }
+
      reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
      np_duplicate = np_opt_duplicate = 0;
  
      while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
+        int dup_checked = 0;
  
          // do some basic coordinate order checks
          if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
@@ -1233,6 +1510,8 @@ static int bam_mark_duplicates(md_param_t *param) {
          prev_tid   =  in_read->b->core.tid;
          in_read->pair_key.single   = 1;
          in_read->single_key.single = 0;
+        in_read->duplicate = NULL;
+        in_read->dup_checked = 0;
  
          reading++;
  
@@ -1259,7 +1538,7 @@ static int bam_mark_duplicates(md_param_t *param) {
          // read must not be secondary, supplementary, unmapped or (possibly) failed QC
          if (!(in_read->b->core.flag & exclude)) {
              examined++;
-            in_read->duplicate = NULL;
+
  
              // look at the pairs first
              if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) {
@@ -1302,17 +1581,15 @@ static int bam_mark_duplicates(md_param_t *param) {
                         // scores more than one read of the pair
                          bam1_t *dup = bp->p->b;
  
-                        in_read->duplicate = bp->p;
+                        if (param->check_chain)
+                            in_read->duplicate = bp->p;
+
                          bp->p = in_read;
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
                              goto fail;
  
                          single_dup++;
-
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-
                      }
                  } else {
                      fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n");
@@ -1329,8 +1606,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                      in_read->pair_key = pair_key;
                  } else if (ret == 0) {
                      int64_t old_score, new_score, tie_add = 0;
-                    bam1_t *dup;
-                    int check_chain = 0;
+                    bam1_t *dup = NULL;
  
                      bp = &kh_val(pair_hash, k);
  
@@ -1371,29 +1647,48 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (new_score + tie_add > old_score) { // swap reads
                          dup = bp->p->b;
-                        in_read->duplicate = bp->p;
+
+                        if (param->check_chain) {
+
+                            if (in_read->duplicate) {
+                                read_queue_t *current = in_read->duplicate;
+
+                                while (current->duplicate) {
+                                    current = current->duplicate;
+                                }
+
+                                current->duplicate = bp->p;
+                            } else {
+                                in_read->duplicate = bp->p;
+                            }
+                        }
+
                          bp->p = in_read;
                      } else {
-                        if (bp->p->duplicate) {
-                            in_read->duplicate = bp->p->duplicate;
-                            check_chain = 1;
+                        if (param->check_chain) {
+                            if (bp->p->duplicate) {
+                                if (in_read->duplicate) {
+                                    read_queue_t *current = bp->p->duplicate;
+
+                                    while (current->duplicate) {
+                                        current = current->duplicate;
+                                    }
+
+                                    current->duplicate = in_read->duplicate;
+                                }
+
+                                in_read->duplicate = bp->p->duplicate;
+                            }
+
+                            bp->p->duplicate = in_read;
                          }
  
-                        bp->p->duplicate = in_read;
                          dup = in_read->b;
                      }
  
                      if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
                          goto fail;
  
-                    if (check_chain) {
-                        if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-                    }
-
-                    if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                        goto fail;
-
                      duplicate++;
                  } else {
                      fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n");
@@ -1403,7 +1698,6 @@ static int bam_mark_duplicates(md_param_t *param) {
                  int ret;
                  key_data_t single_key;
                  in_hash_t *bp;
-                int check_chain = 0;
  
                  make_single_key(&single_key, in_read->b);
  
@@ -1422,29 +1716,20 @@ static int bam_mark_duplicates(md_param_t *param) {
                      if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) {
                          // if matched against one of a pair just mark as duplicate
  
-                        if (bp->p->duplicate) {
-                            in_read->duplicate = bp->p->duplicate;
-                            check_chain = 1;
-                        }
-
-                        bp->p->duplicate = in_read;
-
-                        if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
-                            goto fail;
+                        if (param->check_chain) {
+                            if (bp->p->duplicate) {
+                                in_read->duplicate = bp->p->duplicate;
+                            }
  
-                        if (check_chain) {
-                            // check the new duplicate entry in the chain
-                            if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                                    goto fail;
+                            bp->p->duplicate = in_read;
                          }
  
-                        // check against the new original
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
+                        if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
                              goto fail;
  
                      } else {
                          int64_t old_score, new_score;
-                        bam1_t *dup;
+                        bam1_t *dup = NULL;
  
                          old_score = calc_score(bp->p->b);
                          new_score = calc_score(in_read->b);
@@ -1453,32 +1738,26 @@ static int bam_mark_duplicates(md_param_t *param) {
                          // to the single hash and mark the other as duplicate
                          if (new_score > old_score) { // swap reads
                              dup = bp->p->b;
-                            in_read->duplicate = bp->p;
+
+                            if (param->check_chain)
+                                in_read->duplicate = bp->p;
+
                              bp->p = in_read;
                          } else {
-                            if (bp->p->duplicate) {
-                                in_read->duplicate = bp->p->duplicate;
-                                check_chain = 1;
+                            if (param->check_chain) {
+                                if (bp->p->duplicate) {
+                                    in_read->duplicate = bp->p->duplicate;
+                                }
+
+                                bp->p->duplicate = in_read;
                              }
  
-                            bp->p->duplicate = in_read;
                              dup = in_read->b;
                          }
  
                          if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
                              goto fail;
-
-
-                        if (check_chain) {
-                            if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical))
-                                goto fail;
-                        }
-
-                        if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical))
-                            goto fail;
-
-
-                        }
+                    }
  
                      single_dup++;
                  } else {
@@ -1502,6 +1781,22 @@ static int bam_mark_duplicates(md_param_t *param) {
                  break;
              }
  
+            if (!dup_checked && param->check_chain) {
+                // check for multiple optical duplicates of the same original read
+
+                if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
+                    fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
+                    goto fail;
+                }
+
+                dup_checked = 1;
+            }
+
+
+            if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
+                break;
+            }
+
              if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
                  if (param->supp) {
                      if (tmp_file_write(&temp, in_read->b)) {
@@ -1552,6 +1847,14 @@ static int bam_mark_duplicates(md_param_t *param) {
          goto fail;
      }
  
+    // one last check
+    if (param->tag || param->opt_dist) {
+        if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
+            fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
+            goto fail;
+        }
+    }
+
      // write out the end of the list
      rq = kl_begin(read_buffer);
      while (rq != kl_end(read_buffer)) {
@@ -1608,7 +1911,7 @@ static int bam_mark_duplicates(md_param_t *param) {
                      np_duplicate++;
  
                      if (param->tag && kh_val(dup_hash, k).name) {
-                        if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) {
+                        if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
                              fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
                              goto fail;
                          }
@@ -1616,10 +1919,10 @@ static int bam_mark_duplicates(md_param_t *param) {
  
                      if (param->opt_dist) {
                          if (kh_val(dup_hash, k).type) {
-                            bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ");
+                            bam_aux_update_str(b, "dt", 3, "SQ");
                              np_opt_duplicate++;
                          } else {
-                            bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB");
+                            bam_aux_update_str(b, "dt", 3, "LB");
                          }
                      }
                  }
@@ -1671,7 +1974,7 @@ static int bam_mark_duplicates(md_param_t *param) {
              fp = samtools_stderr;
          }
  
-        els = estimate_library_size(pair, duplicate - optical);
+        els = estimate_library_size(pair, duplicate, optical);
  
          fprintf(fp,
                  "COMMAND: %s\n"
@@ -1705,6 +2008,9 @@ static int bam_mark_duplicates(md_param_t *param) {
          }
      }
  
+    if (param->check_chain && (param->tag || param->opt_dist))
+        free(dup_list.c);
+
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
      kl_destroy(read_queue, read_buffer);
@@ -1725,6 +2031,9 @@ static int bam_mark_duplicates(md_param_t *param) {
      }
      kh_destroy(duplicates, dup_hash);
  
+    if (param->check_chain && (param->tag || param->opt_dist))
+        free(dup_list.c);
+
      kh_destroy(reads, pair_hash);
      kh_destroy(reads, single_hash);
      sam_hdr_destroy(header);
@@ -1747,8 +2056,11 @@ static int markdup_usage(void) {
      fprintf(samtools_stderr, "  -m --mode TYPE   Duplicate decision method for paired reads.\n"
                      "                   TYPE = t measure positions based on template start/end (default).\n"
                      "                          s measure positions based on sequence start.\n");
+    fprintf(samtools_stderr, "  -n               Reduce optical duplicate accuracy (faster results with many duplicates).\n");
+    fprintf(samtools_stderr, "  -u               Output uncompressed data\n");
      fprintf(samtools_stderr, "  --include-fails  Include quality check failed reads.\n");
      fprintf(samtools_stderr, "  --no-PG          Do not add a PG line\n");
+    fprintf(samtools_stderr, "  --no-multi-dup   Reduced duplicates of duplicates checking.\n");
      fprintf(samtools_stderr, "  -t               Mark primary duplicates with the name of the original in a \'do\' tag."
                                    " Mainly for information and debugging.\n");
  
@@ -1763,23 +2075,24 @@ static int markdup_usage(void) {
  
  int bam_markdup(int argc, char **argv) {
      int c, ret;
-    char wmode[3] = {'w', 'b', 0};
+    char wmode[4] = {'w', 'b', 0, 0};
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      htsThreadPool p = {NULL, 0};
      kstring_t tmpprefix = {0, 0, NULL};
      struct stat st;
      unsigned int t;
-    md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL};
+    md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL};
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
          {"include-fails", no_argument, NULL, 1001},
          {"no-PG", no_argument, NULL, 1002},
          {"mode", required_argument, NULL, 'm'},
+        {"no-multi-dup", no_argument, NULL, 1003},
          {NULL, 0, NULL, 0}
      };
  
-    while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) {
          switch (c) {
              case 'r': param.remove_dups = 1; break;
              case 'l': param.max_length = atoi(optarg); break;
@@ -1801,8 +2114,10 @@ int bam_markdup(int argc, char **argv) {
                  }
  
                  break;
+            case 'u': wmode[2] = '0'; break;
              case 1001: param.include_fails = 1; break;
              case 1002: param.no_pg = 1; break;
+            case 1003: param.check_chain = 0; break;
              default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
              /* else fall-through */
              case '?': return markdup_usage();
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c

index 6d40144f716f9b1e748a5034a58aeccd3d7fd1c7..4239fd1571726be06c669a0513637a3bdd099fef 100644 (file)
--- a/samtools/bam_mate.c
+++ b/samtools/bam_mate.c
@@ -372,7 +372,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
          curr = 1 - curr;
          pre_end = cur_end;
      }
-    if (result < -1) goto fail;
+    if (result < -1) goto read_fail;
      if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
          bam1_t *pre = b[1-curr];
          if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
@@ -391,6 +391,10 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
      ks_free(&str);
      return 0;
  
+ read_fail:
+    print_error("fixmate", "Couldn't read from input file");
+    goto fail;
+
   write_fail:
      print_error_errno("fixmate", "Couldn't write to output file");
   fail:
@@ -410,6 +414,7 @@ void usage(FILE* where)
  "  -p           Disable FR proper pair check\n"
  "  -c           Add template cigar ct tag\n"
  "  -m           Add mate score tag\n"
+"  -u           Uncompressed output\n"
  "  --no-PG      do not add a PG line\n");
  
      sam_global_opt_help(where, "-.O..@-.");
@@ -427,7 +432,7 @@ int bam_mating(int argc, char *argv[])
      samFile *in = NULL, *out = NULL;
      int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    char wmode[3] = {'w', 'b', 0};
+    char wmode[4] = {'w', 'b', 0, 0};
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
          {"no-PG", no_argument, NULL, 1},
@@ -437,12 +442,13 @@ int bam_mating(int argc, char *argv[])
  
      // parse args
      if (argc == 1) { usage(stdout); return 0; }
-    while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
          switch (c) {
              case 'r': remove_reads = 1; break;
              case 'p': proper_pair_check = 0; break;
              case 'c': add_ct = 1; break;
              case 'm': mate_score = 1; break;
+            case 'u': wmode[2] = '0'; break;
              case 1: no_pg = 1; break;
              default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                        /* else fall-through */
diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c

index edefb0bbf9e550ddbf2890b200c7ca9653ef3000..0aa83ecc33e59c71bdd1801a3351b145938e4451 100644 (file)
--- a/samtools/bam_mate.c.pysam.c
+++ b/samtools/bam_mate.c.pysam.c
@@ -374,7 +374,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
          curr = 1 - curr;
          pre_end = cur_end;
      }
-    if (result < -1) goto fail;
+    if (result < -1) goto read_fail;
      if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
          bam1_t *pre = b[1-curr];
          if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
@@ -393,6 +393,10 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
      ks_free(&str);
      return 0;
  
+ read_fail:
+    print_error("fixmate", "Couldn't read from input file");
+    goto fail;
+
   write_fail:
      print_error_errno("fixmate", "Couldn't write to output file");
   fail:
@@ -412,6 +416,7 @@ void usage(FILE* where)
  "  -p           Disable FR proper pair check\n"
  "  -c           Add template cigar ct tag\n"
  "  -m           Add mate score tag\n"
+"  -u           Uncompressed output\n"
  "  --no-PG      do not add a PG line\n");
  
      sam_global_opt_help(where, "-.O..@-.");
@@ -429,7 +434,7 @@ int bam_mating(int argc, char *argv[])
      samFile *in = NULL, *out = NULL;
      int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
-    char wmode[3] = {'w', 'b', 0};
+    char wmode[4] = {'w', 'b', 0, 0};
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
          {"no-PG", no_argument, NULL, 1},
@@ -439,12 +444,13 @@ int bam_mating(int argc, char *argv[])
  
      // parse args
      if (argc == 1) { usage(samtools_stdout); return 0; }
-    while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
          switch (c) {
              case 'r': remove_reads = 1; break;
              case 'p': proper_pair_check = 0; break;
              case 'c': add_ct = 1; break;
              case 'm': mate_score = 1; break;
+            case 'u': wmode[2] = '0'; break;
              case 1: no_pg = 1; break;
              default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                        /* else fall-through */
diff --git a/samtools/bam_md.c b/samtools/bam_md.c

index 927778822fac17430b3750b7c89c0df9cd3ab231..7d5aeaaf4651c4cfaa0857000ffac7cde91cb603 100644 (file)
--- a/samtools/bam_md.c
+++ b/samtools/bam_md.c
@@ -1,6 +1,6 @@
  /*  bam_md.c -- calmd subcommand.
  
-    Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009-2011 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -30,6 +30,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <string.h>
  #include <ctype.h>
  #include <limits.h>
+#include <errno.h>
  #include "htslib/faidx.h"
  #include "htslib/sam.h"
  #include "htslib/kstring.h"
@@ -46,102 +47,136 @@ DEALINGS IN THE SOFTWARE.  */
  
  int bam_aux_drop_other(bam1_t *b, uint8_t *s);
  
-void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode)
+static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
+                            hts_pos_t ref_len, int flag, int max_nm,
+                            int quiet_mode, uint32_t *skipped)
  {
      uint8_t *seq = bam_get_seq(b);
      uint32_t *cigar = bam_get_cigar(b);
      bam1_core_t *c = &b->core;
-    int i, y, u = 0;
-    hts_pos_t x;
-    kstring_t *str;
+    int i, qpos, matched = 0;
+    hts_pos_t rpos;
+    kstring_t str = KS_INITIALIZE;
      int32_t old_nm_i = -1, nm = 0;
+    uint32_t err = 0;
  
-    str = (kstring_t*)calloc(1, sizeof(kstring_t));
-    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
-        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+    if (c->l_qseq == 0) {
+        if (!quiet_mode) {
+            if (ref_name) {
+                fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
+                        "record for '%s' at %s:%"PRIhts_pos", skipped\n",
+                        bam_get_qname(b), ref_name, c->pos + 1);
+            } else {
+                fprintf(stderr, "[bam_fillmd1] no sequence in alignment "
+                        "record for '%s', skipped", bam_get_qname(b));
+            }
+        }
+        if (skipped) (*skipped)++;
+        return 0;
+    }
+
+    for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+        int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
          if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            for (j = 0; j < l; ++j) {
-                int c1, c2, z = y + j;
-                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
-                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+            for (j = 0; j < oplen; ++j) {
+                int c1, c2, z = qpos + j;
+                if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+                    break; // out of bounds
+                c1 = bam_seqi(seq, z);
+                c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
                  if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                      if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
-                    ++u;
+                    ++matched;
                  } else {
-                    kputw(u, str); kputc(toupper(ref[x+j]), str);
-                    u = 0; ++nm;
+                    err |= kputw(matched, &str) < 0;
+                    err |= kputc(toupper(ref[rpos+j]), &str) < 0;
+                    matched = 0; ++nm;
                  }
              }
-            if (j < l) break;
-            x += l; y += l;
+            if (j < oplen) break;
+            rpos += oplen; qpos += oplen;
          } else if (op == BAM_CDEL) {
-            kputw(u, str); kputc('^', str);
-            for (j = 0; j < l; ++j) {
-                if (x+j >= ref_len || ref[x+j] == '\0') break;
-                kputc(toupper(ref[x+j]), str);
+            err |= kputw(matched, &str) < 0;
+            err |= kputc('^', &str) < 0;
+            for (j = 0; j < oplen; ++j) {
+                if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
+                err |= kputc(toupper(ref[rpos+j]), &str) < 0;
              }
-            u = 0;
-            x += j; nm += j;
-            if (j < l) break;
+            matched = 0;
+            rpos += j; nm += j;
+            if (j < oplen) break;
          } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
-            y += l;
-            if (op == BAM_CINS) nm += l;
+            qpos += oplen;
+            if (op == BAM_CINS) nm += oplen;
          } else if (op == BAM_CREF_SKIP) {
-            x += l;
+            rpos += oplen;
          }
      }
-    kputw(u, str);
+    err |= kputw(matched, &str) < 0;
+    if (err) {
+        print_error_errno("calmd", "Couldn't build new MD string");
+        goto fail;
+    }
      // apply max_nm
      if (max_nm > 0 && nm >= max_nm) {
-        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
-            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+        for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+            int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
              if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-                for (j = 0; j < l; ++j) {
-                    int c1, c2, z = y + j;
-                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
-                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+                for (j = 0; j < oplen; ++j) {
+                    int c1, c2, z = qpos + j;
+                    if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+                        break; // out of bounds
+                    c1 = bam_seqi(seq, z);
+                    c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
                      if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                          seq[z/2] |= (z&1)? 0x0f : 0xf0;
                          bam_get_qual(b)[z] = 0;
                      }
                  }
-                if (j < l) break;
-                x += l; y += l;
-            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
-            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+                if (j < oplen) break;
+                rpos += oplen; qpos += oplen;
+            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
+            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
          }
      }
      // update NM
      if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
          uint8_t *old_nm = bam_aux_get(b, "NM");
          if (old_nm) old_nm_i = bam_aux2i(old_nm);
-        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        if (!old_nm) {
+            if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+                goto aux_fail;
+        }
          else if (nm != old_nm_i) {
              if (!quiet_mode) {
                  fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
              }
-            bam_aux_del(b, old_nm);
-            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+            if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
+            if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+                goto aux_fail;
          }
      }
      // update MD
      if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
          uint8_t *old_md = bam_aux_get(b, "MD");
-        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
-        else {
+        if (!old_md) {
+            if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+                goto aux_fail;
+        } else {
              int is_diff = 0;
-            if (strlen((char*)old_md+1) == str->l) {
-                for (i = 0; i < str->l; ++i)
-                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
+            if (strlen((char*)old_md+1) == str.l) {
+                for (i = 0; i < str.l; ++i)
+                    if (toupper(old_md[i+1]) != toupper(str.s[i]))
                          break;
-                if (i < str->l) is_diff = 1;
+                if (i < str.l) is_diff = 1;
              } else is_diff = 1;
              if (is_diff) {
                  if (!quiet_mode) {
-                    fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
+                    fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
                  }
-                bam_aux_del(b, old_md);
-                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+                if (bam_aux_del(b, old_md) < 0) goto aux_fail;
+                if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+                    goto aux_fail;
              }
          }
      }
@@ -158,12 +193,25 @@ void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max
              if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
      }
  
-    free(str->s); free(str);
+    free(str.s);
+    return 0;
+
+ aux_fail:
+    if (errno == ENOMEM) {
+        print_error("calmd", "Couldn't add aux tag (too long)");
+    } else if (errno == EINVAL) {
+        print_error("calmd", "Corrupt aux data");
+    } else {
+        print_error_errno("calmd", "Couldn't add aux tag");
+    }
+ fail:
+    free(str.s);
+    return -1;
  }
  
-void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
+int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
  {
-    bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode);
+    return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
  }
  
  int calmd_usage() {
@@ -193,8 +241,10 @@ int bam_fillmd(int argc, char *argv[])
      sam_hdr_t *header = NULL;
      faidx_t *fai = NULL;
      char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+    const char *ref_name = NULL;
      bam1_t *b = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    uint32_t skipped = 0;
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
@@ -294,20 +344,34 @@ int bam_fillmd(int argc, char *argv[])
          if (b->core.tid >= 0) {
              if (tid != b->core.tid) {
                  free(ref);
-                ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len);
+                ref = NULL;
+                len = 0;
+                ref_name = sam_hdr_tid2name(header, b->core.tid);
+                if (ref_name) {
+                    ref = fai_fetch64(fai, ref_name, &len);
+                }
                  tid = b->core.tid;
                  if (ref == 0) { // FIXME: Should this always be fatal?
                      fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
-                            sam_hdr_tid2name(header, tid));
+                            ref_name ? ref_name : "(unknown)");
                      if (is_realn || capQ > 10) goto fail; // Would otherwise crash
                  }
              }
-            if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
+            if (is_realn) {
+                if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
+                    print_error_errno("calmd", "BAQ alignment failed");
+                    goto fail;
+                }
+            }
              if (capQ > 10) {
                  int q = sam_cap_mapq(b, ref, len, capQ);
                  if (b->core.qual > q) b->core.qual = q;
              }
-            if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode);
+            if (ref) {
+                if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
+                                     quiet_mode, &skipped) < 0)
+                    goto fail;
+            }
          }
          if (sam_write1(fpout, header, b) < 0) {
              print_error_errno("calmd", "failed to write to output file");
@@ -318,6 +382,13 @@ int bam_fillmd(int argc, char *argv[])
          fprintf(stderr, "[bam_fillmd] Error reading input.\n");
          goto fail;
      }
+
+    if (skipped) {
+        fprintf(stderr, "[calmd] Warning: %"PRIu32" records skipped due "
+                "to no query sequence\n",
+                skipped);
+    }
+
      bam_destroy1(b);
      sam_hdr_destroy(header);
  
diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c

index 93990b9e1a672a817916bed47011bf4dae6dcdda..b71e77cc87386ffed70256e82944e62910be412d 100644 (file)
--- a/samtools/bam_md.c.pysam.c
+++ b/samtools/bam_md.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_md.c -- calmd subcommand.
  
-    Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2009-2011 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <string.h>
  #include <ctype.h>
  #include <limits.h>
+#include <errno.h>
  #include "htslib/faidx.h"
  #include "htslib/sam.h"
  #include "htslib/kstring.h"
@@ -48,102 +49,136 @@ DEALINGS IN THE SOFTWARE.  */
  
  int bam_aux_drop_other(bam1_t *b, uint8_t *s);
  
-void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode)
+static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
+                            hts_pos_t ref_len, int flag, int max_nm,
+                            int quiet_mode, uint32_t *skipped)
  {
      uint8_t *seq = bam_get_seq(b);
      uint32_t *cigar = bam_get_cigar(b);
      bam1_core_t *c = &b->core;
-    int i, y, u = 0;
-    hts_pos_t x;
-    kstring_t *str;
+    int i, qpos, matched = 0;
+    hts_pos_t rpos;
+    kstring_t str = KS_INITIALIZE;
      int32_t old_nm_i = -1, nm = 0;
+    uint32_t err = 0;
  
-    str = (kstring_t*)calloc(1, sizeof(kstring_t));
-    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
-        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+    if (c->l_qseq == 0) {
+        if (!quiet_mode) {
+            if (ref_name) {
+                fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment "
+                        "record for '%s' at %s:%"PRIhts_pos", skipped\n",
+                        bam_get_qname(b), ref_name, c->pos + 1);
+            } else {
+                fprintf(samtools_stderr, "[bam_fillmd1] no sequence in alignment "
+                        "record for '%s', skipped", bam_get_qname(b));
+            }
+        }
+        if (skipped) (*skipped)++;
+        return 0;
+    }
+
+    for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+        int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
          if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            for (j = 0; j < l; ++j) {
-                int c1, c2, z = y + j;
-                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
-                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+            for (j = 0; j < oplen; ++j) {
+                int c1, c2, z = qpos + j;
+                if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+                    break; // out of bounds
+                c1 = bam_seqi(seq, z);
+                c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
                  if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                      if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
-                    ++u;
+                    ++matched;
                  } else {
-                    kputw(u, str); kputc(toupper(ref[x+j]), str);
-                    u = 0; ++nm;
+                    err |= kputw(matched, &str) < 0;
+                    err |= kputc(toupper(ref[rpos+j]), &str) < 0;
+                    matched = 0; ++nm;
                  }
              }
-            if (j < l) break;
-            x += l; y += l;
+            if (j < oplen) break;
+            rpos += oplen; qpos += oplen;
          } else if (op == BAM_CDEL) {
-            kputw(u, str); kputc('^', str);
-            for (j = 0; j < l; ++j) {
-                if (x+j >= ref_len || ref[x+j] == '\0') break;
-                kputc(toupper(ref[x+j]), str);
+            err |= kputw(matched, &str) < 0;
+            err |= kputc('^', &str) < 0;
+            for (j = 0; j < oplen; ++j) {
+                if (rpos+j >= ref_len || ref[rpos+j] == '\0') break;
+                err |= kputc(toupper(ref[rpos+j]), &str) < 0;
              }
-            u = 0;
-            x += j; nm += j;
-            if (j < l) break;
+            matched = 0;
+            rpos += j; nm += j;
+            if (j < oplen) break;
          } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
-            y += l;
-            if (op == BAM_CINS) nm += l;
+            qpos += oplen;
+            if (op == BAM_CINS) nm += oplen;
          } else if (op == BAM_CREF_SKIP) {
-            x += l;
+            rpos += oplen;
          }
      }
-    kputw(u, str);
+    err |= kputw(matched, &str) < 0;
+    if (err) {
+        print_error_errno("calmd", "Couldn't build new MD string");
+        goto fail;
+    }
      // apply max_nm
      if (max_nm > 0 && nm >= max_nm) {
-        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
-            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+        for (i = qpos = 0, rpos = c->pos; i < c->n_cigar; ++i) {
+            int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
              if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-                for (j = 0; j < l; ++j) {
-                    int c1, c2, z = y + j;
-                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
-                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
+                for (j = 0; j < oplen; ++j) {
+                    int c1, c2, z = qpos + j;
+                    if (rpos+j >= ref_len || z >= c->l_qseq || ref[rpos+j] == '\0')
+                        break; // out of bounds
+                    c1 = bam_seqi(seq, z);
+                    c2 = seq_nt16_table[(uint8_t)ref[rpos+j]];
                      if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                          seq[z/2] |= (z&1)? 0x0f : 0xf0;
                          bam_get_qual(b)[z] = 0;
                      }
                  }
-                if (j < l) break;
-                x += l; y += l;
-            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
-            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+                if (j < oplen) break;
+                rpos += oplen; qpos += oplen;
+            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) rpos += oplen;
+            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) qpos += oplen;
          }
      }
      // update NM
      if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
          uint8_t *old_nm = bam_aux_get(b, "NM");
          if (old_nm) old_nm_i = bam_aux2i(old_nm);
-        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+        if (!old_nm) {
+            if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+                goto aux_fail;
+        }
          else if (nm != old_nm_i) {
              if (!quiet_mode) {
                  fprintf(samtools_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
              }
-            bam_aux_del(b, old_nm);
-            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
+            if (bam_aux_del(b, old_nm) < 0) goto aux_fail;
+            if (bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm) < 0)
+                goto aux_fail;
          }
      }
      // update MD
      if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
          uint8_t *old_md = bam_aux_get(b, "MD");
-        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
-        else {
+        if (!old_md) {
+            if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+                goto aux_fail;
+        } else {
              int is_diff = 0;
-            if (strlen((char*)old_md+1) == str->l) {
-                for (i = 0; i < str->l; ++i)
-                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
+            if (strlen((char*)old_md+1) == str.l) {
+                for (i = 0; i < str.l; ++i)
+                    if (toupper(old_md[i+1]) != toupper(str.s[i]))
                          break;
-                if (i < str->l) is_diff = 1;
+                if (i < str.l) is_diff = 1;
              } else is_diff = 1;
              if (is_diff) {
                  if (!quiet_mode) {
-                    fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
+                    fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str.s);
                  }
-                bam_aux_del(b, old_md);
-                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+                if (bam_aux_del(b, old_md) < 0) goto aux_fail;
+                if (bam_aux_append(b, "MD", 'Z', str.l + 1, (uint8_t*)str.s) < 0)
+                    goto aux_fail;
              }
          }
      }
@@ -160,12 +195,25 @@ void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max
              if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
      }
  
-    free(str->s); free(str);
+    free(str.s);
+    return 0;
+
+ aux_fail:
+    if (errno == ENOMEM) {
+        print_error("calmd", "Couldn't add aux tag (too long)");
+    } else if (errno == EINVAL) {
+        print_error("calmd", "Corrupt aux data");
+    } else {
+        print_error_errno("calmd", "Couldn't add aux tag");
+    }
+ fail:
+    free(str.s);
+    return -1;
  }
  
-void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
+int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
  {
-    bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode);
+    return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
  }
  
  int calmd_usage() {
@@ -195,8 +243,10 @@ int bam_fillmd(int argc, char *argv[])
      sam_hdr_t *header = NULL;
      faidx_t *fai = NULL;
      char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+    const char *ref_name = NULL;
      bam1_t *b = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    uint32_t skipped = 0;
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
@@ -296,20 +346,34 @@ int bam_fillmd(int argc, char *argv[])
          if (b->core.tid >= 0) {
              if (tid != b->core.tid) {
                  free(ref);
-                ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len);
+                ref = NULL;
+                len = 0;
+                ref_name = sam_hdr_tid2name(header, b->core.tid);
+                if (ref_name) {
+                    ref = fai_fetch64(fai, ref_name, &len);
+                }
                  tid = b->core.tid;
                  if (ref == 0) { // FIXME: Should this always be fatal?
                      fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
-                            sam_hdr_tid2name(header, tid));
+                            ref_name ? ref_name : "(unknown)");
                      if (is_realn || capQ > 10) goto fail; // Would otherwise crash
                  }
              }
-            if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
+            if (is_realn) {
+                if (sam_prob_realn(b, ref, len, baq_flag) < -3) {
+                    print_error_errno("calmd", "BAQ alignment failed");
+                    goto fail;
+                }
+            }
              if (capQ > 10) {
                  int q = sam_cap_mapq(b, ref, len, capQ);
                  if (b->core.qual > q) b->core.qual = q;
              }
-            if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode);
+            if (ref) {
+                if (bam_fillmd1_core(ref_name, b, ref, len, flt_flag, max_nm,
+                                     quiet_mode, &skipped) < 0)
+                    goto fail;
+            }
          }
          if (sam_write1(fpout, header, b) < 0) {
              print_error_errno("calmd", "failed to write to output file");
@@ -320,6 +384,13 @@ int bam_fillmd(int argc, char *argv[])
          fprintf(samtools_stderr, "[bam_fillmd] Error reading input.\n");
          goto fail;
      }
+
+    if (skipped) {
+        fprintf(samtools_stderr, "[calmd] Warning: %"PRIu32" records skipped due "
+                "to no query sequence\n",
+                skipped);
+    }
+
      bam_destroy1(b);
      sam_hdr_destroy(header);
  
diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c

index 0497fb6f53196fc9262dcbf6385d14daec61a24a..6fd282c8d56417fdcc231939cde7622cf3722716 100644 (file)
--- a/samtools/bam_plcmd.c
+++ b/samtools/bam_plcmd.c
@@ -1,6 +1,6 @@
  /*  bam_plcmd.c -- mpileup subcommand.
  
-    Copyright (C) 2008-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -90,8 +90,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
      int del_len = -p->indel;
      if (p->indel > 0) {
          int len = bam_plp_insertion(p, ks, &del_len);
-        if (len < 0)
+        if (len < 0) {
+            print_error("mpileup", "bam_plp_insertion() failed");
              return -1;
+        }
          putc('+', fp); printw(len, fp);
          if (bam_is_rev(p->b)) {
              char pad = rev_del ? '#' : '*';
@@ -126,10 +128,11 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
  #define MPLP_REDO_BAQ   (1<<6)
  #define MPLP_ILLUMINA13 (1<<7)
  #define MPLP_IGNORE_RG  (1<<8)
-#define MPLP_PRINT_QPOS (1<<9)
-#define MPLP_PER_SAMPLE (1<<11)
-#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PER_SAMPLE (1<<9)
+#define MPLP_SMART_OVERLAPS (1<<10)
  
+#define MPLP_PRINT_MAPQ_CHAR (1<<11)
+#define MPLP_PRINT_QPOS  (1<<12)
  #define MPLP_PRINT_QNAME (1<<13)
  #define MPLP_PRINT_FLAG  (1<<14)
  #define MPLP_PRINT_RNAME (1<<15)
@@ -294,9 +297,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
      fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
      for (i = 0; i < n; ++i) {
          fputs("\t0\t*\t*", fp);
-        if (conf->flag & MPLP_PRINT_QPOS)
-            fputs("\t*", fp);
-        int flag_value = MPLP_PRINT_QNAME;
+        int flag_value = MPLP_PRINT_MAPQ_CHAR;
          while(flag_value < MPLP_PRINT_QUAL + 1) {
              if (conf->flag & flag_value)
                  fputs("\t*", fp);
@@ -757,9 +758,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                  fprintf(pileup_fp, "\t%d\t", cnt);
                  if (n_plp[i] == 0) {
                      fputs("*\t*", pileup_fp);
-                    if (conf->flag & MPLP_PRINT_QPOS)
-                        fputs("\t*", pileup_fp);
-                    int flag_value = MPLP_PRINT_QNAME;
+                    int flag_value = MPLP_PRINT_MAPQ_CHAR;
                      while(flag_value < MPLP_PRINT_QUAL + 1) {
                          if (conf->flag & flag_value)
                              fputs("\t*", pileup_fp);
@@ -805,25 +804,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                      }
                      if (!n) putc('*', pileup_fp);
  
-                    /* Print mpileup positions */
-                    if (conf->flag & MPLP_PRINT_QPOS) {
-                        n = 0;
-                        putc('\t', pileup_fp);
-                        for (j = 0; j < n_plp[i]; ++j) {
-                            const bam_pileup1_t *p = plp[i] + j;
-                            int c = p->qpos < p->b->core.l_qseq
-                                    ? bam_get_qual(p->b)[p->qpos]
-                                                         : 0;
-                            if ( c < conf->min_baseQ ) continue;
-                            if (n > 0) putc(',', pileup_fp);
-                            n++;
-                            fprintf(pileup_fp, "%d", p->qpos + 1);
-                        }
-                        if (!n) putc('*', pileup_fp);
-                    }
-
                      /* Print selected columns */
-                    int flag_value = MPLP_PRINT_QNAME;
+                    int flag_value = MPLP_PRINT_MAPQ_CHAR;
                      while(flag_value < MPLP_PRINT_QUAL + 1) {
                          if (conf->flag & flag_value) {
                              n = 0;
@@ -834,10 +816,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                                      ? bam_get_qual(p->b)[p->qpos]
                                      : 0;
                                  if ( c < conf->min_baseQ ) continue;
-                                if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp);
+                                if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp);
                                  n++;
  
                                  switch (flag_value) {
+                                case MPLP_PRINT_MAPQ_CHAR:
+                                    c = p->b->core.qual + 33;
+                                    if (c > 126) c = 126;
+                                    putc(c, pileup_fp);
+                                    break;
+                                case MPLP_PRINT_QPOS:
+                                    fprintf(pileup_fp, "%d", p->qpos + 1);
+                                    break;
                                  case MPLP_PRINT_QNAME:
                                      fputs(bam_get_qname(p->b), pileup_fp);
                                      break;
@@ -854,9 +844,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                                      fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1);
                                      break;
                                  case MPLP_PRINT_MAPQ:
-                                    c = p->b->core.qual + 33;
-                                    if (c > 126) c = 126;
-                                    putc(c, pileup_fp);
+                                    fprintf(pileup_fp, "%d", p->b->core.qual);
                                      break;
                                  case MPLP_PRINT_RNEXT:
                                      if (p->b->core.mtid >= 0)
@@ -930,6 +918,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
          }
      }
  
+    if (ret < 0) {
+        print_error("mpileup", "error reading from input file");
+        ret = EXIT_FAILURE;
+        goto fail;
+    }
+
      if (conf->all && !(conf->flag & MPLP_BCF)) {
          // Handle terminating region
          if (last_tid < 0 && conf->reg && conf->all > 1) {
@@ -1110,9 +1104,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
      fprintf(fp,
  "  -r, --region REG        region in which pileup is generated\n"
  "  -R, --ignore-RG         ignore RG tags (one BAM = one sample)\n"
-"  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+"  --rf, --incl-flags STR|INT  required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
      fprintf(fp,
-"  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
+"  --ff, --excl-flags STR|INT  filter flags: skip reads with any of the mask bits set\n"
  "                                            [%s]\n", tmp_filter);
      fprintf(fp,
  "  -x, --ignore-overlaps   disable read-pair overlap detection\n"
@@ -1281,7 +1275,7 @@ int bam_mpileup(int argc, char *argv[])
          case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
          case '6': mplp.flag |= MPLP_ILLUMINA13; break;
          case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
-        case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
+        case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
          case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
          case 'C': mplp.capQ_thres = atoi(optarg); break;
          case 'q': mplp.min_mq = atoi(optarg); break;
diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c

index 7c9986f844e48fa25fb93f91224250f4c62b3f67..bcb8a5c834cad3bbdb3e1a8bd78e95bee64a5d49 100644 (file)
--- a/samtools/bam_plcmd.c.pysam.c
+++ b/samtools/bam_plcmd.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_plcmd.c -- mpileup subcommand.
  
-    Copyright (C) 2008-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -92,8 +92,10 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
      int del_len = -p->indel;
      if (p->indel > 0) {
          int len = bam_plp_insertion(p, ks, &del_len);
-        if (len < 0)
+        if (len < 0) {
+            print_error("mpileup", "bam_plp_insertion() failed");
              return -1;
+        }
          putc('+', fp); printw(len, fp);
          if (bam_is_rev(p->b)) {
              char pad = rev_del ? '#' : '*';
@@ -128,10 +130,11 @@ static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
  #define MPLP_REDO_BAQ   (1<<6)
  #define MPLP_ILLUMINA13 (1<<7)
  #define MPLP_IGNORE_RG  (1<<8)
-#define MPLP_PRINT_QPOS (1<<9)
-#define MPLP_PER_SAMPLE (1<<11)
-#define MPLP_SMART_OVERLAPS (1<<12)
+#define MPLP_PER_SAMPLE (1<<9)
+#define MPLP_SMART_OVERLAPS (1<<10)
  
+#define MPLP_PRINT_MAPQ_CHAR (1<<11)
+#define MPLP_PRINT_QPOS  (1<<12)
  #define MPLP_PRINT_QNAME (1<<13)
  #define MPLP_PRINT_FLAG  (1<<14)
  #define MPLP_PRINT_RNAME (1<<15)
@@ -296,9 +299,7 @@ print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
      fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
      for (i = 0; i < n; ++i) {
          fputs("\t0\t*\t*", fp);
-        if (conf->flag & MPLP_PRINT_QPOS)
-            fputs("\t*", fp);
-        int flag_value = MPLP_PRINT_QNAME;
+        int flag_value = MPLP_PRINT_MAPQ_CHAR;
          while(flag_value < MPLP_PRINT_QUAL + 1) {
              if (conf->flag & flag_value)
                  fputs("\t*", fp);
@@ -389,7 +390,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
              if (id < 0 || id >= m->n) {
                  assert(q); // otherwise a bug
                  fprintf(samtools_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
-                exit(EXIT_FAILURE);
+                samtools_exit(EXIT_FAILURE);
              }
              if (m->n_plp[id] == m->m_plp[id]) {
                  m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
@@ -442,7 +443,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
  
      if (n == 0) {
          fprintf(samtools_stderr,"[%s] no input file/data given\n", __func__);
-        exit(EXIT_FAILURE);
+        samtools_exit(EXIT_FAILURE);
      }
  
      // read the header of each file in the list and initialize data
@@ -453,23 +454,23 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
          if ( !data[i]->fp )
          {
              fprintf(samtools_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
              fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
              fprintf(samtools_stderr, "[%s] failed to process %s: %s\n",
                      __func__, conf->fai_fname, strerror(errno));
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          data[i]->conf = conf;
          data[i]->ref = &mp_ref;
          h_tmp = sam_hdr_read(data[i]->fp);
          if ( !h_tmp ) {
              fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp));
          if (conf->flag & MPLP_BCF) {
@@ -487,11 +488,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
  
              if (idx == NULL) {
                  fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
-                exit(EXIT_FAILURE);
+                samtools_exit(EXIT_FAILURE);
              }
              if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) {
                  fprintf(samtools_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
-                exit(EXIT_FAILURE);
+                samtools_exit(EXIT_FAILURE);
              }
              if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
              hts_idx_destroy(idx);
@@ -529,7 +530,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
          bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
          if (bcf_fp == NULL) {
              fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
  
          // BCF header creation
@@ -613,7 +614,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
          if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) {
              print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"",
                                conf->output_fname? conf->output_fname : "standard output");
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          // End of BCF header creation
  
@@ -652,7 +653,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
  
          if (pileup_fp == NULL) {
              fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
      }
  
@@ -698,7 +699,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
              if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
                  print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"",
                                    conf->output_fname?conf->output_fname:"standard output");
-                exit(EXIT_FAILURE);
+                samtools_exit(EXIT_FAILURE);
              }
              // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
              if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
@@ -712,7 +713,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                      if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) {
                          print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"",
                                            conf->output_fname?conf->output_fname:"standard output");
-                        exit(EXIT_FAILURE);
+                        samtools_exit(EXIT_FAILURE);
                      }
                  }
              }
@@ -759,9 +760,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                  fprintf(pileup_fp, "\t%d\t", cnt);
                  if (n_plp[i] == 0) {
                      fputs("*\t*", pileup_fp);
-                    if (conf->flag & MPLP_PRINT_QPOS)
-                        fputs("\t*", pileup_fp);
-                    int flag_value = MPLP_PRINT_QNAME;
+                    int flag_value = MPLP_PRINT_MAPQ_CHAR;
                      while(flag_value < MPLP_PRINT_QUAL + 1) {
                          if (conf->flag & flag_value)
                              fputs("\t*", pileup_fp);
@@ -807,25 +806,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                      }
                      if (!n) putc('*', pileup_fp);
  
-                    /* Print mpileup positions */
-                    if (conf->flag & MPLP_PRINT_QPOS) {
-                        n = 0;
-                        putc('\t', pileup_fp);
-                        for (j = 0; j < n_plp[i]; ++j) {
-                            const bam_pileup1_t *p = plp[i] + j;
-                            int c = p->qpos < p->b->core.l_qseq
-                                    ? bam_get_qual(p->b)[p->qpos]
-                                                         : 0;
-                            if ( c < conf->min_baseQ ) continue;
-                            if (n > 0) putc(',', pileup_fp);
-                            n++;
-                            fprintf(pileup_fp, "%d", p->qpos + 1);
-                        }
-                        if (!n) putc('*', pileup_fp);
-                    }
-
                      /* Print selected columns */
-                    int flag_value = MPLP_PRINT_QNAME;
+                    int flag_value = MPLP_PRINT_MAPQ_CHAR;
                      while(flag_value < MPLP_PRINT_QUAL + 1) {
                          if (conf->flag & flag_value) {
                              n = 0;
@@ -836,10 +818,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                                      ? bam_get_qual(p->b)[p->qpos]
                                      : 0;
                                  if ( c < conf->min_baseQ ) continue;
-                                if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp);
+                                if (n > 0 && flag_value != MPLP_PRINT_MAPQ_CHAR) putc(',', pileup_fp);
                                  n++;
  
                                  switch (flag_value) {
+                                case MPLP_PRINT_MAPQ_CHAR:
+                                    c = p->b->core.qual + 33;
+                                    if (c > 126) c = 126;
+                                    putc(c, pileup_fp);
+                                    break;
+                                case MPLP_PRINT_QPOS:
+                                    fprintf(pileup_fp, "%d", p->qpos + 1);
+                                    break;
                                  case MPLP_PRINT_QNAME:
                                      fputs(bam_get_qname(p->b), pileup_fp);
                                      break;
@@ -856,9 +846,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
                                      fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1);
                                      break;
                                  case MPLP_PRINT_MAPQ:
-                                    c = p->b->core.qual + 33;
-                                    if (c > 126) c = 126;
-                                    putc(c, pileup_fp);
+                                    fprintf(pileup_fp, "%d", p->b->core.qual);
                                      break;
                                  case MPLP_PRINT_RNEXT:
                                      if (p->b->core.mtid >= 0)
@@ -932,6 +920,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx)
          }
      }
  
+    if (ret < 0) {
+        print_error("mpileup", "error reading from input file");
+        ret = EXIT_FAILURE;
+        goto fail;
+    }
+
      if (conf->all && !(conf->flag & MPLP_BCF)) {
          // Handle terminating region
          if (last_tid < 0 && conf->reg && conf->all > 1) {
@@ -1073,7 +1067,7 @@ int parse_format_flag(const char *str)
          else
          {
              fprintf(samtools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
-            exit(EXIT_FAILURE);
+            samtools_exit(EXIT_FAILURE);
          }
          free(tags[i]);
      }
@@ -1112,9 +1106,9 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
      fprintf(fp,
  "  -r, --region REG        region in which pileup is generated\n"
  "  -R, --ignore-RG         ignore RG tags (one BAM = one sample)\n"
-"  --rf, --incl-flags STR|INT  required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+"  --rf, --incl-flags STR|INT  required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
      fprintf(fp,
-"  --ff, --excl-flags STR|INT  filter flags: skip reads with mask bits set\n"
+"  --ff, --excl-flags STR|INT  filter flags: skip reads with any of the mask bits set\n"
  "                                            [%s]\n", tmp_filter);
      fprintf(fp,
  "  -x, --ignore-overlaps   disable read-pair overlap detection\n"
@@ -1283,7 +1277,7 @@ int bam_mpileup(int argc, char *argv[])
          case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
          case '6': mplp.flag |= MPLP_ILLUMINA13; break;
          case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
-        case 's': mplp.flag |= MPLP_PRINT_MAPQ; break;
+        case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
          case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
          case 'C': mplp.capQ_thres = atoi(optarg); break;
          case 'q': mplp.min_mq = atoi(optarg); break;
diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c

index 81495141823b233e0f17d72963c82d2f38494fee..a48d7f628b66089612ac20fe26df0e01f49a820c 100644 (file)
--- a/samtools/bam_reheader.c.pysam.c
+++ b/samtools/bam_reheader.c.pysam.c
@@ -444,7 +444,7 @@ static void usage(FILE *fp, int ret) {
             "    -i, --in-place      Modify the CRAM file directly, if possible.\n"
             "                        (Defaults to outputting to samtools_stdout.)\n"
             "    -c, --command CMD   Pass the header in SAM format to external program CMD.\n");
-    exit(ret);
+    samtools_exit(ret);
  }
  
  static sam_hdr_t* external_reheader(samFile* in, const char* external) {
@@ -533,7 +533,7 @@ cleanup:
      return h;
  }
  
-int main_reheader(int argc, char *argv[])
+int samtools_main_reheader(int argc, char *argv[])
  {
      int inplace = 0, r, no_pg = 0, c, skip_header = 0;
      sam_hdr_t *h;
diff --git a/samtools/bam_rmdupse.c.pysam.c b/samtools/bam_rmdupse.c.pysam.c

index 2c67faca62a1f4e77c81c3e5b9d18b9a6fa9cc27..65689d7dfb3c83b5218ba899bab0f76156d5b4ae 100644 (file)
--- a/samtools/bam_rmdupse.c.pysam.c
+++ b/samtools/bam_rmdupse.c.pysam.c
@@ -86,8 +86,8 @@ static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, in
      p->discarded = 0;
      p->endpos = endpos; p->score = score;
      if (p->b == 0) p->b = bam_init1();
-    if (!p->b) { perror(NULL); exit(EXIT_FAILURE); }
-    if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); }
+    if (!p->b) { perror(NULL); samtools_exit(EXIT_FAILURE); }
+    if (bam_copy1(p->b, b) == NULL) { perror(NULL); samtools_exit(EXIT_FAILURE); }
      return p;
  }
  
@@ -183,7 +183,7 @@ int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se)
                      } else { // replace
                          p->score = score; p->endpos = endpos;
                          if (bam_copy1(p->b, b) == NULL) {
-                            perror(NULL); exit(EXIT_FAILURE);
+                            perror(NULL); samtools_exit(EXIT_FAILURE);
                          }
                      }
                  } // otherwise, discard the alignment
diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c

index 0bf346cb33050484c237a5c04fdff7f13d2d4644..46a1d80fd93cdb14ced07e8e7d6df1a3718896e7 100644 (file)
--- a/samtools/bam_sort.c
+++ b/samtools/bam_sort.c
@@ -1,6 +1,6 @@
  /*  bam_sort.c -- sorting and merging.
  
-    Copyright (C) 2008-2019 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -33,11 +33,13 @@ DEALINGS IN THE SOFTWARE.  */
  #include <stdio.h>
  #include <string.h>
  #include <time.h>
+#include <sys/types.h>
  #include <sys/stat.h>
  #include <unistd.h>
  #include <getopt.h>
  #include <assert.h>
  #include <pthread.h>
+#include <inttypes.h>
  #include "htslib/ksort.h"
  #include "htslib/hts_os.h"
  #include "htslib/khash.h"
@@ -47,6 +49,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/hts_endian.h"
  #include "sam_opts.h"
  #include "samtools.h"
+#include "bedidx.h"
  
  
  // Struct which contains the a record, and the pointer to the sort tag (if any) or
@@ -97,6 +100,7 @@ KLIST_INIT(hdrln, char*, hdrln_free_char)
  
  static int g_is_by_qname = 0;
  static int g_is_by_tag = 0;
+static int g_is_by_minhash = 0;
  static char g_sort_tag[2] = {0,0};
  
  static int strnum_cmp(const char *_a, const char *_b)
@@ -133,8 +137,11 @@ typedef struct {
  } heap1_t;
  
  static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
  
  // Function to compare reads in the heap and determine which one is < the other
+// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
+// is strictly 0 or 1 only.
  static inline int heap_lt(const heap1_t a, const heap1_t b)
  {
      if (!a.entry.bam_record)
@@ -146,6 +153,9 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
          int t;
          t = bam1_cmp_by_tag(a.entry, b.entry);
          if (t != 0) return t > 0;
+    } else if (g_is_by_minhash) {
+        int t = bam1_cmp_by_minhash(a.entry, b.entry);
+        if (t != 0) return t > 0;
      } else if (g_is_by_qname) {
          int t, fa, fb;
          t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
@@ -513,7 +523,8 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate,
                  id_len = id_end - idp;
  
                  if (id_len < transformed_id.l) {
-                    if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len))
+                    if (ks_resize(&new_hdr_line, new_hdr_line.l
+                                  + transformed_id.l - id_len + 1/*nul*/))
                          goto fail;
                  }
                  if (id_len != transformed_id.l) {
@@ -714,6 +725,7 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate,
      // Get translated header lines and fill in map for @PG records
      pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
                            tbl->pg_trans, NULL);
+    if (!pg_list) goto fail;
  
      // Fix-up PG: tags in the new @RG records and add to output
      if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
@@ -911,10 +923,38 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
  #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
  #define MERGE_FIRST_CO   64 // Use only first file's @CO headers (sort cmd only)
  
+
+static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) {  // Copy the rn-entry region list rl; returns the new list, or NULL on failure
+    if (!rl)  // no source list, so there is nothing to copy
+        return NULL;
+
+    hts_reglist_t *new_rl = calloc(rn, sizeof(hts_reglist_t));  // zero-filled array of rn entries
+    if (!new_rl)
+        return NULL;  // allocation failed
+
+    int i;
+    for (i=0; i < rn; i++) {
+        new_rl[i].tid     = rl[i].tid;  // scalar fields are copied by value
+        new_rl[i].count   = rl[i].count;
+        new_rl[i].min_beg = rl[i].min_beg;
+        new_rl[i].max_end = rl[i].max_end;
+
+        new_rl[i].reg = rl[i].reg;  // pointer copy only: reg stays shared with the source entry
+        new_rl[i].intervals = malloc(new_rl[i].count * sizeof(hts_pair_pos_t));  // each copy gets its own intervals array
+        if (!new_rl[i].intervals) {  // NOTE(review): malloc(0) may return NULL, so count == 0 could be reported as failure -- confirm
+            hts_reglist_free(new_rl, i);  // presumably releases the i entries already filled plus the array itself -- confirm
+            return NULL;
+        }
+        memcpy(new_rl[i].intervals, rl[i].intervals, new_rl[i].count * sizeof(hts_pair_pos_t));  // duplicate the count intervals
+    }
+
+    return new_rl;
+}
+
  /*
   * How merging is handled
   *
- * If a hheader is defined use we will use that as our output header
+ * If a header is defined use we will use that as our output header
   * otherwise we use the first header from the first input file.
   *
   * Now go through each file and create a translation table for that file for:
@@ -957,9 +997,9 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
   */
  int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
                      const char *headers, int n, char * const *fn, char * const *fn_idx,
-                    int flag, const char *reg, int n_threads, const char *cmd,
-                    const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index,
-                    char *arg_list, int no_pg)
+                    const char *fn_bed, int flag, const char *reg, int n_threads,
+                    const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
+                    int write_index, char *arg_list, int no_pg)
  {
      samFile *fpout, **fp = NULL;
      heap1_t *heap = NULL;
@@ -973,6 +1013,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      trans_tbl_t *translation_tbl = NULL;
      int *rtrans = NULL;
      char *out_idx_fn = NULL;
+    void *hreg = NULL;
+    hts_reglist_t *lreg = NULL;
      merged_header_t *merged_hdr = init_merged_header();
      if (!merged_hdr) return -1;
  
@@ -1030,7 +1072,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      }
  
      if (hin) {
-        // Popluate merged_hdr from the pre-prepared header
+        // Populate merged_hdr from the pre-prepared header
          trans_tbl_t dummy;
          int res;
          res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
@@ -1059,10 +1101,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
                             RG[i]))
              return -1; // FIXME: memory leak
  
-        // TODO sam_itr_next() doesn't yet work for SAM files,
-        // so for those keep the headers around for use with sam_read1()
-        if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
-        else { sam_hdr_destroy(hin); hdr[i] = NULL; }
+        hdr[i] = hin;
  
          if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
              fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
@@ -1098,10 +1137,22 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      if (!hout) return -1;  // FIXME: memory leak
  
      // If we're only merging a specified region move our iters to start at that point
-    if (reg) {
-        int tid;
-        hts_pos_t beg, end;
+    int tid, nreg;
+    hts_pos_t beg, end;
  
+    if (fn_bed) {
+        hreg = bed_read(fn_bed);
+        if (!hreg) {
+            fprintf(stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed);
+            goto fail;
+        }
+        bed_unify(hreg);
+        lreg = bed_reglist(hreg, ALL, &nreg);
+        if (!lreg || !nreg) {
+            fprintf(stderr, "[%s] Null or empty region list\n", __func__);
+            goto fail;
+        }
+    } else if (reg) {
          rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl);
          if (!rtrans) goto mem_fail;
  
@@ -1109,55 +1160,69 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
              fprintf(stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg);
              goto fail;
          }
+
+    }
+
+    if (reg || fn_bed) {
+        hts_idx_t *reg_idx = NULL;
          for (i = 0; i < n; ++i) {
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
+
+            // If index filename has not been specified, look in the BAM folder
              if (fn_idx != NULL) {
-                idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
+                reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
              } else {
-                idx = sam_index_load(fp[i], fn[i]);
+                reg_idx = sam_index_load(fp[i], fn[i]);
              }
-            // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
-            int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
-            if (idx == NULL) {
-                fprintf(stderr, "[%s] failed to load index for %s.  Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+            if (reg_idx == NULL) {
+                fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
                          __func__, fn[i]);
+                free(rtrans);
+                rtrans = NULL;
                  goto fail;
              }
-            if (mapped_tid != INT32_MIN) {
-                iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
+
+            int mapped_tid = INT32_MIN;
+            if (fn_bed) {
+                hts_reglist_t *rl = duplicate_reglist(lreg, nreg);
+                iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg);
              } else {
-                iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
+                // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
+                mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
+                if (mapped_tid != INT32_MIN) {
+                    iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end);
+                } else {
+                    iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0);
+                }
              }
-            hts_idx_destroy(idx);
+
              if (iter[i] == NULL) {
-                if (mapped_tid != INT32_MIN) {
-                    fprintf(stderr,
-                            "[%s] failed to get iterator over "
-                            "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
-                            __func__, fn[i], mapped_tid, beg, end);
+                if (fn_bed) {
+                    fprintf(stderr, "[%s] failed to get multi-region iterator "
+                            "{%s, %s}\n", __func__, fn[i], fn_bed);
                  } else {
-                    fprintf(stderr,
-                            "[%s] failed to get iterator over "
-                            "{%s, HTS_IDX_NONE, 0, 0}\n",
-                            __func__, fn[i]);
+                    if (mapped_tid != INT32_MIN) {
+                        fprintf(stderr,
+                                "[%s] failed to get iterator over "
+                                "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
+                                __func__, fn[i], mapped_tid, beg, end);
+                    } else {
+                        fprintf(stderr,
+                                "[%s] failed to get iterator over "
+                                "{%s, HTS_IDX_NONE, 0, 0}\n",
+                                __func__, fn[i]);
+                    }
                  }
+                hts_idx_destroy(reg_idx);
+                free(rtrans);
+                rtrans = NULL;
                  goto fail;
              }
+
+            hts_idx_destroy(reg_idx);
          }
+
          free(rtrans);
          rtrans = NULL;
-    } else {
-        for (i = 0; i < n; ++i) {
-            if (hdr[i] == NULL) {
-                iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
-                if (iter[i] == NULL) {
-                    fprintf(stderr, "[%s] failed to get iterator\n", __func__);
-                    goto fail;
-                }
-            }
-            else iter[i] = NULL;
-        }
      }
  
      // Load the first read from each file into the heap
@@ -1279,6 +1344,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      sam_hdr_destroy(hin);
      sam_hdr_destroy(hout);
      free_merged_header(merged_hdr);
+    hts_reglist_free(lreg, nreg);
+    bed_destroy(hreg);
      free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
      if (sam_close(fpout) < 0) {
          print_error(cmd, "error closing output file");
@@ -1307,6 +1374,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      free(RG);
      free(translation_tbl);
      free(hdr);
+    hts_reglist_free(lreg, nreg);
+    bed_destroy(hreg);
      free(iter);
      free(heap);
      free(fp);
@@ -1322,13 +1391,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
      strcpy(mode, "wb");
      if (flag & MERGE_UNCOMP) strcat(mode, "0");
      else if (flag & MERGE_LEVEL1) strcat(mode, "1");
-    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
  }
  
  static void merge_usage(FILE *to)
  {
      fprintf(to,
-"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"Usage: samtools merge [options] -o <out.bam> [options] <in1.bam> ... <inN.bam>\n"
+"   or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
  "\n"
  "Options:\n"
  "  -n         Input files are sorted by read name\n"
@@ -1336,6 +1406,7 @@ static void merge_usage(FILE *to)
  "  -r         Attach RG tag (inferred from file names)\n"
  "  -u         Uncompressed BAM output\n"
  "  -f         Overwrite the output BAM if exist\n"
+"  -o FILE    Specify output file via option instead of <out.bam> argument\n"
  "  -1         Compress level 1\n"
  "  -l INT     Compression level, from 0 to 9 [-1]\n"
  "  -R STR     Merge file in the specified region STR [all]\n"
@@ -1345,6 +1416,7 @@ static void merge_usage(FILE *to)
  "  -s VALUE   Override random seed\n"
  "  -b FILE    List of input BAM filenames, one per line [null]\n"
  "  -X         Use customized index files\n"
+"  -L FILE    Specify a BED file for multiple region filtering [null]\n"
  "  --no-PG    do not add a PG line\n");
      sam_global_opt_help(to, "-.O..@..");
  }
@@ -1353,10 +1425,10 @@ int bam_merge(int argc, char *argv[])
  {
      int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
      char *fn_headers = NULL, *reg = NULL, mode[12];
-    char *sort_tag = NULL, *arg_list = NULL;
+    char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
      long random_seed = (long)time(NULL);
      char** fn = NULL;
-    char** fn_idx = NULL;
+    char** fn_idx = NULL, *fn_bed = NULL;
      int fn_size = 0, no_pg = 0;
  
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
@@ -1372,12 +1444,13 @@ int bam_merge(int argc, char *argv[])
          return 0;
      }
  
-    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
          switch (c) {
          case 'r': flag |= MERGE_RG; break;
          case 'f': flag |= MERGE_FORCE; break;
          case 'h': fn_headers = optarg; break;
          case 'n': is_by_qname = 1; break;
+        case 'o': fnout = optarg; break;
          case 't': sort_tag = optarg; break;
          case '1': flag |= MERGE_LEVEL1; level = 1; break;
          case 'u': flag |= MERGE_UNCOMP; level = 0; break;
@@ -1387,6 +1460,7 @@ int bam_merge(int argc, char *argv[])
          case 'p': flag |= MERGE_COMBINE_PG; break;
          case 's': random_seed = atol(optarg); break;
          case 'X': has_index_file = 1; break; // -X flag for index filename
+        case 'L': fn_bed = optarg; break;
          case 'b': {
              // load the list of files to read
              if (has_index_file) {
@@ -1415,7 +1489,12 @@ int bam_merge(int argc, char *argv[])
          case '?': merge_usage(stderr); return 1;
          }
      }
-    if ( argc - optind < 1 ) {
+
+    if (fnout == NULL && argc - optind >= 1) {
+        fnout = argv[optind];
+        optind++;
+    }
+    if (fnout == NULL) {
          print_error("merge", "You must at least specify the output file");
          merge_usage(stderr);
          return 1;
@@ -1426,50 +1505,57 @@ int bam_merge(int argc, char *argv[])
          return 1;
      }
  
-    srand48(random_seed);
-    if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
-        FILE *fp = fopen(argv[optind], "rb");
-        if (fp != NULL) {
-            fclose(fp);
-            fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
-            return 1;
+    hts_srand48(random_seed);
+    if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) {
+        struct stat sbuf;
+        if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) {
+            fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout);
+            ret = 1;
+            goto end;
          }
      }
  
      int nargcfiles = 0;
      if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind - 1) % 2 != 0) {
+        if ((argc - optind) % 2 != 0) {
              fprintf(stderr, "Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
+            ret = 1;
+            goto end;
          }
-        nargcfiles = (argc - optind - 1) / 2;
+        nargcfiles = (argc - optind) / 2;
      } else {
-        nargcfiles = argc - optind - 1;
+        nargcfiles = argc - optind;
      }
  
      if (nargcfiles > 0) {
          // Add argc files to end of array
          fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*));
          if (fn == NULL) { ret = 1; goto end; }
-        memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
+        memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*));
  
          if(has_index_file) {
              fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*));
              if (fn_idx == NULL) { ret = 1; goto end; }
-            memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*));
+            memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*));
          }
      }
      if (fn_size+nargcfiles < 1) {
          print_error("merge", "You must specify at least one (and usually two or more) input files");
          merge_usage(stderr);
-        free(fn_idx);
-        return 1;
+        ret = 1;
+        goto end;
+    }
+
+    if (reg && fn_bed) {
+        print_error("merge", "You must specify either a BED file or a region");
+        ret = 1;
+        goto end;
      }
      strcpy(mode, "wb");
-    sam_open_mode(mode+1, argv[optind], NULL);
+    sam_open_mode(mode+1, fnout, NULL);
      if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
-    if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
-                        fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads,
+    if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+                        fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
                          "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
          ret = 1;
  
@@ -1631,6 +1717,12 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
      ks_heapmake(heap, heap_size, heap);
      while (heap->pos != HEAP_EMPTY) {
          bam1_t *b = heap->entry.bam_record;
+        if (g_is_by_minhash && b->core.tid == -1) {
+            // Remove the cached minhash value
+            b->core.pos = -1;
+            b->core.mpos = -1;
+            b->core.isize = 0;
+        }
          if (sam_write1(fpout, hout, b) < 0) {
              print_error_errno(cmd, "failed writing to \"%s\"", out);
              goto fail;
@@ -1789,12 +1881,45 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
      }
  }
  
+// Compare two records by cached minimiser key, for unmapped (tid == -1) reads.
+// The key is rebuilt as (core.pos << 32) | core.mpos; ties on the key are
+// broken by core.isize and then by bam1_cmp_core().  If either record has
+// tid != -1 the comparison is delegated entirely to bam1_cmp_core().
+// Returns <0, 0 or >0; a NULL first record gives 1, a NULL second gives 0.
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
+{
+    const bam1_t *lhs = a.bam_record, *rhs = b.bam_record;
+    uint64_t key_l, key_r;
+
+    if (lhs == NULL)
+        return 1;
+    if (rhs == NULL)
+        return 0;
+
+    // Only reads with no reference on both sides are compared by hash
+    if (lhs->core.tid != -1 || rhs->core.tid != -1)
+        return bam1_cmp_core(a, b);
+
+    key_l = ((uint64_t) lhs->core.pos << 32) | (uint32_t) lhs->core.mpos;
+    key_r = ((uint64_t) rhs->core.pos << 32) | (uint32_t) rhs->core.mpos;
+
+    if (key_l != key_r)                       // primary: hash value
+        return key_l < key_r ? -1 : 1;
+
+    if (lhs->core.isize != rhs->core.isize)   // secondary: core.isize
+        return lhs->core.isize < rhs->core.isize ? -1 : 1;
+
+    return bam1_cmp_core(a, b);               // finally: regular ordering
+}
+
  // Function to compare reads and determine which one is < the other
  // Handle sort-by-pos, sort-by-name, or sort-by-tag
  static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
  {
      if (g_is_by_tag) {
          return bam1_cmp_by_tag(a, b) < 0;
+    } else if (g_is_by_minhash) {
+        return bam1_cmp_by_minhash(a, b) < 0;
      } else {
          return bam1_cmp_core(a,b) < 0;
      }
@@ -1818,7 +1943,7 @@ typedef struct {
  //        -1 for failure
  static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf,
                          const sam_hdr_t *h, int n_threads, const htsFormat *fmt,
-                        char *arg_list, int no_pg, int write_index)
+                        int clear_minhash, char *arg_list, int no_pg, int write_index)
  {
      size_t i;
      samFile* fp;
@@ -1826,22 +1951,27 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu
  
      fp = sam_open_format(fn, mode, fmt);
      if (fp == NULL) return -1;
-    if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools",
-                                 "VN", samtools_version(),
+    if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
                                   arg_list ? "CL": NULL,
                                   arg_list ? arg_list : NULL,
                                   NULL)) {
          goto fail;
      }
-    if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail;
+    if (sam_hdr_write(fp, h) != 0) goto fail;
  
-    if (write_index) {
+    if (write_index)
          if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail;
-    }
  
      if (n_threads > 1) hts_set_threads(fp, n_threads);
      for (i = 0; i < l; ++i) {
-        if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail;
+        bam1_t *b = buf[i].bam_record;
+        if (clear_minhash && b->core.tid == -1) {
+            // Remove the cached minhash value
+            b->core.pos = -1;
+            b->core.mpos = -1;
+            b->core.isize = 0;
+        }
+        if (sam_write1(fp, h, b) < 0) goto fail;
      }
  
      if (write_index) {
@@ -1944,18 +2074,206 @@ err:
      return ret;
  }
  
+/*
+ * Computes the minhash of a sequence using both forward and reverse strands.
+ * This is used as a sort key for unmapped data, to collate like sequences
+ * together and to improve compression ratio.
+ *
+ * Returns the smaller strand minimum (UINT64_MAX when len < kmer).  If not
+ * NULL, *pos gets the location of this hash key in the sequence and *rev is
+ * set to 1 if it came from the reverse strand, else 0.  Needs kmer < 32.
+ */
+static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
+    uint64_t hashr = 0, minhashr = UINT64_MAX;
+    int minhashpf = 0, minhashpr = 0, i;
+    uint64_t mask = ((uint64_t)1<<(2*kmer))-1; // 64-bit 1: long may be 32 bits
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1); // reverse strand codes enter at the top 2 bits
+
+    // Punt homopolymers somewhere central in the hash space
+#define XOR (0xdead7878beef7878 & mask)
+
+    // Initialise hash keys
+    for (i = 0; i < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        hashf = (hashf<<2) | L[base];
+        hashr = (hashr>>2) | R[base];
+    }
+
+    // Loop to find minimum
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+
+        hashf = ((hashf<<2) | L[base]) & mask;
+        hashr =  (hashr>>2) | R[base];
+
+        if (minhashf > (hashf^XOR))
+            minhashf = (hashf^XOR), minhashpf = i;
+        if (minhashr > (hashr^XOR))
+            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+
+    }
+
+    if (minhashf <= minhashr) {
+        if (rev) *rev = 0;
+        if (pos) *pos = minhashpf;
+        return minhashf;
+    } else {
+        if (rev) *rev = 1;
+        if (pos) *pos = minhashpr;
+        return minhashr;
+    }
+}
+
+//--- Start of candidates to punt to htslib
+/*!
+ * @abstract
+ * Decodes the packed bases of a bam record, in stored order, into buf as
+ * seq_nt16_str characters and appends a terminating nul byte.
+ *
+ * @param b     The bam structure
+ * @param buf   Output buffer; needs room for b->core.l_qseq+1 bytes
+ */
+static void bam_to_seq(bam1_t *b, char *buf) {
+    const uint8_t *packed = bam_get_seq(b);
+    int k, n = b->core.l_qseq;
+    for (k = 0; k < n; ++k)
+        buf[k] = seq_nt16_str[bam_seqi(packed, k)];
+    buf[k] = '\0';
+}
+
+/*!
+ * @abstract
+ * Packs b->core.l_qseq characters from buf into the record's sequence
+ * field, mapping each one through seq_nt16_table.
+ *
+ * The record is not resized; l_qseq must already match the new length.
+ *
+ * @param b     The bam structure whose sequence is overwritten
+ * @param buf   Source characters; at least b->core.l_qseq bytes are read
+ */
+static void seq_to_bam(bam1_t *b, char *buf) {
+    uint8_t *packed = bam_get_seq(b);
+    int k, n = b->core.l_qseq;
+    for (k = 0; k < n; ++k)
+        bam_set_seqi(packed, k, seq_nt16_table[(unsigned char) buf[k]]);
+}
+
+/*!
+ * @abstract Reverse complements a BAM record.
+ *
+ * The sequence is decoded to text, reversed and complemented (IUPAC codes
+ * included), then re-encoded; qualities are reversed and flag 0x10 toggled.
+ *
+ * @param b  Pointer to a BAM alignment
+ *
+ * @return   0 on success, -1 on failure (ENOMEM)
+ */
+static int reverse_complement(bam1_t *b) {
+    static const char comp[256] = {
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','=','N','N',//30
+
+       //    *   *   *    *   E   F   *    *   I   J   *    L   *   *   O
+        '@','T','V','G', 'H','E','F','C', 'D','I','J','M', 'L','K','N','O',//40
+       //P   Q   *   *    *   *   *   *    X   *   Z   [    \   ]   ^   _
+        'P','Q','Y','S', 'A','A','B','W', 'X','R','Z','[','\\',']','^','_',//50
+       //`   *   *   *    *   E   F   *    *   I   J   *    L   *   *   O
+        '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60
+       //P   Q   *   *    *   *   *   *    X   *   Z   {    |   }   ~   DEL
+        'p','q','y','s', 'a','a','b','w', 'x','r','z','{', '|','}','~',127,//70
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//E0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0
+    };
+    char seq_[10000], *seq = seq_; // stack buffer unless the read is too long
+    uint8_t *qual = bam_get_qual(b);
+    int i, j;
+
+    if (b->core.l_qseq >= 10000)
+        if (!(seq = malloc(b->core.l_qseq+1))) // +1 for bam_to_seq's nul
+            return -1;
+
+    bam_to_seq(b, seq);
+
+    for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) {
+        unsigned char tmp = seq[i];
+        seq[i] = comp[(unsigned char)seq[j]];
+        seq[j] = comp[tmp];
+        tmp = qual[i];
+        qual[i] = qual[j];
+        qual[j] = tmp;
+    }
+    if (i ==j) // odd length: the middle base is complemented in place
+        seq[i] = comp[(unsigned char)seq[i]];
+
+    seq_to_bam(b, seq);
+
+    if (seq != seq_)
+        free(seq);
+
+    b->core.flag ^= 0x10; // toggle the flag bit alongside the sequence flip
+
+    return 0;
+}
+//--- End of candidates to punt to htslib
+
  static void *worker(void *data)
  {
      worker_t *w = (worker_t*)data;
      char *name;
      w->error = 0;
  
-    if (!g_is_by_qname && !g_is_by_tag) {
+    if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
          if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
              w->error = errno;
              return NULL;
          }
      } else {
+        if (g_is_by_minhash) {
+            int i;
+            for (i = 0; i < w->buf_len; i++) {
+                bam1_t *b = w->buf[i].bam_record;
+                if (b->core.tid != -1)
+                    continue;
+
+                int pos = 0, rev = 0;
+                uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
+                if (rev)
+                    reverse_complement(b);
+
+                // Store 64-bit hash in unmapped pos and mpos fields.
+                // The position of hash is in isize, which we use for
+                // resolving ties when sorting by hash key.
+                // These are unused for completely unmapped data and
+                // will be reset during final output.
+                b->core.pos = mh>>31;
+                b->core.mpos = mh&0x7fffffff;
+                b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+            }
+        }
          ks_mergesort(sort, w->buf_len, w->buf, 0);
      }
  
@@ -1983,10 +2301,10 @@ static void *worker(void *data)
              return 0;
          }
  
-        if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0)
+        if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
              w->error = errno;
      } else {
-        if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0)
+        if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
              w->error = errno;
      }
  
@@ -2043,6 +2361,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
      return n_files + n_threads;
  }
  
+
  /*!
    @abstract Sort an unsorted BAM file based on the chromosome order
    and the leftmost position of an alignment
@@ -2067,7 +2386,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
   */
  int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
                        const char *fnout, const char *modeout,
-                      size_t _max_mem, int n_threads,
+                      size_t _max_mem, int by_minimiser, int n_threads,
                        const htsFormat *in_fmt, const htsFormat *out_fmt,
                        char *arg_list, int no_pg, int write_index)
  {
@@ -2090,6 +2409,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
  
      if (n_threads < 2) n_threads = 1;
      g_is_by_qname = is_by_qname;
+    g_is_by_minhash = by_minimiser;
      if (sort_by_tag) {
          g_is_by_tag = 1;
          g_sort_tag[0] = sort_by_tag[0];
@@ -2116,11 +2436,23 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
      else
          new_so = "coordinate";
  
-    if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
-     && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
-     ) {
-        print_error("sort", "failed to change sort order header to '%s'\n", new_so);
-        goto err;
+    if (by_minimiser) {
+        const char *new_ss = "coordinate:minhash";
+        if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
+            && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+                                       "SO", new_so, "SS", new_ss, NULL))
+            ) {
+            print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n",
+                        new_so, new_ss);
+            goto err;
+        }
+    } else {
+        if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+            && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+            ) {
+            print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+            goto err;
+        }
      }
  
      if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
@@ -2207,7 +2539,8 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
  
      // write the final output
      if (n_files == 0 && num_in_mem < 2) { // a single block
-        if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) {
+        if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
+                         g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
              print_error_errno("sort", "failed to create \"%s\"", fnout);
              goto err;
          }
@@ -2261,7 +2594,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
      char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
      if (!fnout) return -1;
      sprintf(fnout, "%s.bam", prefix);
-    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
      free(fnout);
      return ret;
  }
@@ -2272,13 +2605,16 @@ static void sort_usage(FILE *fp)
  "Usage: samtools sort [options...] [in.bam]\n"
  "Options:\n"
  "  -l INT     Set compression level, from 0 (uncompressed) to 9 (best)\n"
+"  -u         Output uncompressed data (equivalent to -l 0)\n"
  "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
-"  -n         Sort by read name\n"
+"  -M         Use minimiser for clustering unaligned/unplaced reads\n"
+"  -K INT     Kmer size to use for minimiser [20]\n"
+"  -n         Sort by read name (not compatible with samtools index command)\n"
  "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
  "  -o FILE    Write final output to FILE rather than standard output\n"
  "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n"
  "  --no-PG    do not add a PG line\n");
-    sam_global_opt_help(fp, "-.O..@-.");
+    sam_global_opt_help(fp, "-.O..@..");
  }
  
  static void complain_about_memory_setting(size_t max_mem) {
@@ -2302,6 +2638,7 @@ int bam_sort(int argc, char *argv[])
  {
      size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
      int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
+    int by_minimiser = 0, minimiser_kmer = 20;
      char* sort_tag = NULL, *arg_list = NULL;
      char *fnout = "-", modeout[12];
      kstring_t tmpprefix = { 0, 0, NULL };
@@ -2315,7 +2652,7 @@ int bam_sort(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
          switch (c) {
          case 'o': fnout = optarg; o_seen = 1; break;
          case 'n': is_by_qname = 1; break;
@@ -2330,7 +2667,16 @@ int bam_sort(int argc, char *argv[])
              }
          case 'T': kputs(optarg, &tmpprefix); break;
          case 'l': level = atoi(optarg); break;
-        case 1: no_pg = 1; break;
+        case 'u': level = 0; break;
+        case   1: no_pg = 1; break;
+        case 'M': by_minimiser = 1; break;
+        case 'K':
+            minimiser_kmer = atoi(optarg);
+            if (minimiser_kmer < 1)
+                minimiser_kmer = 1;
+            else if (minimiser_kmer > 31)
+                minimiser_kmer = 31;
+            break;
  
          default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                    /* else fall-through */
@@ -2385,7 +2731,8 @@ int bam_sort(int argc, char *argv[])
      }
  
      ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
-                            tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
+                            tmpprefix.s, fnout, modeout, max_mem,
+                            by_minimiser * minimiser_kmer, ga.nthreads,
                              &ga.in, &ga.out, arg_list, no_pg, ga.write_index);
      if (ret >= 0)
          ret = EXIT_SUCCESS;
diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c

index 30939606b9eb7840174d6c38a38364bdca681348..6cbf66a72ab9af36f6414021575bf4194a0c4b21 100644 (file)
--- a/samtools/bam_sort.c.pysam.c
+++ b/samtools/bam_sort.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_sort.c -- sorting and merging.
  
-    Copyright (C) 2008-2019 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
      Portions copyright (C) 2009-2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -35,11 +35,13 @@ DEALINGS IN THE SOFTWARE.  */
  #include <stdio.h>
  #include <string.h>
  #include <time.h>
+#include <sys/types.h>
  #include <sys/stat.h>
  #include <unistd.h>
  #include <getopt.h>
  #include <assert.h>
  #include <pthread.h>
+#include <inttypes.h>
  #include "htslib/ksort.h"
  #include "htslib/hts_os.h"
  #include "htslib/khash.h"
@@ -49,6 +51,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/hts_endian.h"
  #include "sam_opts.h"
  #include "samtools.h"
+#include "bedidx.h"
  
  
  // Struct which contains the a record, and the pointer to the sort tag (if any) or
@@ -99,6 +102,7 @@ KLIST_INIT(hdrln, char*, hdrln_free_char)
  
  static int g_is_by_qname = 0;
  static int g_is_by_tag = 0;
+static int g_is_by_minhash = 0;
  static char g_sort_tag[2] = {0,0};
  
  static int strnum_cmp(const char *_a, const char *_b)
@@ -135,8 +139,11 @@ typedef struct {
  } heap1_t;
  
  static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
  
  // Function to compare reads in the heap and determine which one is < the other
+// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
+// is strictly 0 or 1 only.
  static inline int heap_lt(const heap1_t a, const heap1_t b)
  {
      if (!a.entry.bam_record)
@@ -148,6 +155,9 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
          int t;
          t = bam1_cmp_by_tag(a.entry, b.entry);
          if (t != 0) return t > 0;
+    } else if (g_is_by_minhash) {
+        int t = bam1_cmp_by_minhash(a.entry, b.entry);
+        if (t != 0) return t > 0;
      } else if (g_is_by_qname) {
          int t, fa, fb;
          t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
@@ -515,7 +525,8 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate,
                  id_len = id_end - idp;
  
                  if (id_len < transformed_id.l) {
-                    if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len))
+                    if (ks_resize(&new_hdr_line, new_hdr_line.l
+                                  + transformed_id.l - id_len + 1/*nul*/))
                          goto fail;
                  }
                  if (id_len != transformed_id.l) {
@@ -716,6 +727,7 @@ static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate,
      // Get translated header lines and fill in map for @PG records
      pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids,
                            tbl->pg_trans, NULL);
+    if (!pg_list) goto fail;
  
      // Fix-up PG: tags in the new @RG records and add to output
      if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg))
@@ -913,10 +925,38 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
  #define MERGE_COMBINE_PG 32 // Combine PG tags frather than redefining them
  #define MERGE_FIRST_CO   64 // Use only first file's @CO headers (sort cmd only)
  
+
+// Copies an rn-entry region list with freshly allocated interval arrays; the
+// reg pointers are shared with rl.  NULL if rl is NULL or allocation fails.
+static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) {
+    hts_reglist_t *copy;
+    int idx;
+
+    if (rl == NULL || (copy = calloc(rn, sizeof(*copy))) == NULL)
+        return NULL;
+
+    for (idx = 0; idx < rn; idx++) {
+        const hts_reglist_t *src = &rl[idx];
+        hts_reglist_t *dst = &copy[idx];
+        size_t bytes;
+
+        dst->tid = src->tid;         dst->count = src->count;
+        dst->min_beg = src->min_beg; dst->max_end = src->max_end;
+        dst->reg = src->reg;         // shared, not duplicated
+        bytes = dst->count * sizeof(hts_pair_pos_t);
+
+        dst->intervals = malloc(bytes);
+        if (dst->intervals == NULL) { hts_reglist_free(copy, idx); return NULL; }
+        memcpy(dst->intervals, src->intervals, bytes);
+    }
+
+    return copy;
+}
+
  /*
   * How merging is handled
   *
- * If a hheader is defined use we will use that as our output header
+ * If a header is defined we will use that as our output header
   * otherwise we use the first header from the first input file.
   *
   * Now go through each file and create a translation table for that file for:
@@ -959,9 +999,9 @@ int* rtrans_build(int n, int n_targets, trans_tbl_t* translation_tbl)
   */
  int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
                      const char *headers, int n, char * const *fn, char * const *fn_idx,
-                    int flag, const char *reg, int n_threads, const char *cmd,
-                    const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index,
-                    char *arg_list, int no_pg)
+                    const char *fn_bed, int flag, const char *reg, int n_threads,
+                    const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
+                    int write_index, char *arg_list, int no_pg)
  {
      samFile *fpout, **fp = NULL;
      heap1_t *heap = NULL;
@@ -975,6 +1015,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      trans_tbl_t *translation_tbl = NULL;
      int *rtrans = NULL;
      char *out_idx_fn = NULL;
+    void *hreg = NULL;
+    hts_reglist_t *lreg = NULL;
      merged_header_t *merged_hdr = init_merged_header();
      if (!merged_hdr) return -1;
  
@@ -1032,7 +1074,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      }
  
      if (hin) {
-        // Popluate merged_hdr from the pre-prepared header
+        // Populate merged_hdr from the pre-prepared header
          trans_tbl_t dummy;
          int res;
          res = trans_tbl_init(merged_hdr, hin, &dummy, flag & MERGE_COMBINE_RG,
@@ -1061,10 +1103,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
                             RG[i]))
              return -1; // FIXME: memory leak
  
-        // TODO sam_itr_next() doesn't yet work for SAM files,
-        // so for those keep the headers around for use with sam_read1()
-        if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
-        else { sam_hdr_destroy(hin); hdr[i] = NULL; }
+        hdr[i] = hin;
  
          if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
              fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
@@ -1100,10 +1139,22 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      if (!hout) return -1;  // FIXME: memory leak
  
      // If we're only merging a specified region move our iters to start at that point
-    if (reg) {
-        int tid;
-        hts_pos_t beg, end;
+    int tid, nreg;
+    hts_pos_t beg, end;
  
+    if (fn_bed) {
+        hreg = bed_read(fn_bed);
+        if (!hreg) {
+            fprintf(samtools_stderr, "[%s] Could not read BED file: \"%s\"\n", __func__, fn_bed);
+            goto fail;
+        }
+        bed_unify(hreg);
+        lreg = bed_reglist(hreg, ALL, &nreg);
+        if (!lreg || !nreg) {
+            fprintf(samtools_stderr, "[%s] Null or empty region list\n", __func__);
+            goto fail;
+        }
+    } else if (reg) {
          rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl);
          if (!rtrans) goto mem_fail;
  
@@ -1111,55 +1162,69 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
              fprintf(samtools_stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg);
              goto fail;
          }
+
+    }
+
+    if (reg || fn_bed) {
+        hts_idx_t *reg_idx = NULL;
          for (i = 0; i < n; ++i) {
-            hts_idx_t *idx = NULL;
-            // If index filename has not been specfied, look in BAM folder
+
+            // If index filename has not been specified, look in the BAM folder
              if (fn_idx != NULL) {
-                idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
+                reg_idx = sam_index_load2(fp[i], fn[i], fn_idx[i]);
              } else {
-                idx = sam_index_load(fp[i], fn[i]);
+                reg_idx = sam_index_load(fp[i], fn[i]);
              }
-            // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
-            int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
-            if (idx == NULL) {
-                fprintf(samtools_stderr, "[%s] failed to load index for %s.  Random alignment retrieval only works for indexed BAM or CRAM files.\n",
+            if (reg_idx == NULL) {
+                fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n",
                          __func__, fn[i]);
+                free(rtrans);
+                rtrans = NULL;
                  goto fail;
              }
-            if (mapped_tid != INT32_MIN) {
-                iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
+
+            int mapped_tid = INT32_MIN;
+            if (fn_bed) {
+                hts_reglist_t *rl = duplicate_reglist(lreg, nreg);
+                iter[i] = sam_itr_regions(reg_idx, hdr[i], rl, nreg);
              } else {
-                iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
+                // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
+                mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid];
+                if (mapped_tid != INT32_MIN) {
+                    iter[i] = sam_itr_queryi(reg_idx, mapped_tid, beg, end);
+                } else {
+                    iter[i] = sam_itr_queryi(reg_idx, HTS_IDX_NONE, 0, 0);
+                }
              }
-            hts_idx_destroy(idx);
+
              if (iter[i] == NULL) {
-                if (mapped_tid != INT32_MIN) {
-                    fprintf(samtools_stderr,
-                            "[%s] failed to get iterator over "
-                            "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
-                            __func__, fn[i], mapped_tid, beg, end);
+                if (fn_bed) {
+                    fprintf(samtools_stderr, "[%s] failed to get multi-region iterator "
+                            "{%s, %s}\n", __func__, fn[i], fn_bed);
                  } else {
-                    fprintf(samtools_stderr,
-                            "[%s] failed to get iterator over "
-                            "{%s, HTS_IDX_NONE, 0, 0}\n",
-                            __func__, fn[i]);
+                    if (mapped_tid != INT32_MIN) {
+                        fprintf(samtools_stderr,
+                                "[%s] failed to get iterator over "
+                                "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n",
+                                __func__, fn[i], mapped_tid, beg, end);
+                    } else {
+                        fprintf(samtools_stderr,
+                                "[%s] failed to get iterator over "
+                                "{%s, HTS_IDX_NONE, 0, 0}\n",
+                                __func__, fn[i]);
+                    }
                  }
+                hts_idx_destroy(reg_idx);
+                free(rtrans);
+                rtrans = NULL;
                  goto fail;
              }
+
+            hts_idx_destroy(reg_idx);
          }
+
          free(rtrans);
          rtrans = NULL;
-    } else {
-        for (i = 0; i < n; ++i) {
-            if (hdr[i] == NULL) {
-                iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
-                if (iter[i] == NULL) {
-                    fprintf(samtools_stderr, "[%s] failed to get iterator\n", __func__);
-                    goto fail;
-                }
-            }
-            else iter[i] = NULL;
-        }
      }
  
      // Load the first read from each file into the heap
@@ -1281,6 +1346,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      sam_hdr_destroy(hin);
      sam_hdr_destroy(hout);
      free_merged_header(merged_hdr);
+    hts_reglist_free(lreg, nreg);
+    bed_destroy(hreg);
      free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
      if (sam_close(fpout) < 0) {
          print_error(cmd, "error closing output file");
@@ -1309,6 +1376,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
      free(RG);
      free(translation_tbl);
      free(hdr);
+    hts_reglist_free(lreg, nreg);
+    bed_destroy(hreg);
      free(iter);
      free(heap);
      free(fp);
@@ -1324,13 +1393,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
      strcpy(mode, "wb");
      if (flag & MERGE_UNCOMP) strcat(mode, "0");
      else if (flag & MERGE_LEVEL1) strcat(mode, "1");
-    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+    return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
  }
  
  static void merge_usage(FILE *to)
  {
      fprintf(to,
-"Usage: samtools merge [-nurlf] [-h inh.sam] [-b <bamlist.fofn>] <out.bam> <in1.bam> [<in2.bam> ... <inN.bam>]\n"
+"Usage: samtools merge [options] -o <out.bam> [options] <in1.bam> ... <inN.bam>\n"
+"   or: samtools merge [options] <out.bam> <in1.bam> ... <inN.bam>\n"
  "\n"
  "Options:\n"
  "  -n         Input files are sorted by read name\n"
@@ -1338,6 +1408,7 @@ static void merge_usage(FILE *to)
  "  -r         Attach RG tag (inferred from file names)\n"
  "  -u         Uncompressed BAM output\n"
  "  -f         Overwrite the output BAM if exist\n"
+"  -o FILE    Specify output file via option instead of <out.bam> argument\n"
  "  -1         Compress level 1\n"
  "  -l INT     Compression level, from 0 to 9 [-1]\n"
  "  -R STR     Merge file in the specified region STR [all]\n"
@@ -1347,6 +1418,7 @@ static void merge_usage(FILE *to)
  "  -s VALUE   Override random seed\n"
  "  -b FILE    List of input BAM filenames, one per line [null]\n"
  "  -X         Use customized index files\n"
+"  -L FILE    Specify a BED file for multiple region filtering [null]\n"
  "  --no-PG    do not add a PG line\n");
      sam_global_opt_help(to, "-.O..@..");
  }
@@ -1355,10 +1427,10 @@ int bam_merge(int argc, char *argv[])
  {
      int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
      char *fn_headers = NULL, *reg = NULL, mode[12];
-    char *sort_tag = NULL, *arg_list = NULL;
+    char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
      long random_seed = (long)time(NULL);
      char** fn = NULL;
-    char** fn_idx = NULL;
+    char** fn_idx = NULL, *fn_bed = NULL;
      int fn_size = 0, no_pg = 0;
  
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
@@ -1374,12 +1446,13 @@ int bam_merge(int argc, char *argv[])
          return 0;
      }
  
-    while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) {
          switch (c) {
          case 'r': flag |= MERGE_RG; break;
          case 'f': flag |= MERGE_FORCE; break;
          case 'h': fn_headers = optarg; break;
          case 'n': is_by_qname = 1; break;
+        case 'o': fnout = optarg; break;
          case 't': sort_tag = optarg; break;
          case '1': flag |= MERGE_LEVEL1; level = 1; break;
          case 'u': flag |= MERGE_UNCOMP; level = 0; break;
@@ -1389,6 +1462,7 @@ int bam_merge(int argc, char *argv[])
          case 'p': flag |= MERGE_COMBINE_PG; break;
          case 's': random_seed = atol(optarg); break;
          case 'X': has_index_file = 1; break; // -X flag for index filename
+        case 'L': fn_bed = optarg; break;
          case 'b': {
              // load the list of files to read
              if (has_index_file) {
@@ -1417,7 +1491,12 @@ int bam_merge(int argc, char *argv[])
          case '?': merge_usage(samtools_stderr); return 1;
          }
      }
-    if ( argc - optind < 1 ) {
+
+    if (fnout == NULL && argc - optind >= 1) {
+        fnout = argv[optind];
+        optind++;
+    }
+    if (fnout == NULL) {
          print_error("merge", "You must at least specify the output file");
          merge_usage(samtools_stderr);
          return 1;
@@ -1428,50 +1507,57 @@ int bam_merge(int argc, char *argv[])
          return 1;
      }
  
-    srand48(random_seed);
-    if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
-        FILE *fp = fopen(argv[optind], "rb");
-        if (fp != NULL) {
-            fclose(fp);
-            fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
-            return 1;
+    hts_srand48(random_seed);
+    if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) {
+        struct stat sbuf;
+        if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) {
+            fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout);
+            ret = 1;
+            goto end;
          }
      }
  
      int nargcfiles = 0;
      if (has_index_file) { // Calculate # of input BAM files
-        if ((argc - optind - 1) % 2 != 0) {
+        if ((argc - optind) % 2 != 0) {
              fprintf(samtools_stderr, "Odd number of filenames detected! Each BAM file should have an index file\n");
-            return 1;
+            ret = 1;
+            goto end;
          }
-        nargcfiles = (argc - optind - 1) / 2;
+        nargcfiles = (argc - optind) / 2;
      } else {
-        nargcfiles = argc - optind - 1;
+        nargcfiles = argc - optind;
      }
  
      if (nargcfiles > 0) {
          // Add argc files to end of array
          fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*));
          if (fn == NULL) { ret = 1; goto end; }
-        memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
+        memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*));
  
          if(has_index_file) {
              fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*));
              if (fn_idx == NULL) { ret = 1; goto end; }
-            memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*));
+            memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*));
          }
      }
      if (fn_size+nargcfiles < 1) {
          print_error("merge", "You must specify at least one (and usually two or more) input files");
          merge_usage(samtools_stderr);
-        free(fn_idx);
-        return 1;
+        ret = 1;
+        goto end;
+    }
+
+    if (reg && fn_bed) {
+        print_error("merge", "You must specify either a BED file or a region");
+        ret = 1;
+        goto end;
      }
      strcpy(mode, "wb");
-    sam_open_mode(mode+1, argv[optind], NULL);
+    sam_open_mode(mode+1, fnout, NULL);
      if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
-    if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers,
-                        fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads,
+    if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+                        fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
                          "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
          ret = 1;
  
@@ -1633,6 +1719,12 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
      ks_heapmake(heap, heap_size, heap);
      while (heap->pos != HEAP_EMPTY) {
          bam1_t *b = heap->entry.bam_record;
+        if (g_is_by_minhash && b->core.tid == -1) {
+            // Remove the cached minhash value
+            b->core.pos = -1;
+            b->core.mpos = -1;
+            b->core.isize = 0;
+        }
          if (sam_write1(fpout, hout, b) < 0) {
              print_error_errno(cmd, "failed writing to \"%s\"", out);
              goto fail;
@@ -1791,12 +1883,45 @@ static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
      }
  }
  
+// Sort by minimiser (64-bit hash rebuilt from core.pos and core.mpos).
+// If equal, sort by hash location (core.isize), then by bam1_cmp_core().
+//
+// The 64-bit sort key is split over the bam pos and mpos fields.
+// This permits it to survive writing to temporary file and coming back.
+static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b)
+{
+    const bam1_t *A = a.bam_record;
+    const bam1_t *B = b.bam_record;
+
+    if (!A) return 1;
+    if (!B) return 0;
+
+    if (A->core.tid != -1 || B->core.tid != -1) // either record has a reference id
+        return bam1_cmp_core(a,b);
+
+    const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
+    const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
+
+    if (m_a < m_b) // by hash
+        return -1;
+    else if (m_a > m_b)
+        return 1;
+    else if (A->core.isize < B->core.isize) // by hash location in seq
+        return -1;
+    else if (A->core.isize > B->core.isize)
+        return 1;
+    else // identical hash and location: use the ordinary comparison
+        return bam1_cmp_core(a,b);
+}
+
  // Function to compare reads and determine which one is < the other
  // Handle sort-by-pos, sort-by-name, or sort-by-tag
  static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
  {
      if (g_is_by_tag) {
          return bam1_cmp_by_tag(a, b) < 0;
+    } else if (g_is_by_minhash) {
+        return bam1_cmp_by_minhash(a, b) < 0;
      } else {
          return bam1_cmp_core(a,b) < 0;
      }
@@ -1820,7 +1945,7 @@ typedef struct {
  //        -1 for failure
  static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf,
                          const sam_hdr_t *h, int n_threads, const htsFormat *fmt,
-                        char *arg_list, int no_pg, int write_index)
+                        int clear_minhash, char *arg_list, int no_pg, int write_index)
  {
      size_t i;
      samFile* fp;
@@ -1828,22 +1953,27 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu
  
      fp = sam_open_format(fn, mode, fmt);
      if (fp == NULL) return -1;
-    if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools",
-                                 "VN", samtools_version(),
+    if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
                                   arg_list ? "CL": NULL,
                                   arg_list ? arg_list : NULL,
                                   NULL)) {
          goto fail;
      }
-    if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail;
+    if (sam_hdr_write(fp, h) != 0) goto fail;
  
-    if (write_index) {
+    if (write_index)
          if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail;
-    }
  
      if (n_threads > 1) hts_set_threads(fp, n_threads);
      for (i = 0; i < l; ++i) {
-        if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail;
+        bam1_t *b = buf[i].bam_record;
+        if (clear_minhash && b->core.tid == -1) {
+            // Remove the cached minhash value
+            b->core.pos = -1;
+            b->core.mpos = -1;
+            b->core.isize = 0;
+        }
+        if (sam_write1(fp, h, b) < 0) goto fail;
      }
  
      if (write_index) {
@@ -1946,18 +2076,206 @@ err:
      return ret;
  }
  
+/*
+ * Computes the minhash of a sequence using both forward and reverse strands.
+ *
+ * This is used as a sort key for unmapped data, to collate like sequences
+ * together and to improve compression ratio.
+ *
+ * Returns the smaller of the forward and reverse minima; kmer must be 1..31.
+ * If non-NULL, *pos gets that hash's location and *rev 1 if reverse, else 0.
+ */
+static uint64_t minhash(bam1_t *b, int kmer, int *pos, int *rev) {
+    uint64_t hashf = 0, minhashf = UINT64_MAX;
+    uint64_t hashr = 0, minhashr = UINT64_MAX;
+    int minhashpf = 0, minhashpr = 0, i;
+    uint64_t mask = ((uint64_t)1<<(2*kmer))-1; // 64-bit shift; long may be 32-bit
+    unsigned char *seq = bam_get_seq(b);
+    int len = b->core.l_qseq;
+
+    // Lookup tables for bam_seqi to 0123 fwd/rev hashes
+    // =ACM GRSV TWYH KDBN
+#define X 0
+    unsigned char L[16] = {
+        X,0,1,X,  2,X,X,X,  3,X,X,X,  X,X,X,X,
+    };
+    uint64_t R[16] = {
+        X,3,2,X,  1,X,X,X,  0,X,X,X,  X,X,X,X,
+    };
+    for (i = 0; i < 16; i++)
+        R[i] <<= 2*(kmer-1); // move the 2-bit code to the top of the k-mer
+
+    // Punt homopolymers somewhere central in the hash space
+#define XOR (0xdead7878beef7878 & mask)
+
+    // Initialise hash keys
+    for (i = 0; i < kmer-1 && i < len; i++) {
+        int base = bam_seqi(seq, i);
+        hashf = (hashf<<2) | L[base];
+        hashr = (hashr>>2) | R[base];
+    }
+
+    // Loop to find minimum
+    for (; i < len; i++) {
+        int base = bam_seqi(seq, i);
+
+        hashf = ((hashf<<2) | L[base]) & mask;
+        hashr =  (hashr>>2) | R[base];
+
+        if (minhashf > (hashf^XOR))
+            minhashf = (hashf^XOR), minhashpf = i;
+        if (minhashr > (hashr^XOR))
+            minhashr = (hashr^XOR), minhashpr = len-i+kmer-2;
+
+    }
+
+    if (minhashf <= minhashr) { // ties go to the forward strand
+        if (rev) *rev = 0;
+        if (pos) *pos = minhashpf;
+        return minhashf;
+    } else {
+        if (rev) *rev = 1;
+        if (pos) *pos = minhashpr;
+        return minhashr;
+    }
+}
+
+//--- Start of candidates to punt to htslib
+/*!
+ * @abstract
+ * Extracts the sequence (in current alignment orientation) from
+ * a bam record and places it in buf, which is nul terminated.
+ *
+ * @param b     The bam structure
+ * @param buf   A buffer at least b->core.l_qseq+1 bytes long
+ */
+static void bam_to_seq(bam1_t *b, char *buf) {
+    int i;
+    uint8_t *seq = bam_get_seq(b);
+    for (i = 0; i < b->core.l_qseq; i++)
+        buf[i] = seq_nt16_str[bam_seqi(seq, i)]; // packed base code -> character
+    buf[i] = 0; // nul terminator, hence the +1 in the buffer size
+}
+
+/*!
+ * @abstract
+ * Writes a new sequence, of length b->core.l_qseq, to a BAM record.
+ *
+ * If a sequence of a new length is required the caller must first make
+ * room for it by updating the bam1_t struct.
+ *
+ * @param b     The bam structure
+ * @param buf   A buffer at least b->core.l_qseq bytes long
+ */
+static void seq_to_bam(bam1_t *b, char *buf) {
+    int i;
+    uint8_t *seq = bam_get_seq(b);
+    for (i = 0; i < b->core.l_qseq; i++) // reads exactly l_qseq chars; no nul needed
+        bam_set_seqi(seq, i, seq_nt16_table[(unsigned char)buf[i]]);
+}
+
+/*!
+ * @abstract Reverse complements a BAM record.
+ * Reverses qualities too and toggles the 0x10 (reverse strand) flag bit.
+ * It's possible to do this inline, but complex due to the 4-bit sequence
+ * encoding.  For now I take the dumb approach.
+ *
+ * @param b  Pointer to a BAM alignment
+ *
+ * @return   0 on success, -1 on failure (ENOMEM)
+ */
+static int reverse_complement(bam1_t *b) {
+    static char comp[256] = {
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//00
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//10
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//20
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//30
+
+       //    *   *   *    *   E   F   *    *   I   J   *    L   *   *   O
+        '@','T','V','G', 'H','E','F','C', 'D','I','J','M', 'L','K','N','O',//40
+       //P   Q   *   *    *   *   *   *    X   *   Z   [    \   ]   ^   _
+        'P','Q','Y','S', 'A','A','B','W', 'X','R','Z','[','\\',']','^','_',//50
+       //`   *   *   *    *   E   F   *    *   I   J   *    L   *   *   O
+        '`','t','v','g', 'h','e','f','c', 'd','i','j','m', 'l','k','n','o',//60
+       //P   Q   *   *    *   *   *   *    X   *   Z   {    |   }   ~   DEL
+        'p','q','y','s', 'a','a','b','w', 'x','r','z','{', '|','}','~',127,//70
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//80
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//90
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//A0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//B0
+
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//C0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//D0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//E0
+        'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',//F0
+    };
+    char seq_[10000], *seq = seq_;
+    uint8_t *qual = bam_get_qual(b);
+    int i, j;
+
+    if (b->core.l_qseq >= 10000) // text plus nul will not fit in seq_
+        if (!(seq = malloc(b->core.l_qseq+1)))
+            return -1;
+
+    bam_to_seq(b, seq);
+
+    for (i = 0, j = b->core.l_qseq-1; i < j; i++, j--) { // swap from both ends
+        unsigned char tmp = seq[i];
+        seq[i] = comp[(unsigned char)seq[j]];
+        seq[j] = comp[tmp];
+        tmp = qual[i];
+        qual[i] = qual[j];
+        qual[j] = tmp;
+    }
+    if (i ==j) // odd length: the middle base is complemented in place
+        seq[i] = comp[(unsigned char)seq[i]];
+
+    seq_to_bam(b, seq);
+
+    if (seq != seq_)
+        free(seq);
+
+    b->core.flag ^= 0x10; // toggle the reverse-strand flag bit
+
+    return 0;
+}
+//--- End of candidates to punt to htslib
+
  static void *worker(void *data)
  {
      worker_t *w = (worker_t*)data;
      char *name;
      w->error = 0;
  
-    if (!g_is_by_qname && !g_is_by_tag) {
+    if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
          if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
              w->error = errno;
              return NULL;
          }
      } else {
+        if (g_is_by_minhash) {
+            int i;
+            for (i = 0; i < w->buf_len; i++) {
+                bam1_t *b = w->buf[i].bam_record;
+                if (b->core.tid != -1)
+                    continue;
+
+                int pos = 0, rev = 0;
+                uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
+                if (rev)
+                    reverse_complement(b);
+
+                // Store 64-bit hash in unmapped pos and mpos fields.
+                // The position of hash is in isize, which we use for
+                // resolving ties when sorting by hash key.
+                // These are unused for completely unmapped data and
+                // will be reset during final output.
+                b->core.pos = mh>>31;
+                b->core.mpos = mh&0x7fffffff;
+                b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+            }
+        }
          ks_mergesort(sort, w->buf_len, w->buf, 0);
      }
  
@@ -1985,10 +2303,10 @@ static void *worker(void *data)
              return 0;
          }
  
-        if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0)
+        if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
              w->error = errno;
      } else {
-        if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0)
+        if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
              w->error = errno;
      }
  
@@ -2045,6 +2363,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
      return n_files + n_threads;
  }
  
+
  /*!
    @abstract Sort an unsorted BAM file based on the chromosome order
    and the leftmost position of an alignment
@@ -2069,7 +2388,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
   */
  int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
                        const char *fnout, const char *modeout,
-                      size_t _max_mem, int n_threads,
+                      size_t _max_mem, int by_minimiser, int n_threads,
                        const htsFormat *in_fmt, const htsFormat *out_fmt,
                        char *arg_list, int no_pg, int write_index)
  {
@@ -2092,6 +2411,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
  
      if (n_threads < 2) n_threads = 1;
      g_is_by_qname = is_by_qname;
+    g_is_by_minhash = by_minimiser;
      if (sort_by_tag) {
          g_is_by_tag = 1;
          g_sort_tag[0] = sort_by_tag[0];
@@ -2118,11 +2438,23 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
      else
          new_so = "coordinate";
  
-    if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
-     && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
-     ) {
-        print_error("sort", "failed to change sort order header to '%s'\n", new_so);
-        goto err;
+    if (by_minimiser) {
+        const char *new_ss = "coordinate:minhash";
+        if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
+            && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+                                       "SO", new_so, "SS", new_ss, NULL))
+            ) {
+            print_error("sort", "failed to change sort order header to 'SO:%s SS:%s'\n",
+                        new_so, new_ss);
+            goto err;
+        }
+    } else {
+        if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+            && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+            ) {
+            print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+            goto err;
+        }
      }
  
      if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
@@ -2209,7 +2541,8 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
  
      // write the final output
      if (n_files == 0 && num_in_mem < 2) { // a single block
-        if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) {
+        if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
+                         g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
              print_error_errno("sort", "failed to create \"%s\"", fnout);
              goto err;
          }
@@ -2263,7 +2596,7 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma
      char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
      if (!fnout) return -1;
      sprintf(fnout, "%s.bam", prefix);
-    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
+    ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
      free(fnout);
      return ret;
  }
@@ -2274,13 +2607,16 @@ static void sort_usage(FILE *fp)
  "Usage: samtools sort [options...] [in.bam]\n"
  "Options:\n"
  "  -l INT     Set compression level, from 0 (uncompressed) to 9 (best)\n"
+"  -u         Output uncompressed data (equivalent to -l 0)\n"
  "  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
-"  -n         Sort by read name\n"
+"  -M         Use minimiser for clustering unaligned/unplaced reads\n"
+"  -K INT     Kmer size to use for minimiser [20]\n"
+"  -n         Sort by read name (not compatible with samtools index command)\n"
  "  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
  "  -o FILE    Write final output to FILE rather than standard output\n"
  "  -T PREFIX  Write temporary files to PREFIX.nnnn.bam\n"
  "  --no-PG    do not add a PG line\n");
-    sam_global_opt_help(fp, "-.O..@-.");
+    sam_global_opt_help(fp, "-.O..@..");
  }
  
  static void complain_about_memory_setting(size_t max_mem) {
@@ -2304,6 +2640,7 @@ int bam_sort(int argc, char *argv[])
  {
      size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
      int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
+    int by_minimiser = 0, minimiser_kmer = 20;
      char* sort_tag = NULL, *arg_list = NULL;
      char *fnout = "-", modeout[12];
      kstring_t tmpprefix = { 0, 0, NULL };
@@ -2317,7 +2654,7 @@ int bam_sort(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
          switch (c) {
          case 'o': fnout = optarg; o_seen = 1; break;
          case 'n': is_by_qname = 1; break;
@@ -2332,7 +2669,16 @@ int bam_sort(int argc, char *argv[])
              }
          case 'T': kputs(optarg, &tmpprefix); break;
          case 'l': level = atoi(optarg); break;
-        case 1: no_pg = 1; break;
+        case 'u': level = 0; break;
+        case   1: no_pg = 1; break;
+        case 'M': by_minimiser = 1; break;
+        case 'K':
+            minimiser_kmer = atoi(optarg);
+            if (minimiser_kmer < 1)
+                minimiser_kmer = 1;
+            else if (minimiser_kmer > 31)
+                minimiser_kmer = 31;
+            break;
  
          default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                    /* else fall-through */
@@ -2387,7 +2733,8 @@ int bam_sort(int argc, char *argv[])
      }
  
      ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
-                            tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
+                            tmpprefix.s, fnout, modeout, max_mem,
+                            by_minimiser * minimiser_kmer, ga.nthreads,
                              &ga.in, &ga.out, arg_list, no_pg, ga.write_index);
      if (ret >= 0)
          ret = EXIT_SUCCESS;
diff --git a/samtools/bam_stat.c b/samtools/bam_stat.c

index 5fb9ba022a498f0842bca8bf51d8a14c2ac201c7..31dc8fe4e453b49b11acaee942bffdcb6dca80f4 100644 (file)
--- a/samtools/bam_stat.c
+++ b/samtools/bam_stat.c
@@ -1,6 +1,6 @@
  /*  bam_stat.c -- flagstat subcommand.
  
-    Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -42,32 +42,41 @@ typedef struct {
      long long n_dup[2];
      long long n_diffchr[2], n_diffhigh[2];
      long long n_secondary[2], n_supp[2];
+    long long n_primary[2], n_pmapped[2], n_pdup[2];
  } bam_flagstat_t;
  
-#define flagstat_loop(s, c) do {                                        \
-        int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0;                       \
-        ++(s)->n_reads[w];                                              \
-        if ((c)->flag & BAM_FSECONDARY ) {                              \
-            ++(s)->n_secondary[w];                                      \
-        } else if ((c)->flag & BAM_FSUPPLEMENTARY ) {                   \
-            ++(s)->n_supp[w];                                           \
-        } else if ((c)->flag & BAM_FPAIRED) {                           \
-            ++(s)->n_pair_all[w];                                       \
-            if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w];    \
-            if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w];              \
-            if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w];              \
-            if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w];  \
-            if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
-                ++(s)->n_pair_map[w];                                   \
-                if ((c)->mtid != (c)->tid) {                            \
-                    ++(s)->n_diffchr[w];                                \
-                    if ((c)->qual >= 5) ++(s)->n_diffhigh[w];           \
-                }                                                       \
-            }                                                           \
-        }                                                               \
-        if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w];              \
-        if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w];                      \
-    } while (0)
+inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+{   // Add one record's flag bits (c) to the running counters in s.
+    int w = (c->flag & BAM_FQCFAIL)? 1 : 0; // counter slot: 0 = QC-passed, 1 = QC-failed
+    ++s->n_reads[w];
+    if (c->flag & BAM_FSECONDARY ) {
+        ++s->n_secondary[w];
+    } else if (c->flag & BAM_FSUPPLEMENTARY ) {
+        ++s->n_supp[w];
+    } else { // neither secondary nor supplementary: counted as primary
+        ++s->n_primary[w];
+
+        if (c->flag & BAM_FPAIRED) { // pair statistics use primary records only
+            ++s->n_pair_all[w];
+            if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w];
+            if (c->flag & BAM_FREAD1) ++s->n_read1[w];
+            if (c->flag & BAM_FREAD2) ++s->n_read2[w];
+            if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w];
+            if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) {
+                ++s->n_pair_map[w];
+                if (c->mtid != c->tid) { // mate is on a different reference id
+                    ++s->n_diffchr[w];
+                    if (c->qual >= 5) ++s->n_diffhigh[w];
+                }
+            }
+        }
+
+        if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w];
+        if (c->flag & BAM_FDUP) ++s->n_pdup[w];
+    }
+    if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; // every record, not only primary
+    if (c->flag & BAM_FDUP) ++s->n_dup[w]; // every record, not only primary
+}
  
  bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
  {
@@ -81,8 +90,10 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
      while ((ret = sam_read1(fp, h, b)) >= 0)
          flagstat_loop(s, c);
      bam_destroy1(b);
-    if (ret != -1)
-        fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+    if (ret != -1) {
+        free(s);
+        return NULL;
+    }
      return s;
  }
  
@@ -114,10 +125,13 @@ static void out_fmt_default(bam_flagstat_t *s)
  {
      char b0[16], b1[16];
      printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+    printf("%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]);
      printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
      printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
      printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+    printf("%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
      printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+    printf("%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
      printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
      printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
      printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
@@ -129,14 +143,18 @@ static void out_fmt_default(bam_flagstat_t *s)
  }
  
  static void out_fmt_json(bam_flagstat_t *s) {
-    char b0[16], b1[16];
+    char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16];
      printf("{\n \"QC-passed reads\": { \n"
                   "  \"total\": %lld, \n"
+                 "  \"primary\": %lld, \n"
                   "  \"secondary\": %lld, \n"
                   "  \"supplementary\": %lld, \n"
                   "  \"duplicates\": %lld, \n"
+                 "  \"primary duplicates\": %lld, \n"
                   "  \"mapped\": %lld, \n"
                   "  \"mapped %%\": %s, \n"
+                 "  \"primary mapped\": %lld, \n"
+                 "  \"primary mapped %%\": %s, \n"
                   "  \"paired in sequencing\": %lld, \n"
                   "  \"read1\": %lld, \n"
                   "  \"read2\": %lld, \n"
@@ -150,11 +168,15 @@ static void out_fmt_json(bam_flagstat_t *s) {
                   " },"
              "\n \"QC-failed reads\": { \n"
                   "  \"total\": %lld, \n"
+                 "  \"primary\": %lld, \n"
                   "  \"secondary\": %lld, \n"
                   "  \"supplementary\": %lld, \n"
                   "  \"duplicates\": %lld, \n"
+                 "  \"primary duplicates\": %lld, \n"
                   "  \"mapped\": %lld, \n"
                   "  \"mapped %%\": %s, \n"
+                 "  \"primary mapped\": %lld, \n"
+                 "  \"primary mapped %%\": %s, \n"
                   "  \"paired in sequencing\": %lld, \n"
                   "  \"read1\": %lld, \n"
                   "  \"read2\": %lld, \n"
@@ -168,35 +190,43 @@ static void out_fmt_json(bam_flagstat_t *s) {
                   " }\n"
              "}\n",
          s->n_reads[0],
+        s->n_primary[0],
          s->n_secondary[0],
          s->n_supp[0],
          s->n_dup[0],
+        s->n_pdup[0],
          s->n_mapped[0],
          percent_json(b0, s->n_mapped[0], s->n_reads[0]),
+        s->n_pmapped[0],
+        percent_json(p0, s->n_pmapped[0], s->n_primary[0]),
          s->n_pair_all[0],
          s->n_read1[0],
          s->n_read2[0],
          s->n_pair_good[0],
-        percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]),
+        percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]),
          s->n_pair_map[0],
          s->n_sgltn[0],
-        percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]),
+        percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]),
          s->n_diffchr[0],
          s->n_diffhigh[0],
          s->n_reads[1],
+        s->n_primary[1],
          s->n_secondary[1],
          s->n_supp[1],
          s->n_dup[1],
+        s->n_pdup[1],
          s->n_mapped[1],
          percent_json(b1, s->n_mapped[1], s->n_reads[1]),
+        s->n_pmapped[1],
+        percent_json(p1, s->n_pmapped[1], s->n_primary[1]),
          s->n_pair_all[1],
          s->n_read1[1],
          s->n_read2[1],
          s->n_pair_good[1],
-        percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]),
+        percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]),
          s->n_pair_map[1],
          s->n_sgltn[1],
-        percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]),
+        percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]),
          s->n_diffchr[1],
          s->n_diffhigh[1]
      );
@@ -205,11 +235,15 @@ static void out_fmt_json(bam_flagstat_t *s) {
  static void out_fmt_tsv(bam_flagstat_t *s) {
      char b0[16], b1[16];
      printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+    printf("%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]);
      printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]);
      printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]);
      printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]);
+    printf("%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
      printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]);
      printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+    printf("%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]);
+    printf("%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
      printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
      printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]);
      printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]);
@@ -242,7 +276,7 @@ int bam_flagstat(int argc, char *argv[])
      sam_hdr_t *header;
      bam_flagstat_t *s;
      const char *out_fmt = "default";
-    int c;
+    int c, status = EXIT_SUCCESS;
  
      enum {
          INPUT_FMT_OPTION = CHAR_MAX+1,
@@ -296,10 +330,17 @@ int bam_flagstat(int argc, char *argv[])
      }
  
      s = bam_flagstat_core(fp, header);
-    output_fmt(s, out_fmt);
-    free(s);
+    if (s) {
+        output_fmt(s, out_fmt);
+        free(s);
+    }
+    else {
+        print_error("flagstat", "error reading from \"%s\"", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      sam_hdr_destroy(header);
      sam_close(fp);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c

index 84a9ea43acaaf21c566abbddeb4945a412b460e7..bd6f4ca640f1fecd9f01450c67f1901fc890f11a 100644 (file)
--- a/samtools/bam_stat.c.pysam.c
+++ b/samtools/bam_stat.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bam_stat.c -- flagstat subcommand.
  
-    Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd.
+    Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -44,32 +44,41 @@ typedef struct {
      long long n_dup[2];
      long long n_diffchr[2], n_diffhigh[2];
      long long n_secondary[2], n_supp[2];
+    long long n_primary[2], n_pmapped[2], n_pdup[2];
  } bam_flagstat_t;
  
-#define flagstat_loop(s, c) do {                                        \
-        int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0;                       \
-        ++(s)->n_reads[w];                                              \
-        if ((c)->flag & BAM_FSECONDARY ) {                              \
-            ++(s)->n_secondary[w];                                      \
-        } else if ((c)->flag & BAM_FSUPPLEMENTARY ) {                   \
-            ++(s)->n_supp[w];                                           \
-        } else if ((c)->flag & BAM_FPAIRED) {                           \
-            ++(s)->n_pair_all[w];                                       \
-            if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w];    \
-            if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w];              \
-            if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w];              \
-            if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w];  \
-            if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \
-                ++(s)->n_pair_map[w];                                   \
-                if ((c)->mtid != (c)->tid) {                            \
-                    ++(s)->n_diffchr[w];                                \
-                    if ((c)->qual >= 5) ++(s)->n_diffhigh[w];           \
-                }                                                       \
-            }                                                           \
-        }                                                               \
-        if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w];              \
-        if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w];                      \
-    } while (0)
+/*
+ * Tally one alignment record (c) into the flagstat counters (s).
+ *
+ * Every counter is a two-element array: index 0 is used for records without
+ * BAM_FQCFAIL set, index 1 for records with it set.
+ *
+ * A record is classified as exactly one of secondary, supplementary or
+ * primary (secondary wins if both bits are set).  The pairing counters and
+ * the n_pmapped / n_pdup counters are updated for primary records only,
+ * while n_reads, n_mapped and n_dup are updated for every record.
+ */
+inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+{
+    // w selects the QC-passed (0) or QC-failed (1) slot of each counter
+    int w = (c->flag & BAM_FQCFAIL)? 1 : 0;
+    ++s->n_reads[w];
+    if (c->flag & BAM_FSECONDARY ) {
+        ++s->n_secondary[w];
+    } else if (c->flag & BAM_FSUPPLEMENTARY ) {
+        ++s->n_supp[w];
+    } else {
+        // Neither secondary nor supplementary: a primary record
+        ++s->n_primary[w];
+
+        if (c->flag & BAM_FPAIRED) {
+            ++s->n_pair_all[w];
+            // Proper-pair flag only counts when the record itself is mapped
+            if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w];
+            if (c->flag & BAM_FREAD1) ++s->n_read1[w];
+            if (c->flag & BAM_FREAD2) ++s->n_read2[w];
+            // Singleton: this record is mapped but its mate is flagged unmapped
+            if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w];
+            // Both this record and its mate are mapped
+            if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) {
+                ++s->n_pair_map[w];
+                // Mate lies on a different reference (mtid != tid);
+                // n_diffhigh additionally requires c->qual of at least 5
+                if (c->mtid != c->tid) {
+                    ++s->n_diffchr[w];
+                    if (c->qual >= 5) ++s->n_diffhigh[w];
+                }
+            }
+        }
+
+        // Primary-only versions of the mapped / duplicate totals below
+        if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w];
+        if (c->flag & BAM_FDUP) ++s->n_pdup[w];
+    }
+    // These totals include secondary and supplementary records as well
+    if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w];
+    if (c->flag & BAM_FDUP) ++s->n_dup[w];
+}
  
  bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
  {
@@ -83,8 +92,10 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h)
      while ((ret = sam_read1(fp, h, b)) >= 0)
          flagstat_loop(s, c);
      bam_destroy1(b);
-    if (ret != -1)
-        fprintf(samtools_stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n");
+    if (ret != -1) {
+        free(s);
+        return NULL;
+    }
      return s;
  }
  
@@ -109,17 +120,20 @@ static void usage_exit(FILE *fp, int exit_status)
      fprintf(fp, "  -O, --");
      fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n"
              "               Specify output format (json, tsv)\n");
-    exit(exit_status);
+    samtools_exit(exit_status);
  }
  
  static void out_fmt_default(bam_flagstat_t *s)
  {
      char b0[16], b1[16];
      fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+    fprintf(samtools_stdout, "%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]);
      fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]);
      fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]);
      fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
+    fprintf(samtools_stdout, "%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
      fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+    fprintf(samtools_stdout, "%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
      fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
      fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
      fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
@@ -131,14 +145,18 @@ static void out_fmt_default(bam_flagstat_t *s)
  }
  
  static void out_fmt_json(bam_flagstat_t *s) {
-    char b0[16], b1[16];
+    char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16];
      fprintf(samtools_stdout, "{\n \"QC-passed reads\": { \n"
                   "  \"total\": %lld, \n"
+                 "  \"primary\": %lld, \n"
                   "  \"secondary\": %lld, \n"
                   "  \"supplementary\": %lld, \n"
                   "  \"duplicates\": %lld, \n"
+                 "  \"primary duplicates\": %lld, \n"
                   "  \"mapped\": %lld, \n"
                   "  \"mapped %%\": %s, \n"
+                 "  \"primary mapped\": %lld, \n"
+                 "  \"primary mapped %%\": %s, \n"
                   "  \"paired in sequencing\": %lld, \n"
                   "  \"read1\": %lld, \n"
                   "  \"read2\": %lld, \n"
@@ -152,11 +170,15 @@ static void out_fmt_json(bam_flagstat_t *s) {
                   " },"
              "\n \"QC-failed reads\": { \n"
                   "  \"total\": %lld, \n"
+                 "  \"primary\": %lld, \n"
                   "  \"secondary\": %lld, \n"
                   "  \"supplementary\": %lld, \n"
                   "  \"duplicates\": %lld, \n"
+                 "  \"primary duplicates\": %lld, \n"
                   "  \"mapped\": %lld, \n"
                   "  \"mapped %%\": %s, \n"
+                 "  \"primary mapped\": %lld, \n"
+                 "  \"primary mapped %%\": %s, \n"
                   "  \"paired in sequencing\": %lld, \n"
                   "  \"read1\": %lld, \n"
                   "  \"read2\": %lld, \n"
@@ -170,35 +192,43 @@ static void out_fmt_json(bam_flagstat_t *s) {
                   " }\n"
              "}\n",
          s->n_reads[0],
+        s->n_primary[0],
          s->n_secondary[0],
          s->n_supp[0],
          s->n_dup[0],
+        s->n_pdup[0],
          s->n_mapped[0],
          percent_json(b0, s->n_mapped[0], s->n_reads[0]),
+        s->n_pmapped[0],
+        percent_json(p0, s->n_pmapped[0], s->n_primary[0]),
          s->n_pair_all[0],
          s->n_read1[0],
          s->n_read2[0],
          s->n_pair_good[0],
-        percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]),
+        percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]),
          s->n_pair_map[0],
          s->n_sgltn[0],
-        percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]),
+        percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]),
          s->n_diffchr[0],
          s->n_diffhigh[0],
          s->n_reads[1],
+        s->n_primary[1],
          s->n_secondary[1],
          s->n_supp[1],
          s->n_dup[1],
+        s->n_pdup[1],
          s->n_mapped[1],
          percent_json(b1, s->n_mapped[1], s->n_reads[1]),
+        s->n_pmapped[1],
+        percent_json(p1, s->n_pmapped[1], s->n_primary[1]),
          s->n_pair_all[1],
          s->n_read1[1],
          s->n_read2[1],
          s->n_pair_good[1],
-        percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]),
+        percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]),
          s->n_pair_map[1],
          s->n_sgltn[1],
-        percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]),
+        percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]),
          s->n_diffchr[1],
          s->n_diffhigh[1]
      );
@@ -207,11 +237,15 @@ static void out_fmt_json(bam_flagstat_t *s) {
  static void out_fmt_tsv(bam_flagstat_t *s) {
      char b0[16], b1[16];
      fprintf(samtools_stdout, "%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
+    fprintf(samtools_stdout, "%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]);
+    fprintf(samtools_stdout, "%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]);
      fprintf(samtools_stdout, "%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1]));
+    fprintf(samtools_stdout, "%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]);
+    fprintf(samtools_stdout, "%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1]));
      fprintf(samtools_stdout, "%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]);
      fprintf(samtools_stdout, "%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]);
@@ -244,7 +278,7 @@ int bam_flagstat(int argc, char *argv[])
      sam_hdr_t *header;
      bam_flagstat_t *s;
      const char *out_fmt = "default";
-    int c;
+    int c, status = EXIT_SUCCESS;
  
      enum {
          INPUT_FMT_OPTION = CHAR_MAX+1,
@@ -298,10 +332,17 @@ int bam_flagstat(int argc, char *argv[])
      }
  
      s = bam_flagstat_core(fp, header);
-    output_fmt(s, out_fmt);
-    free(s);
+    if (s) {
+        output_fmt(s, out_fmt);
+        free(s);
+    }
+    else {
+        print_error("flagstat", "error reading from \"%s\"", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      sam_hdr_destroy(header);
      sam_close(fp);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/bamtk.c b/samtools/bamtk.c

index a6959f914571112c61d23fd632405cc7d8d4d9b4..93e646899a939649e7c0f85c841bd9b24bedb729 100644 (file)
--- a/samtools/bamtk.c
+++ b/samtools/bamtk.c
@@ -1,6 +1,6 @@
  /*  bamtk.c -- main samtools command front-end.
  
-    Copyright (C) 2008-2019 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -30,6 +30,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <string.h>
  
  #include "htslib/hts.h"
+#include "htslib/hfile.h"
  #include "samtools.h"
  #include "version.h"
  
@@ -46,7 +47,6 @@ int bam_fillmd(int argc, char *argv[]);
  int bam_idxstats(int argc, char *argv[]);
  int bam_markdup(int argc, char *argv[]);
  int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
  int main_reheader(int argc, char *argv[]);
  int main_cut_target(int argc, char *argv[]);
  int main_phase(int argc, char *argv[]);
@@ -65,12 +65,78 @@ int main_addreplacerg(int argc, char *argv[]);
  int faidx_main(int argc, char *argv[]);
  int dict_main(int argc, char *argv[]);
  int fqidx_main(int argc, char *argv[]);
+int amplicon_clip_main(int argc, char *argv[]);
+int main_ampliconstats(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
  
  const char *samtools_version()
  {
      return SAMTOOLS_VERSION;
  }
  
+// Return a string describing how this samtools binary was built.
+// These come out of the config.h file built by autoconf or Makefile
+//
+// The result is a single string literal assembled by the preprocessor from
+// space-terminated "key=value " tokens:
+//   build=configure  if PACKAGE_URL is defined, otherwise build=Makefile
+//   curses=yes       if HAVE_CURSES is defined, otherwise curses=no
+// Being a string literal, the returned pointer must not be freed or modified.
+const char *samtools_feature_string(void) {
+    const char *fmt =
+
+#ifdef PACKAGE_URL
+    "build=configure "
+#else
+    "build=Makefile "
+#endif
+
+#ifdef HAVE_CURSES
+    "curses=yes "
+#else
+    "curses=no "
+#endif
+    ;
+
+    return fmt;
+}
+
+// Print the detailed version report to stdout: the samtools and htslib
+// version banner, the samtools build settings, the htslib build settings,
+// and finally each htslib file-access plugin with the URL schemes it handles.
+// Returns early, without reporting an error, if either plugin query fails.
+static void long_version(void) {
+    printf("samtools %s\n"
+           "Using htslib %s\n"
+           "Copyright (C) 2021 Genome Research Ltd.\n",
+           samtools_version(), hts_version());
+
+    // The SAMTOOLS_* values are macros defined outside this function
+    // (presumably generated at build time -- not visible here).
+    printf("\nSamtools compilation details:\n");
+    printf("    Features:       %s\n", samtools_feature_string());
+    printf("    CC:             %s\n", SAMTOOLS_CC);
+    printf("    CPPFLAGS:       %s\n", SAMTOOLS_CPPFLAGS);
+    printf("    CFLAGS:         %s\n", SAMTOOLS_CFLAGS);
+    printf("    LDFLAGS:        %s\n", SAMTOOLS_LDFLAGS);
+    printf("    HTSDIR:         %s\n", SAMTOOLS_HTSDIR);
+    printf("    LIBS:           %s\n", SAMTOOLS_LIBS);
+    printf("    CURSES_LIB:     %s\n", SAMTOOLS_CURSES_LIB);
+
+    // The htslib equivalents are queried from the library at run time
+    printf("\nHTSlib compilation details:\n");
+    printf("    Features:       %s\n", hts_feature_string());
+    printf("    CC:             %s\n", hts_test_feature(HTS_FEATURE_CC));
+    printf("    CPPFLAGS:       %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS));
+    printf("    CFLAGS:         %s\n", hts_test_feature(HTS_FEATURE_CFLAGS));
+    printf("    LDFLAGS:        %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS));
+
+    // Plugins and schemes
+    printf("\nHTSlib URL scheme handlers present:\n");
+    const char *plugins[100];
+    // np is passed in as the capacity of plugins[] and then used as the loop
+    // bound, so the callee presumably rewrites it to the number of entries
+    // stored -- TODO confirm against htslib/hfile.h
+    int np = 100, i, j;
+
+    if (hfile_list_plugins(plugins, &np) < 0)
+        return;
+
+    for (i = 0; i < np; i++) {
+        const char *sc_list[100];
+        // Same capacity-in / count-out convention assumed for nschemes
+        int nschemes = 100;
+        if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0)
+            return;
+
+        printf("    %s:\t", plugins[i]);
+        // Indexing ",\n" with (j+1==nschemes) yields ',' after every scheme
+        // except the last, which is followed by '\n' to end the plugin line.
+        // NOTE(review): if nschemes is 0 the loop body never runs, so no
+        // newline is written after the plugin name.
+        for (j = 0; j < nschemes; j++)
+            printf(" %s%c", sc_list[j], ",\n"[j+1==nschemes]);
+    }
+}
+
  static void usage(FILE *fp)
  {
      /* Please improve the grouping */
@@ -96,6 +162,7 @@ static void usage(FILE *fp)
  "     targetcut      cut fosmid regions (for fosmid pool only)\n"
  "     addreplacerg   adds or replaces RG tags\n"
  "     markdup        mark duplicates\n"
+"     ampliconclip   clip oligos from the end of reads\n"
  "\n"
  "  -- File operations\n"
  "     collate        shuffle and group alignments by name\n"
@@ -107,6 +174,7 @@ static void usage(FILE *fp)
  "     quickcheck     quickly check if SAM/BAM/CRAM file appears intact\n"
  "     fastq          converts a BAM to a FASTQ\n"
  "     fasta          converts a BAM to a FASTA\n"
+"     import         Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
  "\n"
  "  -- Statistics\n"
  "     bedcov         read depth per BED region\n"
@@ -116,19 +184,18 @@ static void usage(FILE *fp)
  "     idxstats       BAM index stats\n"
  "     phase          phase heterozygotes\n"
  "     stats          generate stats (former bamcheck)\n"
+"     ampliconstats  generate amplicon specific stats\n"
  "\n"
  "  -- Viewing\n"
  "     flags          explain BAM flags\n"
  "     tview          text alignment viewer\n"
  "     view           SAM<->BAM<->CRAM conversion\n"
  "     depad          convert padded BAM to unpadded BAM\n"
+"\n"
+"  -- Misc\n"
+"     help [cmd]     display this help message or help for [cmd]\n"
+"     version        detailed version information\n"
  "\n");
-#ifdef _WIN32
-    fprintf(fp,
-"Note: The Windows version of SAMtools is mainly designed for read-only\n"
-"      operations, such as viewing the alignments and generating the pileup.\n"
-"      Binary files generated by the Windows version may be buggy.\n\n");
-#endif
  }
  
  // This is a tricky one, but on Windows the filename wildcard expansion is done by
@@ -176,6 +243,7 @@ int main(int argc, char *argv[])
      else if (strcmp(argv[1], "fixmate") == 0)   ret = bam_mating(argc-1, argv+1);
      else if (strcmp(argv[1], "rmdup") == 0)     ret = bam_rmdup(argc-1, argv+1);
      else if (strcmp(argv[1], "markdup") == 0)   ret = bam_markdup(argc-1, argv+1);
+    else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1);
      else if (strcmp(argv[1], "flagstat") == 0 ||
               strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1);
      else if (strcmp(argv[1], "calmd") == 0)     ret = bam_fillmd(argc-1, argv+1);
@@ -206,12 +274,10 @@ int main(int argc, char *argv[])
          return 1;
      }
      else if (strcmp(argv[1], "tview") == 0)   ret = bam_tview_main(argc-1, argv+1);
-    else if (strcmp(argv[1], "--version") == 0) {
-        printf(
-"samtools %s\n"
-"Using htslib %s\n"
-"Copyright (C) 2019 Genome Research Ltd.\n",
-               samtools_version(), hts_version());
+    else if (strcmp(argv[1], "ampliconstats") == 0)     ret = main_ampliconstats(argc-1, argv+1);
+    else if (strcmp(argv[1], "version") == 0 || \
+             strcmp(argv[1], "--version") == 0) {
+        long_version();
      }
      else if (strcmp(argv[1], "--version-only") == 0) {
          printf("%s+htslib-%s\n", samtools_version(), hts_version());
diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c

index 91c29b87d4110e667b1a23c3c7798caf1740c4ce..dfb2cdd0925a682a227344f4486ab41bd8ca7cbd 100644 (file)
--- a/samtools/bamtk.c.pysam.c
+++ b/samtools/bamtk.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  bamtk.c -- main samtools command front-end.
  
-    Copyright (C) 2008-2019 Genome Research Ltd.
+    Copyright (C) 2008-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -32,8 +32,10 @@ DEALINGS IN THE SOFTWARE.  */
  #include <string.h>
  
  #include "htslib/hts.h"
+#include "htslib/hfile.h"
  #include "samtools.h"
  #include "version.h"
+#include "samtools_config_vars.h"
  
  int bam_taf2baf(int argc, char *argv[]);
  int bam_mpileup(int argc, char *argv[]);
@@ -48,8 +50,7 @@ int bam_fillmd(int argc, char *argv[]);
  int bam_idxstats(int argc, char *argv[]);
  int bam_markdup(int argc, char *argv[]);
  int main_samview(int argc, char *argv[]);
-int main_import(int argc, char *argv[]);
-int main_reheader(int argc, char *argv[]);
+int samtools_main_reheader(int argc, char *argv[]);
  int main_cut_target(int argc, char *argv[]);
  int main_phase(int argc, char *argv[]);
  int main_cat(int argc, char *argv[]);
@@ -67,12 +68,78 @@ int main_addreplacerg(int argc, char *argv[]);
  int faidx_main(int argc, char *argv[]);
  int dict_main(int argc, char *argv[]);
  int fqidx_main(int argc, char *argv[]);
+int amplicon_clip_main(int argc, char *argv[]);
+int main_ampliconstats(int argc, char *argv[]);
+int main_import(int argc, char *argv[]);
  
  const char *samtools_version()
  {
      return SAMTOOLS_VERSION;
  }
  
+// Return a string describing how this samtools build was configured.
+// These come out of the config.h file built by autoconf or Makefile
+//
+// The result is a single string literal assembled by the preprocessor from
+// space-terminated "key=value " tokens:
+//   build=configure  if PACKAGE_URL is defined, otherwise build=Makefile
+//   curses=yes       if HAVE_CURSES is defined, otherwise curses=no
+// Being a string literal, the returned pointer must not be freed or modified.
+const char *samtools_feature_string(void) {
+    const char *fmt =
+
+#ifdef PACKAGE_URL
+    "build=configure "
+#else
+    "build=Makefile "
+#endif
+
+#ifdef HAVE_CURSES
+    "curses=yes "
+#else
+    "curses=no "
+#endif
+    ;
+
+    return fmt;
+}
+
+// Print the detailed version report to samtools_stdout: the samtools and
+// htslib version banner, the samtools build settings, the htslib build
+// settings, and finally each htslib file-access plugin with the URL schemes
+// it handles.
+// Returns early, without reporting an error, if either plugin query fails.
+static void long_version(void) {
+    fprintf(samtools_stdout, "samtools %s\n"
+           "Using htslib %s\n"
+           "Copyright (C) 2021 Genome Research Ltd.\n",
+           samtools_version(), hts_version());
+
+    // The SAMTOOLS_* values are macros defined outside this function
+    // (presumably by samtools_config_vars.h -- confirm).
+    fprintf(samtools_stdout, "\nSamtools compilation details:\n");
+    fprintf(samtools_stdout, "    Features:       %s\n", samtools_feature_string());
+    fprintf(samtools_stdout, "    CC:             %s\n", SAMTOOLS_CC);
+    fprintf(samtools_stdout, "    CPPFLAGS:       %s\n", SAMTOOLS_CPPFLAGS);
+    fprintf(samtools_stdout, "    CFLAGS:         %s\n", SAMTOOLS_CFLAGS);
+    fprintf(samtools_stdout, "    LDFLAGS:        %s\n", SAMTOOLS_LDFLAGS);
+    fprintf(samtools_stdout, "    HTSDIR:         %s\n", SAMTOOLS_HTSDIR);
+    fprintf(samtools_stdout, "    LIBS:           %s\n", SAMTOOLS_LIBS);
+    fprintf(samtools_stdout, "    CURSES_LIB:     %s\n", SAMTOOLS_CURSES_LIB);
+
+    // The htslib equivalents are queried from the library at run time
+    fprintf(samtools_stdout, "\nHTSlib compilation details:\n");
+    fprintf(samtools_stdout, "    Features:       %s\n", hts_feature_string());
+    fprintf(samtools_stdout, "    CC:             %s\n", hts_test_feature(HTS_FEATURE_CC));
+    fprintf(samtools_stdout, "    CPPFLAGS:       %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS));
+    fprintf(samtools_stdout, "    CFLAGS:         %s\n", hts_test_feature(HTS_FEATURE_CFLAGS));
+    fprintf(samtools_stdout, "    LDFLAGS:        %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS));
+
+    // Plugins and schemes
+    fprintf(samtools_stdout, "\nHTSlib URL scheme handlers present:\n");
+    const char *plugins[100];
+    // np is passed in as the capacity of plugins[] and then used as the loop
+    // bound, so the callee presumably rewrites it to the number of entries
+    // stored -- TODO confirm against htslib/hfile.h
+    int np = 100, i, j;
+
+    if (hfile_list_plugins(plugins, &np) < 0)
+        return;
+
+    for (i = 0; i < np; i++) {
+        const char *sc_list[100];
+        // Same capacity-in / count-out convention assumed for nschemes
+        int nschemes = 100;
+        if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0)
+            return;
+
+        fprintf(samtools_stdout, "    %s:\t", plugins[i]);
+        // Indexing ",\n" with (j+1==nschemes) yields ',' after every scheme
+        // except the last, which is followed by '\n' to end the plugin line.
+        // NOTE(review): if nschemes is 0 the loop body never runs, so no
+        // newline is written after the plugin name.
+        for (j = 0; j < nschemes; j++)
+            fprintf(samtools_stdout, " %s%c", sc_list[j], ",\n"[j+1==nschemes]);
+    }
+}
+
  static void usage(FILE *fp)
  {
      /* Please improve the grouping */
@@ -98,6 +165,7 @@ static void usage(FILE *fp)
  "     targetcut      cut fosmid regions (for fosmid pool only)\n"
  "     addreplacerg   adds or replaces RG tags\n"
  "     markdup        mark duplicates\n"
+"     ampliconclip   clip oligos from the end of reads\n"
  "\n"
  "  -- File operations\n"
  "     collate        shuffle and group alignments by name\n"
@@ -109,6 +177,7 @@ static void usage(FILE *fp)
  "     quickcheck     quickly check if SAM/BAM/CRAM file appears intact\n"
  "     fastq          converts a BAM to a FASTQ\n"
  "     fasta          converts a BAM to a FASTA\n"
+"     import         Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
  "\n"
  "  -- Statistics\n"
  "     bedcov         read depth per BED region\n"
@@ -118,19 +187,18 @@ static void usage(FILE *fp)
  "     idxstats       BAM index stats\n"
  "     phase          phase heterozygotes\n"
  "     stats          generate stats (former bamcheck)\n"
+"     ampliconstats  generate amplicon specific stats\n"
  "\n"
  "  -- Viewing\n"
  "     flags          explain BAM flags\n"
  "     tview          text alignment viewer\n"
  "     view           SAM<->BAM<->CRAM conversion\n"
  "     depad          convert padded BAM to unpadded BAM\n"
+"\n"
+"  -- Misc\n"
+"     help [cmd]     display this help message or help for [cmd]\n"
+"     version        detailed version information\n"
  "\n");
-#ifdef _WIN32
-    fprintf(fp,
-"Note: The Windows version of SAMtools is mainly designed for read-only\n"
-"      operations, such as viewing the alignments and generating the pileup.\n"
-"      Binary files generated by the Windows version may be buggy.\n\n");
-#endif
  }
  
  // This is a tricky one, but on Windows the filename wildcard expansion is done by
@@ -178,11 +246,12 @@ int samtools_main(int argc, char *argv[])
      else if (strcmp(argv[1], "fixmate") == 0)   ret = bam_mating(argc-1, argv+1);
      else if (strcmp(argv[1], "rmdup") == 0)     ret = bam_rmdup(argc-1, argv+1);
      else if (strcmp(argv[1], "markdup") == 0)   ret = bam_markdup(argc-1, argv+1);
+    else if (strcmp(argv[1], "ampliconclip") == 0) ret = amplicon_clip_main(argc-1, argv+1);
      else if (strcmp(argv[1], "flagstat") == 0 ||
               strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1);
      else if (strcmp(argv[1], "calmd") == 0)     ret = bam_fillmd(argc-1, argv+1);
      else if (strcmp(argv[1], "fillmd") == 0)    ret = bam_fillmd(argc-1, argv+1);
-    else if (strcmp(argv[1], "reheader") == 0)  ret = main_reheader(argc-1, argv+1);
+    else if (strcmp(argv[1], "reheader") == 0)  ret = samtools_main_reheader(argc-1, argv+1);
      else if (strcmp(argv[1], "cat") == 0)       ret = main_cat(argc-1, argv+1);
      else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1);
      else if (strcmp(argv[1], "phase") == 0)     ret = main_phase(argc-1, argv+1);
@@ -208,12 +277,10 @@ int samtools_main(int argc, char *argv[])
          return 1;
      }
      //else if (strcmp(argv[1], "tview") == 0)   ret = bam_tview_main(argc-1, argv+1);
-    else if (strcmp(argv[1], "--version") == 0) {
-        fprintf(samtools_stdout, 
-"samtools %s\n"
-"Using htslib %s\n"
-"Copyright (C) 2019 Genome Research Ltd.\n",
-               samtools_version(), hts_version());
+    else if (strcmp(argv[1], "ampliconstats") == 0)     ret = main_ampliconstats(argc-1, argv+1);
+    else if (strcmp(argv[1], "version") == 0 || \
+             strcmp(argv[1], "--version") == 0) {
+        long_version();
      }
      else if (strcmp(argv[1], "--version-only") == 0) {
          fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version());
diff --git a/samtools/bedcov.c b/samtools/bedcov.c

index a36d6725990deb031d7137af65ecd9c26d07864b..bccc09ba12fa7f312ce4b5c8aee818649c0dbadf 100644 (file)
--- a/samtools/bedcov.c
+++ b/samtools/bedcov.c
@@ -1,7 +1,7 @@
  /*  bedcov.c -- bedcov subcommand.
  
      Copyright (C) 2012 Broad Institute.
-    Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -40,11 +40,14 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/kseq.h"
  KSTREAM_INIT(gzFile, gzread, 16384)
  
+#define DEFAULT_DEPTH 64000
+
  typedef struct {  // reader state consulted by read_bam() when fetching the next usable record
      htsFile *fp;  // input handle that records are read from
      sam_hdr_t *header;  // header passed to sam_read1() when no iterator is in use
      hts_itr_t *iter;  // when non-NULL, read_bam() uses sam_itr_next() instead of sam_read1()
      int min_mapQ;  // records whose core.qual is below this value are skipped
+    uint32_t flags;  // read filtering flags: a record with any of these flag bits set is skipped
  } aux_t;
  
  static int read_bam(void *data, bam1_t *b)
@@ -55,7 +58,7 @@ static int read_bam(void *data, bam1_t *b)
      {
          ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
          if ( ret<0 ) break;
-        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( b->core.flag & aux->flags ) continue;
          if ( (int)b->core.qual < aux->min_mapQ ) continue;
          break;
      }
@@ -69,10 +72,12 @@ int main_bedcov(int argc, char *argv[])
      kstream_t *ks;
      hts_idx_t **idx;
      aux_t **aux;
-    int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0;
-    int64_t *cnt;
+    int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
+    int64_t *cnt, *pcov = NULL;;
      const bam_pileup1_t **plp;
      int usage = 0, has_index_file = 0;
+    uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
+    int tflags = 0, min_depth = -1;
  
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      static const struct option lopts[] = {
@@ -80,11 +85,28 @@ int main_bedcov(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
          switch (c) {
          case 'Q': min_mapQ = atoi(optarg); break;
          case 'X': has_index_file = 1; break;
+        case 'g':
+            tflags = bam_str2flag(optarg);
+            if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+                print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+                return 1;
+            }
+            flags &= ~tflags;
+            break;
+        case 'G':
+            tflags = bam_str2flag(optarg);
+            if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+                print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+                return 1;
+            }
+            flags |= tflags;
+            break;
          case 'j': skip_DN = 1; break;
+        case 'd': min_depth = atoi(optarg); break;
          default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                    /* else fall-through */
          case '?': usage = 1; break;
@@ -96,7 +118,12 @@ int main_bedcov(int argc, char *argv[])
          fprintf(stderr, "Options:\n");
          fprintf(stderr, "      -Q <int>            mapping quality threshold [0]\n");
          fprintf(stderr, "      -X                  use customized index files\n");
+        fprintf(stderr, "      -g <flags>          remove the specified flags from the set used to filter out reads\n");
+        fprintf(stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
+                        "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
          fprintf(stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
+        fprintf(stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
+                        "                          including this value will be displayed in a separate column\n");
          sam_global_opt_help(stderr, "-.--.--.");
          return 1;
      }
@@ -136,8 +163,11 @@ int main_bedcov(int argc, char *argv[])
                      argv[i+optind+1]);
              return 2;
          }
+        aux[i]->flags = flags;
      }
-    cnt = calloc(n, 8);
+    cnt = calloc(n, sizeof(*cnt));
+    if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
+    if (!cnt || (min_depth >= 0 && !pcov)) return 2;
  
      fp = gzopen(argv[optind], "rb");
      if (fp == NULL) {
@@ -149,7 +179,8 @@ int main_bedcov(int argc, char *argv[])
      plp = calloc(n, sizeof(bam_pileup1_t*));
      while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
          char *p, *q;
-        int tid, beg, end, pos;
+        int tid, pos, num = 0;
+        int64_t beg = 0, end = 0;
          bam_mplp_t mplp;
  
          if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
@@ -158,53 +189,75 @@ int main_bedcov(int argc, char *argv[])
             be followed by a tab in that case). */
          if (strncmp(str.s, "track ", 6) == 0) continue;
          if (strncmp(str.s, "browser ", 8) == 0) continue;
-        for (p = q = str.s; *p && *p != '\t'; ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
+        for (p = q = str.s; *p && !isspace(*p); ++p);
+        if (*p == 0) goto bed_error;
+        char c = *p;
+        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
          if (tid < 0) goto bed_error;
-        for (q = p = p + 1; isdigit(*p); ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; beg = atoi(q); *p = '\t';
-        for (q = p = p + 1; isdigit(*p); ++p);
-        if (*p == '\t' || *p == 0) {
-            int c = *p;
-            *p = 0; end = atoi(q); *p = c;
-        } else goto bed_error;
+        num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
+        if (num < 2 || end < beg) goto bed_error;
  
          for (i = 0; i < n; ++i) {
              if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
              aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
          }
+
          mplp = bam_mplp_init(n, read_bam, (void**)aux);
-        bam_mplp_set_maxcnt(mplp, 64000);
-        memset(cnt, 0, 8 * n);
-        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+        if (min_depth > DEFAULT_DEPTH)
+            bam_mplp_set_maxcnt(mplp, min_depth);
+        else
+            bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+
+        memset(cnt, 0, sizeof(*cnt) * n);
+        if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+
+        while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
              if (pos >= beg && pos < end) {
-                for (i = 0, m = 0; i < n; ++i) {
-                    if (skip_DN)
+                for (i = 0; i < n; ++i) {
+                    m = 0;
+                    if (skip_DN || min_depth >= 0) {
                          for (j = 0; j < n_plp[i]; ++j) {
                              const bam_pileup1_t *pi = plp[i] + j;
                              if (pi->is_del || pi->is_refskip) ++m;
                          }
-                    cnt[i] += n_plp[i] - m;
+                    }
+                    int pd = n_plp[i] - m;
+                    cnt[i] += pd;
+                    if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
                  }
              }
+
+        if (ret < 0) {
+            print_error("bedcov", "error reading from input file");
+            status = 2;
+            bam_mplp_destroy(mplp);
+            break;
+        }
+
          for (i = 0; i < n; ++i) {
              kputc('\t', &str);
              kputl(cnt[i], &str);
          }
+        if (min_depth >= 0) {
+            for (i = 0; i < n; ++i) {
+                kputc('\t', &str);
+                kputl(pcov[i], &str);
+            }
+        }
          puts(str.s);
          bam_mplp_destroy(mplp);
          continue;
  
  bed_error:
          fprintf(stderr, "Errors in BED line '%s'\n", str.s);
+        status = 2;
      }
      free(n_plp); free(plp);
      ks_destroy(ks);
      gzclose(fp);
  
      free(cnt);
+    free(pcov);
      for (i = 0; i < n; ++i) {
          if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
          hts_idx_destroy(idx[i]);
@@ -215,5 +268,5 @@ bed_error:
      free(aux); free(idx);
      free(str.s);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c

index 82b63aaf408f8e50661e5bb7e969ac4e052c48b6..b72cbf1506277ab9fcb450d38479df393a8a68b8 100644 (file)
--- a/samtools/bedcov.c.pysam.c
+++ b/samtools/bedcov.c.pysam.c
@@ -3,7 +3,7 @@
  /*  bedcov.c -- bedcov subcommand.
  
      Copyright (C) 2012 Broad Institute.
-    Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd.
+    Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
  
      Author: Heng Li <lh3@sanger.ac.uk>
  
@@ -42,11 +42,14 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/kseq.h"
  KSTREAM_INIT(gzFile, gzread, 16384)
  
+#define DEFAULT_DEPTH 64000
+
  typedef struct {
      htsFile *fp;
      sam_hdr_t *header;
      hts_itr_t *iter;
      int min_mapQ;
+    uint32_t flags;  // read filtering flags
  } aux_t;
  
  static int read_bam(void *data, bam1_t *b)
@@ -57,7 +60,7 @@ static int read_bam(void *data, bam1_t *b)
      {
          ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b);
          if ( ret<0 ) break;
-        if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
+        if ( b->core.flag & aux->flags ) continue;
          if ( (int)b->core.qual < aux->min_mapQ ) continue;
          break;
      }
@@ -71,10 +74,12 @@ int main_bedcov(int argc, char *argv[])
      kstream_t *ks;
      hts_idx_t **idx;
      aux_t **aux;
-    int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0;
-    int64_t *cnt;
+    int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
+    int64_t *cnt, *pcov = NULL;;
      const bam_pileup1_t **plp;
      int usage = 0, has_index_file = 0;
+    uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
+    int tflags = 0, min_depth = -1;
  
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      static const struct option lopts[] = {
@@ -82,11 +87,28 @@ int main_bedcov(int argc, char *argv[])
          { NULL, 0, NULL, 0 }
      };
  
-    while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) {
+    while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
          switch (c) {
          case 'Q': min_mapQ = atoi(optarg); break;
          case 'X': has_index_file = 1; break;
+        case 'g':
+            tflags = bam_str2flag(optarg);
+            if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+                print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+                return 1;
+            }
+            flags &= ~tflags;
+            break;
+        case 'G':
+            tflags = bam_str2flag(optarg);
+            if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
+                print_error("bedcov", "Flag value \"%s\" is not supported", optarg);
+                return 1;
+            }
+            flags |= tflags;
+            break;
          case 'j': skip_DN = 1; break;
+        case 'd': min_depth = atoi(optarg); break;
          default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                    /* else fall-through */
          case '?': usage = 1; break;
@@ -98,7 +120,12 @@ int main_bedcov(int argc, char *argv[])
          fprintf(samtools_stderr, "Options:\n");
          fprintf(samtools_stderr, "      -Q <int>            mapping quality threshold [0]\n");
          fprintf(samtools_stderr, "      -X                  use customized index files\n");
+        fprintf(samtools_stderr, "      -g <flags>          remove the specified flags from the set used to filter out reads\n");
+        fprintf(samtools_stderr, "      -G <flags>          add the specified flags to the set used to filter out reads\n"
+                        "                          The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
          fprintf(samtools_stderr, "      -j                  do not include deletions (D) and ref skips (N) in bedcov computation\n");
+        fprintf(samtools_stderr, "      -d <int>            depth threshold. Number of reference bases with coverage above and"
+                        "                          including this value will be displayed in a separate column\n");
          sam_global_opt_help(samtools_stderr, "-.--.--.");
          return 1;
      }
@@ -138,8 +165,11 @@ int main_bedcov(int argc, char *argv[])
                      argv[i+optind+1]);
              return 2;
          }
+        aux[i]->flags = flags;
      }
-    cnt = calloc(n, 8);
+    cnt = calloc(n, sizeof(*cnt));
+    if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
+    if (!cnt || (min_depth >= 0 && !pcov)) return 2;
  
      fp = gzopen(argv[optind], "rb");
      if (fp == NULL) {
@@ -151,7 +181,8 @@ int main_bedcov(int argc, char *argv[])
      plp = calloc(n, sizeof(bam_pileup1_t*));
      while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
          char *p, *q;
-        int tid, beg, end, pos;
+        int tid, pos, num = 0;
+        int64_t beg = 0, end = 0;
          bam_mplp_t mplp;
  
          if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */
@@ -160,53 +191,75 @@ int main_bedcov(int argc, char *argv[])
             be followed by a tab in that case). */
          if (strncmp(str.s, "track ", 6) == 0) continue;
          if (strncmp(str.s, "browser ", 8) == 0) continue;
-        for (p = q = str.s; *p && *p != '\t'; ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
+        for (p = q = str.s; *p && !isspace(*p); ++p);
+        if (*p == 0) goto bed_error;
+        char c = *p;
+        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = c;
          if (tid < 0) goto bed_error;
-        for (q = p = p + 1; isdigit(*p); ++p);
-        if (*p != '\t') goto bed_error;
-        *p = 0; beg = atoi(q); *p = '\t';
-        for (q = p = p + 1; isdigit(*p); ++p);
-        if (*p == '\t' || *p == 0) {
-            int c = *p;
-            *p = 0; end = atoi(q); *p = c;
-        } else goto bed_error;
+        num = sscanf(p + 1, "%"SCNd64" %"SCNd64, &beg, &end);
+        if (num < 2 || end < beg) goto bed_error;
  
          for (i = 0; i < n; ++i) {
              if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
              aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
          }
+
          mplp = bam_mplp_init(n, read_bam, (void**)aux);
-        bam_mplp_set_maxcnt(mplp, 64000);
-        memset(cnt, 0, 8 * n);
-        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+        if (min_depth > DEFAULT_DEPTH)
+            bam_mplp_set_maxcnt(mplp, min_depth);
+        else
+            bam_mplp_set_maxcnt(mplp, DEFAULT_DEPTH);
+
+        memset(cnt, 0, sizeof(*cnt) * n);
+        if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+
+        while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
              if (pos >= beg && pos < end) {
-                for (i = 0, m = 0; i < n; ++i) {
-                    if (skip_DN)
+                for (i = 0; i < n; ++i) {
+                    m = 0;
+                    if (skip_DN || min_depth >= 0) {
                          for (j = 0; j < n_plp[i]; ++j) {
                              const bam_pileup1_t *pi = plp[i] + j;
                              if (pi->is_del || pi->is_refskip) ++m;
                          }
-                    cnt[i] += n_plp[i] - m;
+                    }
+                    int pd = n_plp[i] - m;
+                    cnt[i] += pd;
+                    if (min_depth >= 0 && pd >= min_depth) pcov[i]++;
                  }
              }
+
+        if (ret < 0) {
+            print_error("bedcov", "error reading from input file");
+            status = 2;
+            bam_mplp_destroy(mplp);
+            break;
+        }
+
          for (i = 0; i < n; ++i) {
              kputc('\t', &str);
              kputl(cnt[i], &str);
          }
+        if (min_depth >= 0) {
+            for (i = 0; i < n; ++i) {
+                kputc('\t', &str);
+                kputl(pcov[i], &str);
+            }
+        }
          samtools_puts(str.s);
          bam_mplp_destroy(mplp);
          continue;
  
  bed_error:
          fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s);
+        status = 2;
      }
      free(n_plp); free(plp);
      ks_destroy(ks);
      gzclose(fp);
  
      free(cnt);
+    free(pcov);
      for (i = 0; i < n; ++i) {
          if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
          hts_idx_destroy(idx[i]);
@@ -217,5 +270,5 @@ bed_error:
      free(aux); free(idx);
      free(str.s);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/bedidx.c b/samtools/bedidx.c

index ded2314782a9357cad1241f61dad48472e76a0d3..6b22d4efce1f9a8aa5c499a8fc94a136aff1a4f6 100644 (file)
--- a/samtools/bedidx.c
+++ b/samtools/bedidx.c
@@ -573,6 +573,14 @@ const char* bed_get(void *reg_hash, int i, int filter) {
      return kh_key(h, i);
  }
  
+/**
+ * Create a region list from the region hash table
+ * @param  reg_hash  The region hash table
+ * @param  filter    0 - allow all regions, 1 - allow only selected regions
+ * @param  n_reg     Pointer to the returned region number
+ * @return           The regions list as a hts_reglist_t
+ */
+
  hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) {
  
      reghash_t *h;
diff --git a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c

index 027e08e032ab900ceef5075bc5182bdb943c6f10..533b42a92c202b0d367a169cbeff9c8f6734fcf3 100644 (file)
--- a/samtools/bedidx.c.pysam.c
+++ b/samtools/bedidx.c.pysam.c
@@ -575,6 +575,14 @@ const char* bed_get(void *reg_hash, int i, int filter) {
      return kh_key(h, i);
  }
  
+/**
+ * Create a region list from the region hash table
+ * @param  reg_hash  The region hash table
+ * @param  filter    0 - allow all regions, 1 - allow only selected regions
+ * @param  n_reg     Pointer to the returned region number
+ * @return           The regions list as a hts_reglist_t
+ */
+
  hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) {
  
      reghash_t *h;
diff --git a/samtools/coverage.c b/samtools/coverage.c

index c4f38de703e3a3bc4577b600d58cc86264179d29..cab1f8b2e2fd62cf62ec4298141b2b0d3e7b2627 100644 (file)
--- a/samtools/coverage.c
+++ b/samtools/coverage.c
@@ -1,7 +1,7 @@
  /* coverage.c -- samtools coverage subcommand
  
      Copyright (C) 2018,2019 Florian Breitwieser
-    Portions copyright (C) 2019 Genome Research Ltd.
+    Portions copyright (C) 2019-2021 Genome Research Ltd.
  
      Author: Florian P Breitwieser <florian.bw@gmail.com>
  
@@ -24,7 +24,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.  */
  
  /* This program calculates coverage from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
   * To compile this program separately, you may:
   *
   *   gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
@@ -57,19 +57,6 @@ DEALINGS IN THE SOFTWARE.  */
  
  const char *VERSION = "0.1";
  
-typedef struct {  // auxiliary data structure to hold a BAM file
-    samFile *fp;     // file handle
-    sam_hdr_t *hdr;  // file header
-    hts_itr_t *iter; // iterator to a region - NULL for us by default
-    int min_mapQ;    // mapQ filter
-    int min_len;     // length filter
-    unsigned int n_reads;  // records the number of reads seen in file
-    unsigned int n_selected_reads; // records the number of reads passing filter
-    unsigned long summed_mapQ; // summed mapQ of all reads passing filter
-    int fail_flags;
-    int required_flags;
-} bam_aux_t;
-
  typedef struct {  // auxiliary data structure to hold stats on coverage
      unsigned long long n_covered_bases;
      unsigned long long summed_coverage;
@@ -77,12 +64,23 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
      unsigned long long summed_mapQ;
      unsigned int n_reads;
      unsigned int n_selected_reads;
-    int32_t tid;    // chromosome ID, defined by header
+    bool covered;
      hts_pos_t beg;
      hts_pos_t end;
      int64_t bin_width;
  } stats_aux_t;
  
+typedef struct {  // auxiliary data structure to hold a BAM file
+    samFile *fp;     // file handle
+    sam_hdr_t *hdr;  // file header
+    hts_itr_t *iter; // iterator to a region - NULL for us by default
+    int min_mapQ;    // mapQ filter
+    int min_len;     // length filter
+    int fail_flags;
+    int required_flags;
+    stats_aux_t *stats;
+} bam_aux_t;
+
  #if __STDC_VERSION__ >= 199901L
  #define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
  
@@ -91,7 +89,7 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
  // LOWER ONE EIGHTH BLOCK … FULL BLOCK
  static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
  // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
-static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
  
  #else
  
@@ -102,7 +100,7 @@ static const char *const BLOCK_CHARS8[8] = {
      "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
      "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
  
-static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
  
  #endif
  
@@ -114,11 +112,14 @@ static int usage() {
              "Input options:\n"
              "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
              "  -l, --min-read-len INT  ignore reads shorter than INT bp [0]\n"
-            "  -q, --min-MQ INT        base quality threshold [0]\n"
-            "  -Q, --min-BQ INT        mapping quality threshold [0]\n"
+            "  -q, --min-MQ INT        mapping quality threshold [0]\n"
+            "  -Q, --min-BQ INT        base quality threshold [0]\n"
              "  --rf <int|str>          required flags: skip reads with mask bits unset []\n"
              "  --ff <int|str>          filter flags: skip reads with mask bits set \n"
              "                                      [UNMAP,SECONDARY,QCFAIL,DUP]\n"
+            "  -d, --depth INT         maximum allowed coverage depth [1000000].\n"
+            "                          If 0, depth is set to the maximum integer value,\n"
+            "                          effectively removing any depth limit.\n"
              "Output options:\n"
              "  -m, --histogram         show histogram instead of tabular output\n"
              "  -A, --ascii             show only ASCII characters in histogram\n"
@@ -171,79 +172,63 @@ static char* readable_bps(double base_pairs, char *buf) {
      return buf;
  }
  
-static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) {
-    int i;
-    stats->n_reads = 0;
-    stats->n_selected_reads = 0;
-    stats->summed_mapQ = 0;
-    for (i = 0; i < n_bam_files && data[i]; ++i) {
-        stats->n_reads += data[i]->n_reads;
-        stats->n_selected_reads += data[i]->n_selected_reads;
-        stats->summed_mapQ += data[i]->summed_mapQ;
-        data[i]->n_reads = 0;
-        data[i]->n_selected_reads = 0;
-        data[i]->summed_mapQ = 0;
-    }
-}
-
  // read one alignment from one BAM file
  static int read_bam(void *data, bam1_t *b) {
      bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
+    int nref = sam_hdr_nref(aux->hdr);
      int ret;
      while (1) {
          if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
-        ++aux->n_reads;
+        if (b->core.tid >= 0 && b->core.tid < nref)
+            aux->stats[b->core.tid].n_reads++;
  
          if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
          if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
          if ( b->core.qual < aux->min_mapQ ) continue;
          if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
-        ++aux->n_selected_reads;
-        aux->summed_mapQ += b->core.qual;
+        if (b->core.tid >= 0 && b->core.tid < nref) {
+            aux->stats[b->core.tid].n_selected_reads++;
+            aux->stats[b->core.tid].summed_mapQ += b->core.qual;
+        }
          break;
      }
      return ret;
  }
  
-void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) {
-    fputs(sam_hdr_tid2name(h, stats->tid), file_out);
-    double region_len = (double) stats->end - stats->beg;
+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) {
+    fputs(sam_hdr_tid2name(h, tid), file_out);
+    double region_len = (double) stats[tid].end - stats[tid].beg;
      fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
-            stats->beg+1,
-            stats->end,
-            stats->n_selected_reads,
-            stats->n_covered_bases,
-            100.0 * stats->n_covered_bases / region_len,
-            stats->summed_coverage / region_len,
-            stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0,
-            stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0
+            stats[tid].beg+1,
+            stats[tid].end,
+            stats[tid].n_selected_reads,
+            stats[tid].n_covered_bases,
+            100.0 * stats[tid].n_covered_bases / region_len,
+            stats[tid].summed_coverage / region_len,
+            stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
+            stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
             );
  }
  
-void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist,
+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
          const int hist_size, const bool full_utf) {
      int i, col;
      bool show_percentiles = false;
      const int n_rows = 10;
      const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
      const int blockchar_len = full_utf? 8 : 2;
-    /*
-       if (stats->beg == 0) {
-       stats->end = h->target_len[stats->tid];
-       }
-       */
-    double region_len = stats->end - stats->beg;
+    double region_len = stats[tid].end - stats[tid].beg;
  
      // Calculate histogram that contains percent covered
      double hist_data[hist_size];
      double max_val = 0.0;
      for (i = 0; i < hist_size; ++i) {
-        hist_data[i] = 100 * hist[i] / (double) stats->bin_width;
+        hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
          if (hist_data[i] > max_val) max_val = hist_data[i];
      }
  
      char buf[30];
-    fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf));
+    fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
  
      double row_bin_size = max_val / (double) n_rows;
      for (i = n_rows-1; i >= 0; --i) {
@@ -253,7 +238,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
          } else {
              fprintf(file_out, ">%7.2f%% ", current_bin);
          }
-        fprintf(file_out, VERTICAL_LINE);
+        fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
          for (col = 0; col < hist_size; ++col) {
              // get the difference in eights, or halfs when full UTF8 is not supported
              int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
@@ -266,22 +251,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
                  fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
              }
          }
-        fprintf(file_out, VERTICAL_LINE);
+        fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
          fputc(' ', file_out);
          switch (i) {
-            case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break;
-            case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, "    (%i filtered)", stats->n_reads - stats->n_selected_reads); break;
-            case 7: fprintf(file_out, "Covered bases:   %sbp", readable_bps(stats->n_covered_bases, buf)); break;
+            case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+            case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, "    (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
+            case 7: fprintf(file_out, "Covered bases:   %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
              case 6: fprintf(file_out, "Percent covered: %.4g%%",
-                            100.0 * stats->n_covered_bases / region_len); break;
+                            100.0 * stats[tid].n_covered_bases / region_len); break;
              case 5: fprintf(file_out, "Mean coverage:   %.3gx",
-                            stats->summed_coverage / region_len); break;
+                            stats[tid].summed_coverage / region_len); break;
              case 4: fprintf(file_out, "Mean baseQ:      %.3g",
-                            stats->summed_baseQ/(double) stats->summed_coverage); break;
+                            stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
              case 3: fprintf(file_out, "Mean mapQ:       %.3g",
-                            stats->summed_mapQ/(double) stats->n_selected_reads); break;
+                            stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
              case 1: fprintf(file_out, "Histo bin width: %sbp",
-                            readable_bps(stats->bin_width, buf)); break;
+                            readable_bps(stats[tid].bin_width, buf)); break;
              case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
          };
          fputc('\n', file_out);
@@ -290,22 +275,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
      // print x axis. Could be made pretty for widths that are not divisible
      // by 10 by variable spacing of the labels, instead of placing a label every 10 characters
      char buf2[50];
-    fprintf(file_out, "     %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10));
+    fprintf(file_out, "     %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
      int rest;
      for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
-        fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10));
+        fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
      }
      int last_padding = hist_size%10;
-    fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10));
+    fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
      fprintf(file_out, "\n");
  }
  
  int main_coverage(int argc, char *argv[]) {
      int status = EXIT_SUCCESS;
  
-    int ret, tid, pos, i, j;
+    int ret, tid = -1, old_tid = -1, pos, i, j;
  
-    int max_depth = 0;
+    int max_depth = 1000000;
      int opt_min_baseQ = 0;
      int opt_min_mapQ = 0;
      int opt_min_len = 0;
@@ -330,7 +315,6 @@ int main_coverage(int argc, char *argv[]) {
      bool opt_print_header = true;
      bool opt_print_tabular = true;
      bool opt_print_histogram = false;
-    bool *covered_tids = NULL;
      bool opt_full_utf = true;
  
      FILE *file_out = stdout;
@@ -343,7 +327,7 @@ int main_coverage(int argc, char *argv[]) {
          {"incl-flags", required_argument, NULL, 1}, // require flag
          {"excl-flags", required_argument, NULL, 2}, // filter flag
          {"bam-list", required_argument, NULL, 'b'},
-        {"min-read-len", required_argument, NULL, 'L'},
+        {"min-read-len", required_argument, NULL, 'l'},
          {"min-MQ", required_argument, NULL, 'q'},
          {"min-mq", required_argument, NULL, 'q'},
          {"min-BQ", required_argument, NULL, 'Q'},
@@ -355,13 +339,14 @@ int main_coverage(int argc, char *argv[]) {
          {"n-bins", required_argument, NULL, 'w'},
          {"region", required_argument, NULL, 'r'},
          {"help", no_argument, NULL, 'h'},
+        {"depth", required_argument, NULL, 'd'},
          { NULL, 0, NULL, 0 }
      };
  
      // parse the command line
      int c;
      opterr = 0;
-    while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) {
+    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
          switch (c) {
              case 1:
                  if ((required_flags = bam_str2flag(optarg)) < 0) {
@@ -372,9 +357,10 @@ int main_coverage(int argc, char *argv[]) {
                      fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
                  }; break;
              case 'o': opt_output_file = optarg; opt_full_width = false; break;
-            case 'L': opt_min_len = atoi(optarg); break;
-            case 'q': opt_min_baseQ = atoi(optarg); break;
-            case 'Q': opt_min_mapQ = atoi(optarg); break;
+            case 'l': opt_min_len = atoi(optarg); break;
+            case 'q': opt_min_mapQ = atoi(optarg); break;
+            case 'Q': opt_min_baseQ = atoi(optarg); break;
+            case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
              case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
                        opt_print_histogram = true; opt_print_tabular = false;
                        break;
@@ -427,7 +413,7 @@ int main_coverage(int argc, char *argv[]) {
              if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
                  columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
              }
-#else
+#elif defined TIOCGWINSZ
              struct winsize w;
              if (ioctl(2, TIOCGWINSZ, &w) == 0)
                  columns = w.ws_col;
@@ -460,7 +446,7 @@ int main_coverage(int argc, char *argv[]) {
  
      data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
      if (!data) {
-        print_error("coverage", "Failed to allocate memory");
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
@@ -469,7 +455,7 @@ int main_coverage(int argc, char *argv[]) {
          int rf;
          data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
          if (!data[i]) {
-            print_error("coverage", "Failed to allocate memory");
+            print_error_errno("coverage", "Failed to allocate memory");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
@@ -485,12 +471,12 @@ int main_coverage(int argc, char *argv[]) {
  
          // Set CRAM options on file handle - returns 0 on success
          if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
+            print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
          if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
+            print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
@@ -516,7 +502,7 @@ int main_coverage(int argc, char *argv[]) {
              data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
              hts_idx_destroy(idx); // the index is not needed any more; free the memory
              if (data[i]->iter == NULL) {
-                print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg);
+                print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
                  status = EXIT_FAILURE;
                  goto coverage_end;
              }
@@ -528,30 +514,30 @@ int main_coverage(int argc, char *argv[]) {
  
      h = data[0]->hdr; // easy access to the header of the 1st BAM
      int n_targets = sam_hdr_nref(h);
-    covered_tids = calloc(n_targets, sizeof(bool));
-    stats = calloc(1, sizeof(stats_aux_t));
-    if (!covered_tids || !stats) {
-        print_error("coverage", "Failed to allocate memory");
+    stats = calloc(n_targets, sizeof(stats_aux_t));
+    if (!stats) {
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
  
      int64_t n_bins = opt_n_bins;
      if (opt_reg) {
-        stats->tid = data[0]->iter->tid;
-        stats->beg = data[0]->iter->beg; // and to the parsed region coordinates
-        stats->end = data[0]->iter->end;
-        if (stats->end == HTS_POS_MAX) {
-            stats->end = sam_hdr_tid2len(h, stats->tid);
+        stats_aux_t *s = stats + data[0]->iter->tid;
+        s->beg = data[0]->iter->beg; // and to the parsed region coordinates
+        s->end = data[0]->iter->end;
+        if (s->end == HTS_POS_MAX) {
+            s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
          }
-        if (opt_n_bins > stats->end - stats->beg) {
-            n_bins = stats->end - stats->beg;
+        if (opt_n_bins > s->end - s->beg) {
+            n_bins = s->end - s->beg;
          }
-        stats->bin_width = (stats->end-stats->beg) / n_bins;
-    } else {
-        stats->tid = -1;
+        s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
      }
  
+    for (i=0; i<n_bam_files; i++)
+        data[i]->stats = stats;
+
      int64_t current_bin = 0;
  
      // the core multi-pileup loop
@@ -567,43 +553,41 @@ int main_coverage(int argc, char *argv[]) {
      n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
      plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
      if (!hist || !n_plp || !plp) {
-        print_error("coverage", "Failed to allocate memory");
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
      while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
  
-        if (tid != stats->tid) { // Next target sequence
-            if (stats->tid >= 0) { // It's not the first sequence, print results
-                set_read_counts(data, stats, n_bam_files);
+        if (tid != old_tid) { // Next target sequence
+            if (old_tid >= 0) {
                  if (opt_print_histogram) {
-                    print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
                      fputc('\n', file_out);
                  } else if (opt_print_tabular) {
-                    print_tabular_line(file_out, h, stats);
+                    print_tabular_line(file_out, h, stats, old_tid);
                  }
  
-                // reset data
-                memset(stats, 0, sizeof(stats_aux_t));
                  if (opt_print_histogram)
                      memset(hist, 0, n_bins*sizeof(uint32_t));
              }
  
-            stats->tid = tid;
-            covered_tids[tid] = true;
+            stats[tid].covered = true;
              if (!opt_reg)
-                stats->end = sam_hdr_tid2len(h, tid);
+                stats[tid].end = sam_hdr_tid2len(h, tid);
  
              if (opt_print_histogram) {
-                n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins;
-                stats->bin_width = (stats->end-stats->beg) / n_bins;
+                n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
+                stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
              }
+
+            old_tid = tid;
          }
-        if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip
+        if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
          if (tid >= n_targets) continue;     // diff number of @SQ lines per file?
  
          if (opt_print_histogram) {
-            current_bin = (pos - stats->beg) / stats->bin_width;
+            current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
          }
  
          bool count_base = false;
@@ -616,39 +600,40 @@ int main_coverage(int argc, char *argv[]) {
                  else if (p->qpos < p->b->core.l_qseq &&
                          bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
                  else
-                    stats->summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                    stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
              }
              if (depth_at_pos > 0) {
                  count_base = true;
-                stats->summed_coverage += depth_at_pos;
+                stats[tid].summed_coverage += depth_at_pos;
              }
              // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
              //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
          }
          if (count_base) {
-            ++(stats->n_covered_bases);
+            stats[tid].n_covered_bases++;
              if (opt_print_histogram && current_bin < n_bins)
                  ++(hist[current_bin]); // Histogram based on breadth of coverage
          }
      }
  
-    if (stats->tid != -1) {
-        set_read_counts(data, stats, n_bam_files);
+    if (tid == -1 && opt_reg && *opt_reg != '*')
+        // Region specified but no data covering it.
+        tid = data[0]->iter->tid;
+
+    if (tid < n_targets && tid >=0) {
          if (opt_print_histogram) {
-            print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
          } else if (opt_print_tabular) {
-            print_tabular_line(file_out, h, stats);
+            print_tabular_line(file_out, h, stats, tid);
          }
      }
  
  
      if (!opt_reg && opt_print_tabular) {
-        memset(stats, 0, sizeof(stats_aux_t));
          for (i = 0; i < n_targets; ++i) {
-            if (!covered_tids[i]) {
-                stats->tid = i;
-                stats->end = sam_hdr_tid2len(h, i);
-                print_tabular_line(file_out, h, stats);
+            if (!stats[i].covered) {
+                stats[i].end = sam_hdr_tid2len(h, i);
+                print_tabular_line(file_out, h, stats, i);
              }
          }
      }
@@ -658,13 +643,11 @@ int main_coverage(int argc, char *argv[]) {
  coverage_end:
      if (n_plp) free(n_plp);
      if (plp) free(plp);
-    bam_mplp_destroy(mplp);
+    if (mplp) bam_mplp_destroy(mplp);
  
-    if (covered_tids) free(covered_tids);
      if (hist) free(hist);
      if (stats) free(stats);
  
-
      // Close files and free data structures
      if (!(file_out == stdout || fclose(file_out) == 0)) {
          if (status == EXIT_SUCCESS) {
diff --git a/samtools/coverage.c.pysam.c b/samtools/coverage.c.pysam.c

index 127a52896d02b6bfa5dda4258aee5aec1ce3ca1a..662deb55ddcd0ffb29480da537061ee9b90b9c22 100644 (file)
--- a/samtools/coverage.c.pysam.c
+++ b/samtools/coverage.c.pysam.c
@@ -3,7 +3,7 @@
  /* coverage.c -- samtools coverage subcommand
  
      Copyright (C) 2018,2019 Florian Breitwieser
-    Portions copyright (C) 2019 Genome Research Ltd.
+    Portions copyright (C) 2019-2021 Genome Research Ltd.
  
      Author: Florian P Breitwieser <florian.bw@gmail.com>
  
@@ -26,7 +26,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.  */
  
  /* This program calculates coverage from multiple BAMs
- * simutaneously, to achieve random access and to use the BED interface.
+ * simultaneously, to achieve random access and to use the BED interface.
   * To compile this program separately, you may:
   *
   *   gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz
@@ -59,19 +59,6 @@ DEALINGS IN THE SOFTWARE.  */
  
  const char *VERSION = "0.1";
  
-typedef struct {  // auxiliary data structure to hold a BAM file
-    samFile *fp;     // file handle
-    sam_hdr_t *hdr;  // file header
-    hts_itr_t *iter; // iterator to a region - NULL for us by default
-    int min_mapQ;    // mapQ filter
-    int min_len;     // length filter
-    unsigned int n_reads;  // records the number of reads seen in file
-    unsigned int n_selected_reads; // records the number of reads passing filter
-    unsigned long summed_mapQ; // summed mapQ of all reads passing filter
-    int fail_flags;
-    int required_flags;
-} bam_aux_t;
-
  typedef struct {  // auxiliary data structure to hold stats on coverage
      unsigned long long n_covered_bases;
      unsigned long long summed_coverage;
@@ -79,12 +66,23 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
      unsigned long long summed_mapQ;
      unsigned int n_reads;
      unsigned int n_selected_reads;
-    int32_t tid;    // chromosome ID, defined by header
+    bool covered;
      hts_pos_t beg;
      hts_pos_t end;
      int64_t bin_width;
  } stats_aux_t;
  
+typedef struct {  // auxiliary data structure to hold a BAM file
+    samFile *fp;     // file handle
+    sam_hdr_t *hdr;  // file header
+    hts_itr_t *iter; // iterator to a region - NULL for us by default
+    int min_mapQ;    // mapQ filter
+    int min_len;     // length filter
+    int fail_flags;
+    int required_flags;
+    stats_aux_t *stats;
+} bam_aux_t;
+
  #if __STDC_VERSION__ >= 199901L
  #define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL
  
@@ -93,7 +91,7 @@ typedef struct {  // auxiliary data structure to hold stats on coverage
  // LOWER ONE EIGHTH BLOCK … FULL BLOCK
  static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"};
  // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those
-static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
  
  #else
  
@@ -104,7 +102,7 @@ static const char *const BLOCK_CHARS8[8] = {
      "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84",
      "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" };
  
-static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"};
+static const char *const BLOCK_CHARS2[2] = {".", ":"};
  
  #endif
  
@@ -116,11 +114,14 @@ static int usage() {
              "Input options:\n"
              "  -b, --bam-list FILE     list of input BAM filenames, one per line\n"
              "  -l, --min-read-len INT  ignore reads shorter than INT bp [0]\n"
-            "  -q, --min-MQ INT        base quality threshold [0]\n"
-            "  -Q, --min-BQ INT        mapping quality threshold [0]\n"
+            "  -q, --min-MQ INT        mapping quality threshold [0]\n"
+            "  -Q, --min-BQ INT        base quality threshold [0]\n"
              "  --rf <int|str>          required flags: skip reads with mask bits unset []\n"
              "  --ff <int|str>          filter flags: skip reads with mask bits set \n"
              "                                      [UNMAP,SECONDARY,QCFAIL,DUP]\n"
+            "  -d, --depth INT         maximum allowed coverage depth [1000000].\n"
+            "                          If 0, depth is set to the maximum integer value,\n"
+            "                          effectively removing any depth limit.\n"
              "Output options:\n"
              "  -m, --histogram         show histogram instead of tabular output\n"
              "  -A, --ascii             show only ASCII characters in histogram\n"
@@ -173,79 +174,63 @@ static char* readable_bps(double base_pairs, char *buf) {
      return buf;
  }
  
-static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) {
-    int i;
-    stats->n_reads = 0;
-    stats->n_selected_reads = 0;
-    stats->summed_mapQ = 0;
-    for (i = 0; i < n_bam_files && data[i]; ++i) {
-        stats->n_reads += data[i]->n_reads;
-        stats->n_selected_reads += data[i]->n_selected_reads;
-        stats->summed_mapQ += data[i]->summed_mapQ;
-        data[i]->n_reads = 0;
-        data[i]->n_selected_reads = 0;
-        data[i]->summed_mapQ = 0;
-    }
-}
-
  // read one alignment from one BAM file
  static int read_bam(void *data, bam1_t *b) {
      bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure
+    int nref = sam_hdr_nref(aux->hdr);
      int ret;
      while (1) {
          if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break;
-        ++aux->n_reads;
+        if (b->core.tid >= 0 && b->core.tid < nref)
+            aux->stats[b->core.tid].n_reads++;
  
          if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue;
          if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue;
          if ( b->core.qual < aux->min_mapQ ) continue;
          if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue;
-        ++aux->n_selected_reads;
-        aux->summed_mapQ += b->core.qual;
+        if (b->core.tid >= 0 && b->core.tid < nref) {
+            aux->stats[b->core.tid].n_selected_reads++;
+            aux->stats[b->core.tid].summed_mapQ += b->core.qual;
+        }
          break;
      }
      return ret;
  }
  
-void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) {
-    fputs(sam_hdr_tid2name(h, stats->tid), file_out);
-    double region_len = (double) stats->end - stats->beg;
+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid) {
+    fputs(sam_hdr_tid2name(h, tid), file_out);
+    double region_len = (double) stats[tid].end - stats[tid].beg;
      fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n",
-            stats->beg+1,
-            stats->end,
-            stats->n_selected_reads,
-            stats->n_covered_bases,
-            100.0 * stats->n_covered_bases / region_len,
-            stats->summed_coverage / region_len,
-            stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0,
-            stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0
+            stats[tid].beg+1,
+            stats[tid].end,
+            stats[tid].n_selected_reads,
+            stats[tid].n_covered_bases,
+            100.0 * stats[tid].n_covered_bases / region_len,
+            stats[tid].summed_coverage / region_len,
+            stats[tid].summed_coverage > 0? stats[tid].summed_baseQ/(double) stats[tid].summed_coverage : 0,
+            stats[tid].n_selected_reads > 0? stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads : 0
             );
  }
  
-void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist,
+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, int tid, const uint32_t *hist,
          const int hist_size, const bool full_utf) {
      int i, col;
      bool show_percentiles = false;
      const int n_rows = 10;
      const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2;
      const int blockchar_len = full_utf? 8 : 2;
-    /*
-       if (stats->beg == 0) {
-       stats->end = h->target_len[stats->tid];
-       }
-       */
-    double region_len = stats->end - stats->beg;
+    double region_len = stats[tid].end - stats[tid].beg;
  
      // Calculate histogram that contains percent covered
      double hist_data[hist_size];
      double max_val = 0.0;
      for (i = 0; i < hist_size; ++i) {
-        hist_data[i] = 100 * hist[i] / (double) stats->bin_width;
+        hist_data[i] = 100 * hist[i] / (double) stats[tid].bin_width;
          if (hist_data[i] > max_val) max_val = hist_data[i];
      }
  
      char buf[30];
-    fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf));
+    fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, tid), readable_bps(sam_hdr_tid2len(h, tid), buf));
  
      double row_bin_size = max_val / (double) n_rows;
      for (i = n_rows-1; i >= 0; --i) {
@@ -255,7 +240,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
          } else {
              fprintf(file_out, ">%7.2f%% ", current_bin);
          }
-        fprintf(file_out, VERTICAL_LINE);
+        fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
          for (col = 0; col < hist_size; ++col) {
              // get the difference in eights, or halfs when full UTF8 is not supported
              int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1;
@@ -268,22 +253,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
                  fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]);
              }
          }
-        fprintf(file_out, VERTICAL_LINE);
+        fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
          fputc(' ', file_out);
          switch (i) {
-            case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break;
-            case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, "    (%i filtered)", stats->n_reads - stats->n_selected_reads); break;
-            case 7: fprintf(file_out, "Covered bases:   %sbp", readable_bps(stats->n_covered_bases, buf)); break;
+            case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+            case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, "    (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
+            case 7: fprintf(file_out, "Covered bases:   %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
              case 6: fprintf(file_out, "Percent covered: %.4g%%",
-                            100.0 * stats->n_covered_bases / region_len); break;
+                            100.0 * stats[tid].n_covered_bases / region_len); break;
              case 5: fprintf(file_out, "Mean coverage:   %.3gx",
-                            stats->summed_coverage / region_len); break;
+                            stats[tid].summed_coverage / region_len); break;
              case 4: fprintf(file_out, "Mean baseQ:      %.3g",
-                            stats->summed_baseQ/(double) stats->summed_coverage); break;
+                            stats[tid].summed_baseQ/(double) stats[tid].summed_coverage); break;
              case 3: fprintf(file_out, "Mean mapQ:       %.3g",
-                            stats->summed_mapQ/(double) stats->n_selected_reads); break;
+                            stats[tid].summed_mapQ/(double) stats[tid].n_selected_reads); break;
              case 1: fprintf(file_out, "Histo bin width: %sbp",
-                            readable_bps(stats->bin_width, buf)); break;
+                            readable_bps(stats[tid].bin_width, buf)); break;
              case 0: fprintf(file_out, "Histo max bin:   %.5g%%", max_val); break;
          };
          fputc('\n', file_out);
@@ -292,22 +277,22 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, co
      // print x axis. Could be made pretty for widths that are not divisible
      // by 10 by variable spacing of the labels, instead of placing a label every 10 characters
      char buf2[50];
-    fprintf(file_out, "     %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10));
+    fprintf(file_out, "     %s", center_text(readable_bps(stats[tid].beg + 1, buf), buf2, 10));
      int rest;
      for (rest = 10; rest < 10*(hist_size/10); rest += 10) {
-        fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10));
+        fprintf(file_out, "%s", center_text(readable_bps(stats[tid].beg + stats[tid].bin_width*rest, buf), buf2, 10));
      }
      int last_padding = hist_size%10;
-    fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10));
+    fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats[tid].end, buf), buf2, 10));
      fprintf(file_out, "\n");
  }
  
  int main_coverage(int argc, char *argv[]) {
      int status = EXIT_SUCCESS;
  
-    int ret, tid, pos, i, j;
+    int ret, tid = -1, old_tid = -1, pos, i, j;
  
-    int max_depth = 0;
+    int max_depth = 1000000;
      int opt_min_baseQ = 0;
      int opt_min_mapQ = 0;
      int opt_min_len = 0;
@@ -332,7 +317,6 @@ int main_coverage(int argc, char *argv[]) {
      bool opt_print_header = true;
      bool opt_print_tabular = true;
      bool opt_print_histogram = false;
-    bool *covered_tids = NULL;
      bool opt_full_utf = true;
  
      FILE *file_out = samtools_stdout;
@@ -345,7 +329,7 @@ int main_coverage(int argc, char *argv[]) {
          {"incl-flags", required_argument, NULL, 1}, // require flag
          {"excl-flags", required_argument, NULL, 2}, // filter flag
          {"bam-list", required_argument, NULL, 'b'},
-        {"min-read-len", required_argument, NULL, 'L'},
+        {"min-read-len", required_argument, NULL, 'l'},
          {"min-MQ", required_argument, NULL, 'q'},
          {"min-mq", required_argument, NULL, 'q'},
          {"min-BQ", required_argument, NULL, 'Q'},
@@ -357,13 +341,14 @@ int main_coverage(int argc, char *argv[]) {
          {"n-bins", required_argument, NULL, 'w'},
          {"region", required_argument, NULL, 'r'},
          {"help", no_argument, NULL, 'h'},
+        {"depth", required_argument, NULL, 'd'},
          { NULL, 0, NULL, 0 }
      };
  
      // parse the command line
      int c;
      opterr = 0;
-    while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) {
+    while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) {
          switch (c) {
              case 1:
                  if ((required_flags = bam_str2flag(optarg)) < 0) {
@@ -374,9 +359,10 @@ int main_coverage(int argc, char *argv[]) {
                      fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE;
                  }; break;
              case 'o': opt_output_file = optarg; opt_full_width = false; break;
-            case 'L': opt_min_len = atoi(optarg); break;
-            case 'q': opt_min_baseQ = atoi(optarg); break;
-            case 'Q': opt_min_mapQ = atoi(optarg); break;
+            case 'l': opt_min_len = atoi(optarg); break;
+            case 'q': opt_min_mapQ = atoi(optarg); break;
+            case 'Q': opt_min_baseQ = atoi(optarg); break;
+            case 'd': max_depth = atoi(optarg); break; // maximum coverage depth
              case 'w': opt_n_bins = atoi(optarg); opt_full_width = false;
                        opt_print_histogram = true; opt_print_tabular = false;
                        break;
@@ -429,7 +415,7 @@ int main_coverage(int argc, char *argv[]) {
              if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) {
                  columns = csbi.srWindow.Right - csbi.srWindow.Left + 1;
              }
-#else
+#elif defined TIOCGWINSZ
              struct winsize w;
              if (ioctl(2, TIOCGWINSZ, &w) == 0)
                  columns = w.ws_col;
@@ -462,7 +448,7 @@ int main_coverage(int argc, char *argv[]) {
  
      data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file
      if (!data) {
-        print_error("coverage", "Failed to allocate memory");
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
@@ -471,7 +457,7 @@ int main_coverage(int argc, char *argv[]) {
          int rf;
          data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t));
          if (!data[i]) {
-            print_error("coverage", "Failed to allocate memory");
+            print_error_errno("coverage", "Failed to allocate memory");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
@@ -487,12 +473,12 @@ int main_coverage(int argc, char *argv[]) {
  
          // Set CRAM options on file handle - returns 0 on success
          if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
-            print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
+            print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
          if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
-            print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
+            print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value");
              status = EXIT_FAILURE;
              goto coverage_end;
          }
@@ -518,7 +504,7 @@ int main_coverage(int argc, char *argv[]) {
              data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator
              hts_idx_destroy(idx); // the index is not needed any more; free the memory
              if (data[i]->iter == NULL) {
-                print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg);
+                print_error("coverage", "Failed to parse region \"%s\". Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]);
                  status = EXIT_FAILURE;
                  goto coverage_end;
              }
@@ -530,30 +516,30 @@ int main_coverage(int argc, char *argv[]) {
  
      h = data[0]->hdr; // easy access to the header of the 1st BAM
      int n_targets = sam_hdr_nref(h);
-    covered_tids = calloc(n_targets, sizeof(bool));
-    stats = calloc(1, sizeof(stats_aux_t));
-    if (!covered_tids || !stats) {
-        print_error("coverage", "Failed to allocate memory");
+    stats = calloc(n_targets, sizeof(stats_aux_t));
+    if (!stats) {
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
  
      int64_t n_bins = opt_n_bins;
      if (opt_reg) {
-        stats->tid = data[0]->iter->tid;
-        stats->beg = data[0]->iter->beg; // and to the parsed region coordinates
-        stats->end = data[0]->iter->end;
-        if (stats->end == HTS_POS_MAX) {
-            stats->end = sam_hdr_tid2len(h, stats->tid);
+        stats_aux_t *s = stats + data[0]->iter->tid;
+        s->beg = data[0]->iter->beg; // and to the parsed region coordinates
+        s->end = data[0]->iter->end;
+        if (s->end == HTS_POS_MAX) {
+            s->end = sam_hdr_tid2len(h, data[0]->iter->tid);
          }
-        if (opt_n_bins > stats->end - stats->beg) {
-            n_bins = stats->end - stats->beg;
+        if (opt_n_bins > s->end - s->beg) {
+            n_bins = s->end - s->beg;
          }
-        stats->bin_width = (stats->end-stats->beg) / n_bins;
-    } else {
-        stats->tid = -1;
+        s->bin_width = (s->end-s->beg) / (n_bins > 0 ? n_bins : 1);
      }
  
+    for (i=0; i<n_bam_files; i++)
+        data[i]->stats = stats;
+
      int64_t current_bin = 0;
  
      // the core multi-pileup loop
@@ -569,43 +555,41 @@ int main_coverage(int argc, char *argv[]) {
      n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM
      plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
      if (!hist || !n_plp || !plp) {
-        print_error("coverage", "Failed to allocate memory");
+        print_error_errno("coverage", "Failed to allocate memory");
          status = EXIT_FAILURE;
          goto coverage_end;
      }
      while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
  
-        if (tid != stats->tid) { // Next target sequence
-            if (stats->tid >= 0) { // It's not the first sequence, print results
-                set_read_counts(data, stats, n_bam_files);
+        if (tid != old_tid) { // Next target sequence
+            if (old_tid >= 0) {
                  if (opt_print_histogram) {
-                    print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+                    print_hist(file_out, h, stats, old_tid, hist, n_bins, opt_full_utf);
                      fputc('\n', file_out);
                  } else if (opt_print_tabular) {
-                    print_tabular_line(file_out, h, stats);
+                    print_tabular_line(file_out, h, stats, old_tid);
                  }
  
-                // reset data
-                memset(stats, 0, sizeof(stats_aux_t));
                  if (opt_print_histogram)
                      memset(hist, 0, n_bins*sizeof(uint32_t));
              }
  
-            stats->tid = tid;
-            covered_tids[tid] = true;
+            stats[tid].covered = true;
              if (!opt_reg)
-                stats->end = sam_hdr_tid2len(h, tid);
+                stats[tid].end = sam_hdr_tid2len(h, tid);
  
              if (opt_print_histogram) {
-                n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins;
-                stats->bin_width = (stats->end-stats->beg) / n_bins;
+                n_bins = opt_n_bins > stats[tid].end-stats[tid].beg? stats[tid].end-stats[tid].beg : opt_n_bins;
+                stats[tid].bin_width = (stats[tid].end-stats[tid].beg) / n_bins;
              }
+
+            old_tid = tid;
          }
-        if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip
+        if (pos < stats[tid].beg || pos >= stats[tid].end) continue; // out of range; skip
          if (tid >= n_targets) continue;     // diff number of @SQ lines per file?
  
          if (opt_print_histogram) {
-            current_bin = (pos - stats->beg) / stats->bin_width;
+            current_bin = (pos - stats[tid].beg) / stats[tid].bin_width;
          }
  
          bool count_base = false;
@@ -618,39 +602,40 @@ int main_coverage(int argc, char *argv[]) {
                  else if (p->qpos < p->b->core.l_qseq &&
                          bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality
                  else
-                    stats->summed_baseQ += bam_get_qual(p->b)[p->qpos];
+                    stats[tid].summed_baseQ += bam_get_qual(p->b)[p->qpos];
              }
              if (depth_at_pos > 0) {
                  count_base = true;
-                stats->summed_coverage += depth_at_pos;
+                stats[tid].summed_coverage += depth_at_pos;
              }
              // hist[current_bin] += depth_at_pos;  // Add counts to the histogram here to have one based on coverage
              //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output
          }
          if (count_base) {
-            ++(stats->n_covered_bases);
+            stats[tid].n_covered_bases++;
              if (opt_print_histogram && current_bin < n_bins)
                  ++(hist[current_bin]); // Histogram based on breadth of coverage
          }
      }
  
-    if (stats->tid != -1) {
-        set_read_counts(data, stats, n_bam_files);
+    if (tid == -1 && opt_reg && *opt_reg != '*')
+        // Region specified but no data covering it.
+        tid = data[0]->iter->tid;
+
+    if (tid < n_targets && tid >=0) {
          if (opt_print_histogram) {
-            print_hist(file_out, h, stats, hist, n_bins, opt_full_utf);
+            print_hist(file_out, h, stats, tid, hist, n_bins, opt_full_utf);
          } else if (opt_print_tabular) {
-            print_tabular_line(file_out, h, stats);
+            print_tabular_line(file_out, h, stats, tid);
          }
      }
  
  
      if (!opt_reg && opt_print_tabular) {
-        memset(stats, 0, sizeof(stats_aux_t));
          for (i = 0; i < n_targets; ++i) {
-            if (!covered_tids[i]) {
-                stats->tid = i;
-                stats->end = sam_hdr_tid2len(h, i);
-                print_tabular_line(file_out, h, stats);
+            if (!stats[i].covered) {
+                stats[i].end = sam_hdr_tid2len(h, i);
+                print_tabular_line(file_out, h, stats, i);
              }
          }
      }
@@ -660,13 +645,11 @@ int main_coverage(int argc, char *argv[]) {
  coverage_end:
      if (n_plp) free(n_plp);
      if (plp) free(plp);
-    bam_mplp_destroy(mplp);
+    if (mplp) bam_mplp_destroy(mplp);
  
-    if (covered_tids) free(covered_tids);
      if (hist) free(hist);
      if (stats) free(stats);
  
-
      // Close files and free data structures
      if (!(file_out == samtools_stdout || fclose(file_out) == 0)) {
          if (status == EXIT_SUCCESS) {
diff --git a/samtools/cut_target.c b/samtools/cut_target.c

index e59f51b866d14dede7f65e61f1e8df9ae9875133..7c8387ca6848f6be8adf8ae6a794f0b3b6073d38 100644 (file)
--- a/samtools/cut_target.c
+++ b/samtools/cut_target.c
@@ -63,7 +63,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
      if (n > g->max_bases) { // enlarge g->bases
          g->max_bases = n;
          kroundup32(g->max_bases);
-        g->bases = realloc(g->bases, g->max_bases * 2);
+        g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
      }
      for (i = k = 0; i < n; ++i) {
          const bam_pileup1_t *p = plp + i;
@@ -170,7 +170,7 @@ static int read_aln(void *data, bam1_t *b)
  
  int main_cut_target(int argc, char *argv[])
  {
-    int c, tid, pos, n, lasttid = -1, usage = 0;
+    int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
      hts_pos_t l, max_l;
      const bam_pileup1_t *p;
      bam_plp_t plp;
@@ -237,6 +237,12 @@ int main_cut_target(int argc, char *argv[])
          cns[pos] = gencns(&g, n, p);
      }
      process_cns(g.h, lasttid, l, cns);
+
+    if (n < 0) {
+        print_error("targetcut", "error reading from \"%s\"", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      free(cns);
      sam_hdr_destroy(g.h);
      bam_plp_destroy(plp);
@@ -247,5 +253,5 @@ int main_cut_target(int argc, char *argv[])
      errmod_destroy(g.em);
      free(g.bases);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c

index bbc2d29b082f7a16d637c1619a5a6139031212ae..babe42b751e46887fa31729ee892d34e3cc2fccf 100644 (file)
--- a/samtools/cut_target.c.pysam.c
+++ b/samtools/cut_target.c.pysam.c
@@ -65,7 +65,7 @@ static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
      if (n > g->max_bases) { // enlarge g->bases
          g->max_bases = n;
          kroundup32(g->max_bases);
-        g->bases = realloc(g->bases, g->max_bases * 2);
+        g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
      }
      for (i = k = 0; i < n; ++i) {
          const bam_pileup1_t *p = plp + i;
@@ -172,7 +172,7 @@ static int read_aln(void *data, bam1_t *b)
  
  int main_cut_target(int argc, char *argv[])
  {
-    int c, tid, pos, n, lasttid = -1, usage = 0;
+    int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
      hts_pos_t l, max_l;
      const bam_pileup1_t *p;
      bam_plp_t plp;
@@ -239,6 +239,12 @@ int main_cut_target(int argc, char *argv[])
          cns[pos] = gencns(&g, n, p);
      }
      process_cns(g.h, lasttid, l, cns);
+
+    if (n < 0) {
+        print_error("targetcut", "error reading from \"%s\"", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      free(cns);
      sam_hdr_destroy(g.h);
      bam_plp_destroy(plp);
@@ -249,5 +255,5 @@ int main_cut_target(int argc, char *argv[])
      errmod_destroy(g.em);
      free(g.bases);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/dict.c b/samtools/dict.c

index c159c24e7502e384f382c9c28cd3d53b2ddb4f45..029d54876ceec27bb74911257acfa2467281da50 100644 (file)
--- a/samtools/dict.c
+++ b/samtools/dict.c
@@ -1,6 +1,6 @@
  /*  dict.c -- create a sequence dictionary file.
  
-    Copyright (C) 2015 Genome Research Ltd.
+    Copyright (C) 2015, 2020 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <config.h>
  
  #include <stdio.h>
+#include <string.h>
  #include <unistd.h>
  #include <zlib.h>
  #include <getopt.h>
@@ -37,7 +38,7 @@ typedef struct _args_t
  {
      char *output_fname, *fname;
      char *assembly, *species, *uri;
-    int  header;
+    int  alias, header;
  }
  args_t;
  
@@ -79,6 +80,20 @@ static void write_dict(const char *fn, args_t *args)
          hts_md5_final(digest, md5);
          hts_md5_hex(hex, digest);
          fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+        if (args->alias) {
+            const char *name = seq->name.s;
+            if (strncmp(name, "chr", 3) == 0) {
+                name += 3;
+                fprintf(out, "\tAN:%s", name);
+            }
+            else
+                fprintf(out, "\tAN:chr%s", name);
+
+            if (strcmp(name, "M") == 0)
+                fprintf(out, ",chrMT,MT");
+            else if (strcmp(name, "MT") == 0)
+                fprintf(out, ",chrM,M");
+        }
          if (args->uri)
              fprintf(out, "\tUR:%s", args->uri);
          else if (strcmp(fn, "-") != 0) {
@@ -107,8 +122,10 @@ static int dict_usage(void)
      fprintf(stderr, "About:   Create a sequence dictionary file from a fasta file\n");
      fprintf(stderr, "Usage:   samtools dict [options] <file.fa|file.fa.gz>\n\n");
      fprintf(stderr, "Options: -a, --assembly STR    assembly\n");
+    fprintf(stderr, "         -A, --alias, --alternative-name\n");
+    fprintf(stderr, "                               add AN tag by adding/removing 'chr'\n");
      fprintf(stderr, "         -H, --no-header       do not print @HD line\n");
-    fprintf(stderr, "         -o, --output STR      file to write out dict file [stdout]\n");
+    fprintf(stderr, "         -o, --output FILE     file to write out dict file [stdout]\n");
      fprintf(stderr, "         -s, --species STR     species\n");
      fprintf(stderr, "         -u, --uri STR         URI [file:///abs/path/to/file.fa]\n");
      fprintf(stderr, "\n");
@@ -124,6 +141,8 @@ int dict_main(int argc, char *argv[])
      {
          {"help", no_argument, NULL, 'h'},
          {"no-header", no_argument, NULL, 'H'},
+        {"alias", no_argument, NULL, 'A'},
+        {"alternative-name", no_argument, NULL, 'A'},
          {"assembly", required_argument, NULL, 'a'},
          {"species", required_argument, NULL, 's'},
          {"uri", required_argument, NULL, 'u'},
@@ -131,10 +150,11 @@ int dict_main(int argc, char *argv[])
          {NULL, 0, NULL, 0}
      };
      int c;
-    while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+    while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
      {
          switch (c)
          {
+            case 'A': args->alias = 1; break;
              case 'a': args->assembly = optarg; break;
              case 's': args->species = optarg; break;
              case 'u': args->uri = optarg; break;
diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c

index 87ec1ac142f57813c239ff94769db907d9506057..ca54c48cc6b6f0843893f5376e88c115528f2a97 100644 (file)
--- a/samtools/dict.c.pysam.c
+++ b/samtools/dict.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  dict.c -- create a sequence dictionary file.
  
-    Copyright (C) 2015 Genome Research Ltd.
+    Copyright (C) 2015, 2020 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE.  */
  #include <config.h>
  
  #include <stdio.h>
+#include <string.h>
  #include <unistd.h>
  #include <zlib.h>
  #include <getopt.h>
@@ -39,7 +40,7 @@ typedef struct _args_t
  {
      char *output_fname, *fname;
      char *assembly, *species, *uri;
-    int  header;
+    int  alias, header;
  }
  args_t;
  
@@ -55,19 +56,19 @@ static void write_dict(const char *fn, args_t *args)
      fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
      if (fp == 0) {
          fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn);
-        exit(1);
+        samtools_exit(1);
      }
      FILE *out = samtools_stdout;
      if (args->output_fname) {
          out = fopen(args->output_fname, "w");
          if (out == NULL) {
            fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
-          exit(1);
+          samtools_exit(1);
          }
      }
  
      if (!(md5 = hts_md5_init()))
-        exit(1);
+        samtools_exit(1);
  
      seq = kseq_init(fp);
      if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n");
@@ -81,6 +82,20 @@ static void write_dict(const char *fn, args_t *args)
          hts_md5_final(digest, md5);
          hts_md5_hex(hex, digest);
          fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+        if (args->alias) {
+            const char *name = seq->name.s;
+            if (strncmp(name, "chr", 3) == 0) {
+                name += 3;
+                fprintf(out, "\tAN:%s", name);
+            }
+            else
+                fprintf(out, "\tAN:chr%s", name);
+
+            if (strcmp(name, "M") == 0)
+                fprintf(out, ",chrMT,MT");
+            else if (strcmp(name, "MT") == 0)
+                fprintf(out, ",chrM,M");
+        }
          if (args->uri)
              fprintf(out, "\tUR:%s", args->uri);
          else if (strcmp(fn, "-") != 0) {
@@ -109,8 +124,10 @@ static int dict_usage(void)
      fprintf(samtools_stderr, "About:   Create a sequence dictionary file from a fasta file\n");
      fprintf(samtools_stderr, "Usage:   samtools dict [options] <file.fa|file.fa.gz>\n\n");
      fprintf(samtools_stderr, "Options: -a, --assembly STR    assembly\n");
+    fprintf(samtools_stderr, "         -A, --alias, --alternative-name\n");
+    fprintf(samtools_stderr, "                               add AN tag by adding/removing 'chr'\n");
      fprintf(samtools_stderr, "         -H, --no-header       do not print @HD line\n");
-    fprintf(samtools_stderr, "         -o, --output STR      file to write out dict file [samtools_stdout]\n");
+    fprintf(samtools_stderr, "         -o, --output FILE     file to write out dict file [samtools_stdout]\n");
      fprintf(samtools_stderr, "         -s, --species STR     species\n");
      fprintf(samtools_stderr, "         -u, --uri STR         URI [file:///abs/path/to/file.fa]\n");
      fprintf(samtools_stderr, "\n");
@@ -126,6 +143,8 @@ int dict_main(int argc, char *argv[])
      {
          {"help", no_argument, NULL, 'h'},
          {"no-header", no_argument, NULL, 'H'},
+        {"alias", no_argument, NULL, 'A'},
+        {"alternative-name", no_argument, NULL, 'A'},
          {"assembly", required_argument, NULL, 'a'},
          {"species", required_argument, NULL, 's'},
          {"uri", required_argument, NULL, 'u'},
@@ -133,10 +152,11 @@ int dict_main(int argc, char *argv[])
          {NULL, 0, NULL, 0}
      };
      int c;
-    while ( (c=getopt_long(argc,argv,"?hHa:s:u:o:",loptions,NULL))>0 )
+    while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
      {
          switch (c)
          {
+            case 'A': args->alias = 1; break;
              case 'a': args->assembly = optarg; break;
              case 's': args->species = optarg; break;
              case 'u': args->uri = optarg; break;
diff --git a/samtools/faidx.c b/samtools/faidx.c

index 162233fdb7a3bd4c1026205334aa708ac6492fa1..03b5d65213978f12f99d723e3dfdd60c8a98b396 100644 (file)
--- a/samtools/faidx.c
+++ b/samtools/faidx.c
@@ -1,6 +1,6 @@
  /*  faidx.c -- faidx subcommand.
  
-    Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
      Portions copyright (C) 2011 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -198,14 +198,16 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con
  
  static int usage(FILE *fp, enum fai_format_options format, int exit_status)
  {
-    char *tool, *file_type;
+    char *tool, *file_type, *index_name;
  
      if (format == FAI_FASTA) {
          tool = "faidx <file.fa|file.fa.gz>";
          file_type = "FASTA";
+        index_name = "file.fa";
      } else {
          tool = "fqidx <file.fq|file.fq.gz>";
          file_type = "FASTQ";
+        index_name = "file.fq";
      }
  
      fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
@@ -219,8 +221,10 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status)
                  "                          TYPE = rc   for /rc on negative strand (default)\n"
                  "                                 no   for no strand indicator\n"
                  "                                 sign for (+) / (-)\n"
-                "                                 custom,<pos>,<neg> for custom indicator\n",
-                file_type, file_type);
+                "                                 custom,<pos>,<neg> for custom indicator\n"
+                "     --fai-idx      FILE  name of the index file (default %s.fai).\n"
+                "     --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
+                file_type, file_type, index_name, index_name);
  
  
      if (format == FAI_FASTA) {
@@ -241,6 +245,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
      char *pos_strand_name = ""; // Extension to add to name for +ve strand
      char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand
      char *strand_names = NULL; // Used for custom strand annotation
+    char *fai_name = NULL; // specified index name
+    char *gzi_name = NULL; // specified compressed index name
      FILE* file_out = stdout;/* output stream */
  
      static const struct option lopts[] = {
@@ -252,6 +258,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
          { "fastq", no_argument,              NULL, 'f' },
          { "reverse-complement", no_argument, NULL, 'i' },
          { "mark-strand", required_argument, NULL, 1000 },
+        { "fai-idx", required_argument,     NULL, 1001 },
+        { "gzi-idx", required_argument,     NULL, 1002 },
          { NULL, 0, NULL, 0 }
      };
  
@@ -300,6 +308,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                      return usage(stderr, format, EXIT_FAILURE);
                  }
                  break;
+            case 1001: fai_name = optarg; break;
+            case 1002: gzi_name = optarg; break;
              default:  break;
          }
      }
@@ -307,19 +317,40 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
      if ( argc==optind )
          return usage(stdout, format, EXIT_SUCCESS);
  
-    if ( optind+1 == argc && !region_file)
-    {
-        if (fai_build(argv[optind]) != 0) {
-            fprintf(stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]);
+    if (optind+1 == argc && !region_file) {
+        if (output_file && !fai_name)
+            fai_name = output_file;
+
+        if (fai_build3(argv[optind], fai_name, gzi_name) != 0) {
+            if (fai_name)
+                fprintf(stderr, "[faidx] Could not build fai index %s", fai_name);
+            else
+                fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+            if (gzi_name)
+                fprintf(stderr, " or compressed index %s\n", gzi_name);
+            else
+                fprintf(stderr, "\n");
+
              return EXIT_FAILURE;
          }
+
          return 0;
      }
  
-    faidx_t *fai = fai_load_format(argv[optind], format);
+    faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+
+    if (!fai) {
+        if (fai_name)
+            fprintf(stderr, "[faidx] Could not load fai index %s", fai_name);
+        else
+            fprintf(stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+        if (gzi_name)
+            fprintf(stderr, " or compressed index %s\n", gzi_name);
+        else
+            fprintf(stderr, "\n");
  
-    if ( !fai ) {
-        fprintf(stderr, "[faidx] Could not load fai index of %s\n", argv[optind]);
          return EXIT_FAILURE;
      }
  
diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c

index e73e63b8b89c07ac31364ba02720580bf33871ed..0bc515bd97cbc423c5cfa26b9135534cd2a665a9 100644 (file)
--- a/samtools/faidx.c.pysam.c
+++ b/samtools/faidx.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  faidx.c -- faidx subcommand.
  
-    Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
      Portions copyright (C) 2011 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -200,14 +200,16 @@ static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, con
  
  static int usage(FILE *fp, enum fai_format_options format, int exit_status)
  {
-    char *tool, *file_type;
+    char *tool, *file_type, *index_name;
  
      if (format == FAI_FASTA) {
          tool = "faidx <file.fa|file.fa.gz>";
          file_type = "FASTA";
+        index_name = "file.fa";
      } else {
          tool = "fqidx <file.fq|file.fq.gz>";
          file_type = "FASTQ";
+        index_name = "file.fq";
      }
  
      fprintf(fp, "Usage: samtools %s [<reg> [...]]\n", tool);
@@ -221,8 +223,10 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status)
                  "                          TYPE = rc   for /rc on negative strand (default)\n"
                  "                                 no   for no strand indicator\n"
                  "                                 sign for (+) / (-)\n"
-                "                                 custom,<pos>,<neg> for custom indicator\n",
-                file_type, file_type);
+                "                                 custom,<pos>,<neg> for custom indicator\n"
+                "     --fai-idx      FILE  name of the index file (default %s.fai).\n"
+                "     --gzi-idx      FILE  name of compressed file index (default %s.gz.gzi).\n",
+                file_type, file_type, index_name, index_name);
  
  
      if (format == FAI_FASTA) {
@@ -243,6 +247,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
      char *pos_strand_name = ""; // Extension to add to name for +ve strand
      char *neg_strand_name = "/rc"; // Extension to add to name for -ve strand
      char *strand_names = NULL; // Used for custom strand annotation
+    char *fai_name = NULL; // specified index name
+    char *gzi_name = NULL; // specified compressed index name
      FILE* file_out = samtools_stdout;/* output stream */
  
      static const struct option lopts[] = {
@@ -254,6 +260,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
          { "fastq", no_argument,              NULL, 'f' },
          { "reverse-complement", no_argument, NULL, 'i' },
          { "mark-strand", required_argument, NULL, 1000 },
+        { "fai-idx", required_argument,     NULL, 1001 },
+        { "gzi-idx", required_argument,     NULL, 1002 },
          { NULL, 0, NULL, 0 }
      };
  
@@ -302,6 +310,8 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
                      return usage(samtools_stderr, format, EXIT_FAILURE);
                  }
                  break;
+            case 1001: fai_name = optarg; break;
+            case 1002: gzi_name = optarg; break;
              default:  break;
          }
      }
@@ -309,19 +319,40 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format)
      if ( argc==optind )
          return usage(samtools_stdout, format, EXIT_SUCCESS);
  
-    if ( optind+1 == argc && !region_file)
-    {
-        if (fai_build(argv[optind]) != 0) {
-            fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai\n", argv[optind]);
+    if (optind+1 == argc && !region_file) {
+        if (output_file && !fai_name)
+            fai_name = output_file;
+
+        if (fai_build3(argv[optind], fai_name, gzi_name) != 0) {
+            if (fai_name)
+                fprintf(samtools_stderr, "[faidx] Could not build fai index %s", fai_name);
+            else
+                fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+            if (gzi_name)
+                fprintf(samtools_stderr, " or compressed index %s\n", gzi_name);
+            else
+                fprintf(samtools_stderr, "\n");
+
              return EXIT_FAILURE;
          }
+
          return 0;
      }
  
-    faidx_t *fai = fai_load_format(argv[optind], format);
+    faidx_t *fai = fai_load3_format(argv[optind], fai_name, gzi_name, FAI_CREATE, format);
+
+    if (!fai) {
+        if (fai_name)
+            fprintf(samtools_stderr, "[faidx] Could not load fai index %s", fai_name);
+        else
+            fprintf(samtools_stderr, "[faidx] Could not build fai index %s.fai", argv[optind]);
+
+        if (gzi_name)
+            fprintf(samtools_stderr, " or compressed index %s\n", gzi_name);
+        else
+            fprintf(samtools_stderr, "\n");
  
-    if ( !fai ) {
-        fprintf(samtools_stderr, "[faidx] Could not load fai index of %s\n", argv[optind]);
          return EXIT_FAILURE;
      }
  
diff --git a/samtools/htslib-1.10/LICENSE b/samtools/htslib-1.10/LICENSE

deleted file mode 100644 (file)

index f70e757..0000000
--- a/samtools/htslib-1.10/LICENSE
+++ /dev/null
@@ -1,69 +0,0 @@
-[Files in this distribution outwith the cram/ subdirectory are distributed
-according to the terms of the following MIT/Expat license.]
-
-The MIT/Expat License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
-
-[Files within the cram/ subdirectory in this distribution are distributed
-according to the terms of the following Modified 3-Clause BSD license.]
-
-The Modified-BSD License
-
-Copyright (C) 2012-2019 Genome Research Ltd.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice,
-   this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute
-   nor the names of its contributors may be used to endorse or promote products
-   derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-[The use of a range of years within a copyright notice in this distribution
-should be interpreted as being equivalent to a list of years including the
-first and last year specified and all consecutive years between them.
-
-For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009,
-2011-2012" should be interpreted as being identical to a notice that reads
-"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice
-that reads "Copyright (C) 2005-2012" should be interpreted as being identical
-to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010,
-2011, 2012".]
diff --git a/samtools/htslib-1.10/README b/samtools/htslib-1.10/README

deleted file mode 100644 (file)

index 4225bec..0000000
--- a/samtools/htslib-1.10/README
+++ /dev/null
@@ -1,5 +0,0 @@
-HTSlib is an implementation of a unified C library for accessing common file
-formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing
-data.  It is the core library used by samtools and bcftools.
-
-See INSTALL for building and installation instructions.
diff --git a/samtools/padding.c b/samtools/padding.c

index a769efec1d68fecc3c71164841fc6bf1d7965cb2..11b098ec1f10f7ee15373f1a41e8adca6b57ee20 100644 (file)
--- a/samtools/padding.c
+++ b/samtools/padding.c
@@ -1,7 +1,7 @@
  /*  padding.c -- depad subcommand.
  
      Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2014-2016, 2019 Genome Research Ltd.
+    Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -38,24 +38,38 @@ DEALINGS IN THE SOFTWARE.  */
  
  #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
  
-// The one and only function needed from sam.c.
-// Explicitly here to avoid including bam.h translation layer.
-extern char *samfaipath(const char *fn_ref);
-
-static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar)
  {
+    int diff = 0;
      if (n != b->core.n_cigar) {
          int o = b->core.l_qname + b->core.n_cigar * 4;
-        if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
-            b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
-            kroundup32(b->m_data);
-            b->data = (uint8_t*)realloc(b->data, b->m_data);
+        if (n > b->core.n_cigar) {
+            diff = (n - b->core.n_cigar) * 4;
+            if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) {
+                fprintf(stderr, "[depad] ERROR: BAM record too big\n");
+                return -1;
+            }
+            if (b->l_data + diff > b->m_data) {
+                b->m_data = b->l_data + diff;
+                kroundup32(b->m_data);
+                uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data);
+                if (!tmp) {
+                    fprintf(stderr, "[depad] ERROR: Memory allocation failure.\n");
+                    return -1;
+                }
+                b->data = tmp;
+            }
+        } else {
+            diff = -(int)((b->core.n_cigar - n) * 4);
          }
          memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
-        memcpy(b->data + b->core.l_qname, cigar, n * 4);
-        b->l_data += (n - b->core.n_cigar) * 4;
          b->core.n_cigar = n;
-    } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
+    }
+
+    memcpy(b->data + b->core.l_qname, cigar, n * 4);
+    b->l_data += diff;
+
+    return 0;
  }
  
  #define write_cigar(_c, _n, _m, _v) do { \
@@ -195,7 +209,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
      kstring_t r, q;
      int r_tid = -1;
      uint32_t *cigar2 = 0;
-    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+    int ret = 0, *posmap = 0;
+    uint32_t n2 = 0, m2 = 0;
  
      b = bam_init1();
      if (!b) {
@@ -242,7 +257,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
                  }
              }
              write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
-            replace_cigar(b, n2, cigar2);
+            if (replace_cigar(b, n2, cigar2) < 0)
+                return -1;
              posmap = update_posmap(posmap, r);
          } else if (b->core.n_cigar > 0) {
              int i, k, op;
@@ -328,7 +344,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
              for (i = k = 0; i < n2; ++i)
                  if (cigar2[i]) cigar2[k++] = cigar2[i];
              n2 = k;
-            replace_cigar(b, n2, cigar2);
+            if (replace_cigar(b, n2, cigar2) < 0)
+                return -1;
          }
          /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
          if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
@@ -430,7 +447,7 @@ int main_pad2unpad(int argc, char *argv[])
      sam_hdr_t *h = 0, *h_fix = 0;
      faidx_t *fai = 0;
      int c, compress_level = -1, is_long_help = 0, no_pg = 0;
-    char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL;
+    char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL;
      int ret=0;
      char *arg_list = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
@@ -477,8 +494,8 @@ int main_pad2unpad(int argc, char *argv[])
  
      // Load FASTA reference (also needed for SAM -> BAM if missing header)
      if (ga.reference) {
-        fn_list = samfaipath(ga.reference);
-        fai = fai_load(ga.reference);
+        fn_fai = fai_path(ga.reference);
+        fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE);
      }
      // open file handlers
      if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
@@ -486,8 +503,8 @@ int main_pad2unpad(int argc, char *argv[])
          ret = 1;
          goto depad_end;
      }
-    if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
-        fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
+    if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) {
+        fprintf(stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai);
          ret = 1;
          goto depad_end;
      }
@@ -570,7 +587,7 @@ depad_end:
          fprintf(stderr, "[depad] error on closing output file.\n");
          ret = 1;
      }
-    free(fn_list); free(fn_out);
+    free(fn_fai); free(fn_out);
      if (fn_out_idx)
          free(fn_out_idx);
      sam_global_args_free(&ga);
diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c

index ecc3691d5543a82e39431b823ad231b5283dce7e..e90255fb38fd7c92af4f7192da9023ee198ddc34 100644 (file)
--- a/samtools/padding.c.pysam.c
+++ b/samtools/padding.c.pysam.c
@@ -3,7 +3,7 @@
  /*  padding.c -- depad subcommand.
  
      Copyright (C) 2011, 2012 Broad Institute.
-    Copyright (C) 2014-2016, 2019 Genome Research Ltd.
+    Copyright (C) 2014-2016, 2019-2020 Genome Research Ltd.
      Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -40,24 +40,38 @@ DEALINGS IN THE SOFTWARE.  */
  
  #define bam_reg2bin(b,e) hts_reg2bin((b),(e), 14, 5)
  
-// The one and only function needed from sam.c.
-// Explicitly here to avoid including bam.h translation layer.
-extern char *samfaipath(const char *fn_ref);
-
-static void replace_cigar(bam1_t *b, int n, uint32_t *cigar)
+static int replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar)
  {
+    int diff = 0;
      if (n != b->core.n_cigar) {
          int o = b->core.l_qname + b->core.n_cigar * 4;
-        if (b->l_data + (n - b->core.n_cigar) * 4 > b->m_data) {
-            b->m_data = b->l_data + (n - b->core.n_cigar) * 4;
-            kroundup32(b->m_data);
-            b->data = (uint8_t*)realloc(b->data, b->m_data);
+        if (n > b->core.n_cigar) {
+            diff = (n - b->core.n_cigar) * 4;
+            if ((INT_MAX - b->l_data)/4 < (n - b->core.n_cigar)) {
+                fprintf(samtools_stderr, "[depad] ERROR: BAM record too big\n");
+                return -1;
+            }
+            if (b->l_data + diff > b->m_data) {
+                b->m_data = b->l_data + diff;
+                kroundup32(b->m_data);
+                uint8_t *tmp = (uint8_t*)realloc(b->data, b->m_data);
+                if (!tmp) {
+                    fprintf(samtools_stderr, "[depad] ERROR: Memory allocation failure.\n");
+                    return -1;
+                }
+                b->data = tmp;
+            }
+        } else {
+            diff = -(int)((b->core.n_cigar - n) * 4);
          }
          memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->l_data - o);
-        memcpy(b->data + b->core.l_qname, cigar, n * 4);
-        b->l_data += (n - b->core.n_cigar) * 4;
          b->core.n_cigar = n;
-    } else memcpy(b->data + b->core.l_qname, cigar, n * 4);
+    }
+
+    memcpy(b->data + b->core.l_qname, cigar, n * 4);
+    b->l_data += diff;
+
+    return 0;
  }
  
  #define write_cigar(_c, _n, _m, _v) do { \
@@ -197,7 +211,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
      kstring_t r, q;
      int r_tid = -1;
      uint32_t *cigar2 = 0;
-    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;
+    int ret = 0, *posmap = 0;
+    uint32_t n2 = 0, m2 = 0;
  
      b = bam_init1();
      if (!b) {
@@ -244,7 +259,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
                  }
              }
              write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
-            replace_cigar(b, n2, cigar2);
+            if (replace_cigar(b, n2, cigar2) < 0)
+                return -1;
              posmap = update_posmap(posmap, r);
          } else if (b->core.n_cigar > 0) {
              int i, k, op;
@@ -330,7 +346,8 @@ int bam_pad2unpad(samFile *in, samFile *out,  sam_hdr_t *h, faidx_t *fai)
              for (i = k = 0; i < n2; ++i)
                  if (cigar2[i]) cigar2[k++] = cigar2[i];
              n2 = k;
-            replace_cigar(b, n2, cigar2);
+            if (replace_cigar(b, n2, cigar2) < 0)
+                return -1;
          }
          /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
          if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
@@ -432,7 +449,7 @@ int main_pad2unpad(int argc, char *argv[])
      sam_hdr_t *h = 0, *h_fix = 0;
      faidx_t *fai = 0;
      int c, compress_level = -1, is_long_help = 0, no_pg = 0;
-    char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL;
+    char in_mode[5], out_mode[6], *fn_out = 0, *fn_fai = 0, *fn_out_idx = NULL;
      int ret=0;
      char *arg_list = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
@@ -479,8 +496,8 @@ int main_pad2unpad(int argc, char *argv[])
  
      // Load FASTA reference (also needed for SAM -> BAM if missing header)
      if (ga.reference) {
-        fn_list = samfaipath(ga.reference);
-        fai = fai_load(ga.reference);
+        fn_fai = fai_path(ga.reference);
+        fai = fai_load3(ga.reference, fn_fai, NULL, FAI_CREATE);
      }
      // open file handlers
      if ((in = sam_open_format(argv[optind], in_mode, &ga.in)) == 0) {
@@ -488,8 +505,8 @@ int main_pad2unpad(int argc, char *argv[])
          ret = 1;
          goto depad_end;
      }
-    if (fn_list && hts_set_fai_filename(in, fn_list) != 0) {
-        fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list);
+    if (fn_fai && hts_set_fai_filename(in, fn_fai) != 0) {
+        fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_fai);
          ret = 1;
          goto depad_end;
      }
@@ -572,7 +589,7 @@ depad_end:
          fprintf(samtools_stderr, "[depad] error on closing output file.\n");
          ret = 1;
      }
-    free(fn_list); free(fn_out);
+    free(fn_fai); free(fn_out);
      if (fn_out_idx)
          free(fn_out_idx);
      sam_global_args_free(&ga);
diff --git a/samtools/phase.c b/samtools/phase.c

index 871e7c30d4a74ed29f69eaf2f62ccb4b0be72c4e..50f7a8f0843952ef6d3193dfb52a48022f34e031 100644 (file)
--- a/samtools/phase.c
+++ b/samtools/phase.c
@@ -583,6 +583,7 @@ static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat
  int main_phase(int argc, char *argv[])
  {
      int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
+    int status = EXIT_SUCCESS;
      const bam_pileup1_t *plp;
      bam_plp_t iter;
      nseq_t *seqs;
@@ -785,6 +786,12 @@ int main_phase(int argc, char *argv[])
              return 1;
          }
      }
+
+    if (n < 0) {
+        print_error("phase", "error reading from '%s'", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      sam_hdr_destroy(g.fp_hdr);
      bam_plp_destroy(iter);
      sam_close(g.fp);
@@ -809,5 +816,5 @@ int main_phase(int argc, char *argv[])
      }
      free(g.arg_list);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c

index 6357eab5a3b5f6fc9d08cdbd2bbfe80a7ae06d4d..13ab556592dc4f8aedc54f5d9dd7dc8e9cfd7e96 100644 (file)
--- a/samtools/phase.c.pysam.c
+++ b/samtools/phase.c.pysam.c
@@ -585,6 +585,7 @@ static int start_output(phaseg_t *g, int c, const char *middle, const htsFormat
  int main_phase(int argc, char *argv[])
  {
      int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0;
+    int status = EXIT_SUCCESS;
      const bam_pileup1_t *plp;
      bam_plp_t iter;
      nseq_t *seqs;
@@ -787,6 +788,12 @@ int main_phase(int argc, char *argv[])
              return 1;
          }
      }
+
+    if (n < 0) {
+        print_error("phase", "error reading from '%s'", argv[optind]);
+        status = EXIT_FAILURE;
+    }
+
      sam_hdr_destroy(g.fp_hdr);
      bam_plp_destroy(iter);
      sam_close(g.fp);
@@ -811,5 +818,5 @@ int main_phase(int argc, char *argv[])
      }
      free(g.arg_list);
      sam_global_args_free(&ga);
-    return 0;
+    return status;
  }
diff --git a/samtools/sam_view.c b/samtools/sam_view.c

index c13aea85edf1f062d2a7f88a38bf1f1fbbf3a416..515eaa510849a1b7794be63e86f752a54fc35f36 100644 (file)
--- a/samtools/sam_view.c
+++ b/samtools/sam_view.c
@@ -1,6 +1,6 @@
  /*  sam_view.c -- SAM<->BAM<->CRAM conversion.
  
-    Copyright (C) 2009-2019 Genome Research Ltd.
+    Copyright (C) 2009-2021 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -37,20 +37,20 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/faidx.h"
  #include "htslib/khash.h"
  #include "htslib/thread_pool.h"
+#include "htslib/hts_expr.h"
  #include "samtools.h"
  #include "sam_opts.h"
  #include "bedidx.h"
  
-KHASH_SET_INIT_STR(rg)
-KHASH_SET_INIT_STR(tv)
+KHASH_SET_INIT_STR(str)
  
-typedef khash_t(rg) *rghash_t;
-typedef khash_t(tv) *tvhash_t;
+typedef khash_t(str) *strhash_t;
  
  // This structure contains the settings for a samview run
  typedef struct samview_settings {
-    rghash_t rghash;
-    tvhash_t tvhash;
+    strhash_t rghash;
+    strhash_t rnhash;
+    strhash_t tvhash;
      int min_mapQ;
      int flag_on;
      int flag_off;
@@ -65,13 +65,15 @@ typedef struct samview_settings {
      char** remove_aux;
      int multi_region;
      char* tag;
+    hts_filter_t *filter;
+    int remove_flag;
+    int add_flag;
  } samview_settings_t;
  
  
  // TODO Add declarations of these to a viable htslib or samtools header
  extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
  extern int bam_remove_B(bam1_t *b);
-extern char *samfaipath(const char *fn_ref);
  
  // Returns 0 to indicate read should be output 1 otherwise
  static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
@@ -98,19 +100,39 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
      if (settings->rghash) {
          uint8_t *s = bam_aux_get(b, "RG");
          if (s) {
-            khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1));
+            khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
              if (k == kh_end(settings->rghash)) return 1;
          }
      }
-    if (settings->tvhash && settings->tag) {
+    if (settings->tag) {
          uint8_t *s = bam_aux_get(b, settings->tag);
          if (s) {
-            khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1));
-            if (k == kh_end(settings->tvhash)) return 1;
+            if (settings->tvhash) {
+                char t[32], *val;
+                if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') {
+                    int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s));
+                    if (ret > 0) val = t;
+                    else return 1;
+                } else if (*s == 'A') {
+                    t[0] = *(s+1);
+                    t[1] = 0;
+                    val = t;
+                } else {
+                    val = (char *)(s+1);
+                }
+                khint_t k = kh_get(str, settings->tvhash, val);
+                if (k == kh_end(settings->tvhash)) return 1;
+            }
          } else {
              return 1;
          }
      }
+    if (settings->rnhash) {
+        const char* rn = bam_get_qname(b);
+        if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+            return 1;
+        }
+    }
      if (settings->library) {
          const char *p = bam_get_library((sam_hdr_t*)h, b);
          if (!p || strcmp(p, settings->library) != 0) return 1;
@@ -124,11 +146,43 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
              }
          }
      }
+
+    if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+        return 1;
+
      return 0;
  }
  
  static int usage(FILE *fp, int exit_status, int is_long_help);
  
+static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn)
+{
+    FILE *fp;
+    char buf[1024];
+    int ret = 0;
+    fp = fopen(fn, "r");
+    if (fp == NULL) {
+        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
+        char *d = strdup(buf);
+        if (d != NULL) {
+            kh_put(str, lookup, d, &ret);
+            if (ret == 0) free(d); /* Duplicate */
+        } else {
+            ret = -1;
+        }
+    }
+    if (ferror(fp)) ret = -1;
+    if (ret == -1) {
+        print_error_errno(subcmd, "failed to read \"%s\"", fn);
+    }
+    fclose(fp);
+    return (ret != -1) ? 0 : -1;
+}
+
  static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
  {
      char *d = strdup(name);
@@ -137,11 +191,11 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
      if (d == NULL) goto err;
  
      if (settings->rghash == NULL) {
-        settings->rghash = kh_init(rg);
+        settings->rghash = kh_init(str);
          if (settings->rghash == NULL) goto err;
      }
  
-    kh_put(rg, settings->rghash, d, &ret);
+    kh_put(str, settings->rghash, d, &ret);
      if (ret == -1) goto err;
      if (ret ==  0) free(d); /* Duplicate */
      return 0;
@@ -152,40 +206,28 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
      return -1;
  }
  
-static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn)
  {
-    FILE *fp;
-    char buf[1024];
-    int ret = 0;
-    if (settings->rghash == NULL) {
-        settings->rghash = kh_init(rg);
-        if (settings->rghash == NULL) {
+    if (settings->rnhash == NULL) {
+        settings->rnhash = kh_init(str);
+        if (settings->rnhash == NULL) {
              perror(NULL);
              return -1;
          }
      }
+    return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+}
  
-    fp = fopen(fn, "r");
-    if (fp == NULL) {
-        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
-        return -1;
-    }
-
-    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
-        char *d = strdup(buf);
-        if (d != NULL) {
-            kh_put(rg, settings->rghash, d, &ret);
-            if (ret == 0) free(d); /* Duplicate */
-        } else {
-            ret = -1;
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+{
+    if (settings->rghash == NULL) {
+        settings->rghash = kh_init(str);
+        if (settings->rghash == NULL) {
+            perror(NULL);
+            return -1;
          }
      }
-    if (ferror(fp)) ret = -1;
-    if (ret == -1) {
-        print_error_errno(subcmd, "failed to read \"%s\"", fn);
-    }
-    fclose(fp);
-    return (ret != -1) ? 0 : -1;
+    return populate_lookup_from_file(subcmd, settings->rghash, fn);
  }
  
  static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
@@ -196,11 +238,11 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings
      if (d == NULL) goto err;
  
      if (settings->tvhash == NULL) {
-        settings->tvhash = kh_init(tv);
+        settings->tvhash = kh_init(str);
          if (settings->tvhash == NULL) goto err;
      }
  
-    kh_put(tv, settings->tvhash, d, &ret);
+    kh_put(str, settings->tvhash, d, &ret);
      if (ret == -1) goto err;
      if (ret ==  0) free(d); /* Duplicate */
      return 0;
@@ -213,38 +255,14 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings
  
  static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn)
  {
-    FILE *fp;
-    char buf[1024];
-    int ret = 0;
      if (settings->tvhash == NULL) {
-        settings->tvhash = kh_init(tv);
+        settings->tvhash = kh_init(str);
          if (settings->tvhash == NULL) {
              perror(NULL);
              return -1;
          }
      }
-
-    fp = fopen(fn, "r");
-    if (fp == NULL) {
-        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
-        return -1;
-    }
-
-    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
-        char *d = strdup(buf);
-        if (d != NULL) {
-            kh_put(tv, settings->tvhash, d, &ret);
-            if (ret == 0) free(d); /* Duplicate */
-        } else {
-            ret = -1;
-        }
-    }
-    if (ferror(fp)) ret = -1;
-    if (ret == -1) {
-        print_error_errno(subcmd, "failed to read \"%s\"", fn);
-    }
-    fclose(fp);
-    return (ret != -1) ? 0 : -1;
+    return populate_lookup_from_file(subcmd, settings->tvhash, fn);
  }
  
  static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp)
@@ -259,6 +277,18 @@ static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t
      return r;
  }
  
+static inline void change_flag(bam1_t *b, samview_settings_t *settings)
+{
+    if (settings->add_flag)
+        b->core.flag |= settings->add_flag;
+
+    if (settings->remove_flag)
+        b->core.flag &= ~settings->remove_flag;
+}
+
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c)  ((c) + 128)
+
  int main_samview(int argc, char *argv[])
  {
      int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0;
@@ -266,8 +296,8 @@ int main_samview(int argc, char *argv[])
      samFile *in = 0, *out = 0, *un_out=0;
      FILE *fp_out = NULL;
      sam_hdr_t *header = NULL;
-    char out_mode[5], out_un_mode[5], *out_format = "";
-    char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+    char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = "";
+    char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0;
      char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      htsThreadPool p = {NULL, 0};
@@ -288,12 +318,59 @@ int main_samview(int argc, char *argv[])
          .library = NULL,
          .bed = NULL,
          .multi_region = 0,
-        .tag = NULL
+        .tag = NULL,
+        .filter = NULL,
+        .remove_flag = 0,
+        .add_flag = 0
      };
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
-        {"no-PG", no_argument, NULL, 1},
+        {"add-flags", required_argument, NULL, LONGOPT('a')},
+        {"bam", no_argument, NULL, 'b'},
+        {"count", no_argument, NULL, 'c'},
+        {"cram", no_argument, NULL, 'C'},
+        {"customised-index", no_argument, NULL, 'X'},
+        {"customized-index", no_argument, NULL, 'X'},
+        {"excl-flags", required_argument, NULL, 'F'},
+        {"exclude-flags", required_argument, NULL, 'F'},
+        {"expr", required_argument, NULL, 'e'},
+        {"expression", required_argument, NULL, 'e'},
+        {"fai-reference", required_argument, NULL, 't'},
+        {"fast", no_argument, NULL, '1'},
+        {"header-only", no_argument, NULL, 'H'},
+        {"help", no_argument, NULL, LONGOPT('?')},
+        {"library", required_argument, NULL, 'l'},
+        {"min-mapq", required_argument, NULL, 'q'},
+        {"min-MQ", required_argument, NULL, 'q'},
+        {"min-mq", required_argument, NULL, 'q'},
+        {"min-qlen", required_argument, NULL, 'm'},
+        {"no-header", no_argument, NULL, LONGOPT('H')},
+        {"no-PG", no_argument, NULL, LONGOPT('P')},
+        {"output", required_argument, NULL, 'o'},
+        {"output-unselected", required_argument, NULL, 'U'},
+        {"QNAME-file", required_argument, NULL, 'N'},
+        {"qname-file", required_argument, NULL, 'N'},
+        {"read-group", required_argument, NULL, 'r'},
+        {"read-group-file", required_argument, NULL, 'R'},
+        {"readgroup", required_argument, NULL, 'r'},
+        {"readgroup-file", required_argument, NULL, 'R'},
+        {"region-file", required_argument, NULL, LONGOPT('L')},
+        {"regions-file", required_argument, NULL, LONGOPT('L')},
+        {"remove-B", no_argument, NULL, 'B'},
+        {"remove-flags", required_argument, NULL, LONGOPT('r')},
+        {"remove-tag", required_argument, NULL, 'x'},
+        {"require-flags", required_argument, NULL, 'f'},
+        {"subsample", required_argument, NULL, LONGOPT('s')},
+        {"subsample-seed", required_argument, NULL, LONGOPT('S')},
+        {"tag", required_argument, NULL, 'd'},
+        {"tag-file", required_argument, NULL, 'D'},
+        {"target-file", required_argument, NULL, 'L'},
+        {"targets-file", required_argument, NULL, 'L'},
+        {"uncompressed", no_argument, NULL, 'u'},
+        {"unoutput", required_argument, NULL, 'U'},
+        {"use-index", no_argument, NULL, 'M'},
+        {"with-header", no_argument, NULL, 'h'},
          { NULL, 0, NULL, 0 }
      };
  
@@ -310,16 +387,11 @@ int main_samview(int argc, char *argv[])
      opterr = 0;
  
      while ((c = getopt_long(argc, argv,
-                            "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX",
+                            "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
                              lopts, NULL)) >= 0) {
          switch (c) {
          case 's':
-            if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
-                // Convert likely user input 0,1,2,... to pseudo-random
-                // values with more entropy and more bits set
-                srand(settings.subsam_seed);
-                settings.subsam_seed = rand();
-            }
+            settings.subsam_seed = strtol(optarg, &q, 10);
              if (q && *q == '.') {
                  settings.subsam_frac = strtod(q, &q);
                  if (*q) ret = 1;
@@ -332,24 +404,36 @@ int main_samview(int argc, char *argv[])
                  goto view_end;
              }
              break;
+        case LONGOPT('s'):
+            settings.subsam_frac = strtod(optarg, &q);
+            if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) {
+                print_error("view", "Incorrect sampling argument \"%s\"", optarg);
+                goto view_end;
+            }
+            break;
+        case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
          case 'm': settings.min_qlen = atoi(optarg); break;
          case 'c': is_count = 1; break;
          case 'S': break;
          case 'b': out_format = "b"; break;
          case 'C': out_format = "c"; break;
-        case 't': fn_list = strdup(optarg); break;
+        case 't': fn_fai = strdup(optarg); break;
          case 'h': is_header = 1; break;
          case 'H': is_header_only = 1; break;
+        case LONGOPT('H'): is_header = is_header_only = 0; break;
          case 'o': fn_out = strdup(optarg); break;
          case 'U': fn_un_out = strdup(optarg); break;
          case 'X': has_index_file = 1; break;
-        case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
-        case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
-        case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
+        case 'f': settings.flag_on |= bam_str2flag(optarg); break;
+        case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+        case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
          case 'q': settings.min_mapQ = atoi(optarg); break;
          case 'u': compress_level = 0; break;
          case '1': compress_level = 1; break;
          case 'l': settings.library = strdup(optarg); break;
+        case LONGOPT('L'):
+            settings.multi_region = 1;
+            // fall through
          case 'L':
              if ((settings.bed = bed_read(optarg)) == NULL) {
                  print_error_errno("view", "Could not read file \"%s\"", optarg);
@@ -369,8 +453,14 @@ int main_samview(int argc, char *argv[])
                  goto view_end;
              }
              break;
+        case 'N':
+            if (add_read_names_file("view", &settings, optarg) != 0) {
+                ret = 1;
+                goto view_end;
+            }
+            break;
          case 'd':
-            if (strlen(optarg) < 4 || optarg[2] != ':') {
+            if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
                  print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
                  ret = 1;
                  goto view_end;
@@ -391,7 +481,8 @@ int main_samview(int argc, char *argv[])
                  memcpy(settings.tag, optarg, 2);
              }
  
-            if (add_tag_value_single("view", &settings, optarg+3) != 0) {
+            if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) {
+                print_error("view", "Could not add tag:value \"%s\"", optarg);
                  ret = 1;
                  goto view_end;
              }
@@ -399,7 +490,7 @@ int main_samview(int argc, char *argv[])
          case 'D':
              // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
              // path translation as described at:
-            //   http://www.mingw.org/wiki/Posix_path_conversion
+            // http://www.mingw.org/wiki/Posix_path_conversion
              if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
                  print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
                  ret = 1;
@@ -430,6 +521,8 @@ int main_samview(int argc, char *argv[])
          //case 'x': out_format = "x"; break;
          //case 'X': out_format = "X"; break;
                   */
+        case LONGOPT('?'):
+            return usage(stdout, EXIT_SUCCESS, 1);
          case '?':
              if (optopt == '?') {  // '-?' appeared on command line
                  return usage(stdout, EXIT_SUCCESS, 1);
@@ -451,7 +544,7 @@ int main_samview(int argc, char *argv[])
          case 'x':
              {
                  if (strlen(optarg) != 2) {
-                    fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
+                    print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
                      return usage(stderr, EXIT_FAILURE, 0);
                  }
                  settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
@@ -459,13 +552,22 @@ int main_samview(int argc, char *argv[])
              }
              break;
          case 'M': settings.multi_region = 1; break;
-        case 1: no_pg = 1; break;
+        case LONGOPT('P'): no_pg = 1; break;
+        case 'e':
+            if (!(settings.filter = hts_filter_init(optarg))) {
+                print_error("main_samview", "Couldn't initialise filter");
+                return 1;
+            }
+            break;
+        case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
+        case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
          default:
              if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
                  return usage(stderr, EXIT_FAILURE, 0);
              break;
          }
      }
+    if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference);
      if (compress_level >= 0 && !*out_format) out_format = "b";
      if (is_header_only) is_header = 1;
      // File format auto-detection first
@@ -474,8 +576,7 @@ int main_samview(int argc, char *argv[])
      // Overridden by manual -b, -C
      if (*out_format)
          out_mode[1] = out_un_mode[1] = *out_format;
-    out_mode[2] = out_un_mode[2] = '\0';
-    // out_(un_)mode now 1 or 2 bytes long, followed by nul.
+    // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul.
      if (compress_level >= 0) {
          char tmp[2];
          tmp[0] = compress_level + '0'; tmp[1] = '\0';
@@ -486,20 +587,23 @@ int main_samview(int argc, char *argv[])
          print_error("view", "No input provided or missing option argument.");
          return usage(stderr, EXIT_FAILURE, 0); // potential memory leak...
      }
+    if (settings.subsam_seed != 0) {
+        // Convert likely user input 1,2,... to pseudo-random
+        // values with more entropy and more bits set
+        srand(settings.subsam_seed);
+        settings.subsam_seed = rand();
+    }
  
      fn_in = (optind < argc)? argv[optind] : "-";
-    // generate the fn_list if necessary
-    if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
-    // open file handlers
      if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
          print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
          ret = 1;
          goto view_end;
      }
  
-    if (fn_list) {
-        if (hts_set_fai_filename(in, fn_list) != 0) {
-            fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+    if (fn_fai) {
+        if (hts_set_fai_filename(in, fn_fai) != 0) {
+            fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
              ret = 1;
              goto view_end;
          }
@@ -518,9 +622,9 @@ int main_samview(int argc, char *argv[])
              ret = 1;
              goto view_end;
          }
-        if (fn_list) {
-            if (hts_set_fai_filename(out, fn_list) != 0) {
-                fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+        if (fn_fai) {
+            if (hts_set_fai_filename(out, fn_fai) != 0) {
+                fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
                  ret = 1;
                  goto view_end;
              }
@@ -565,9 +669,9 @@ int main_samview(int argc, char *argv[])
                  ret = 1;
                  goto view_end;
              }
-            if (fn_list) {
-                if (hts_set_fai_filename(un_out, fn_list) != 0) {
-                    fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+            if (fn_fai) {
+                if (hts_set_fai_filename(un_out, fn_fai) != 0) {
+                    fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
                      ret = 1;
                      goto view_end;
                  }
@@ -654,7 +758,10 @@ int main_samview(int argc, char *argv[])
                          // fetch alignments
                          while ((result = sam_itr_multi_next(in, iter, b)) >= 0) {
                              if (!process_aln(header, b, &settings)) {
-                                if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                                if (!is_count) {
+                                    change_flag(b, &settings);
+                                    if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                                }
                                  count++;
                              } else {
                                  if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
@@ -682,16 +789,20 @@ int main_samview(int argc, char *argv[])
          if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file
              bam1_t *b = bam_init1();
              int r;
+            errno = 0;
              while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
                  if (!process_aln(header, b, &settings)) {
-                    if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                    if (!is_count) {
+                        change_flag(b, &settings);
+                        if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                    }
                      count++;
                  } else {
                      if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
                  }
              }
              if (r < -1) {
-                fprintf(stderr, "[main_samview] truncated file.\n");
+                print_error_errno("view", "error reading file \"%s\"", fn_in);
                  ret = 1;
              }
              bam_destroy1(b);
@@ -722,7 +833,10 @@ int main_samview(int argc, char *argv[])
                  // fetch alignments
                  while ((result = sam_itr_next(in, iter, b)) >= 0) {
                      if (!process_aln(header, b, &settings)) {
-                        if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                        if (!is_count) {
+                            change_flag(b, &settings);
+                            if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                        }
                          count++;
                      } else {
                          if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
@@ -766,7 +880,7 @@ view_end:
      if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
      if (fp_out) fclose(fp_out);
  
-    free(fn_list); free(fn_out); free(settings.library);  free(fn_un_out);
+    free(fn_fai); free(fn_out); free(settings.library);  free(fn_un_out);
      sam_global_args_free(&ga);
      if ( header ) sam_hdr_destroy(header);
      if (settings.bed) bed_destroy(settings.bed);
@@ -774,13 +888,19 @@ view_end:
          khint_t k;
          for (k = 0; k < kh_end(settings.rghash); ++k)
              if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k));
-        kh_destroy(rg, settings.rghash);
+        kh_destroy(str, settings.rghash);
+    }
+    if (settings.rnhash) {
+        khint_t k;
+        for (k = 0; k < kh_end(settings.rnhash); ++k)
+            if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k));
+        kh_destroy(str, settings.rnhash);
      }
      if (settings.tvhash) {
          khint_t k;
          for (k = 0; k < kh_end(settings.tvhash); ++k)
              if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
-        kh_destroy(tv, settings.tvhash);
+        kh_destroy(str, settings.tvhash);
      }
      if (settings.remove_aux_len) {
          free(settings.remove_aux);
@@ -788,6 +908,8 @@ view_end:
      if (settings.tag) {
          free(settings.tag);
      }
+    if (settings.filter)
+        hts_filter_free(settings.filter);
  
      if (p.pool)
          hts_tpool_destroy(p.pool);
@@ -807,47 +929,52 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
  "\n"
  "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
  "\n"
-"Options:\n"
-// output options
-"  -b       output BAM\n"
-"  -C       output CRAM (requires -T)\n"
-"  -1       use fast BAM compression (implies -b)\n"
-"  -u       uncompressed BAM output (implies -b)\n"
-"  -h       include header in SAM output\n"
-"  -H       print SAM header only (no alignments)\n"
-"  -c       print only the count of matching records\n"
-"  -o FILE  output file name [stdout]\n"
-"  -U FILE  output reads not selected by filters to FILE [null]\n"
-// extra input
-"  -t FILE  FILE listing reference names and lengths (see long help) [null]\n"
-"  -X       include customized index file\n"
-// read filters
-"  -L FILE  only include reads overlapping this BED FILE [null]\n"
-"  -r STR   only include reads in read group STR [null]\n"
-"  -R FILE  only include reads with read group listed in FILE [null]\n"
-"  -d STR:STR\n"
-"           only include reads with tag STR and associated value STR [null]\n"
-"  -D STR:FILE\n"
-"           only include reads with tag STR and associated values listed in\n"
-"           FILE [null]\n"
-"  -q INT   only include reads with mapping quality >= INT [0]\n"
-"  -l STR   only include reads in library STR [null]\n"
-"  -m INT   only include reads with number of CIGAR operations consuming\n"
-"           query sequence >= INT [0]\n"
-"  -f INT   only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT   only include reads with none of the FLAGS in INT present [0]\n"       //   F&x == 0
-"  -G INT   only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
-"  -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
-"           fraction of templates/read pairs to keep; INT part sets seed)\n"
-"  -M       use the multi-region iterator (increases the speed, removes\n"
-"           duplicates and outputs the reads as they are ordered in the file)\n"
-// read processing
-"  -x STR   read tag to strip (repeatable) [null]\n"
-"  -B       collapse the backward CIGAR operation\n"
-// general options
-"  -?       print long help, including note about region specification\n"
-"  -S       ignored (input format is auto-detected)\n"
-"  --no-PG  do not add a PG line\n");
+"Output options:\n"
+"  -b, --bam                  Output BAM\n"
+"  -C, --cram                 Output CRAM (requires -T)\n"
+"  -1, --fast                 Use fast BAM compression (implies --bam)\n"
+"  -u, --uncompressed         Uncompressed BAM output (implies --bam)\n"
+"  -h, --with-header          Include header in SAM output\n"
+"  -H, --header-only          Print SAM header only (no alignments)\n"
+"      --no-header            Print SAM alignment records only [default]\n"
+"  -c, --count                Print only the count of matching records\n"
+"  -o, --output FILE          Write output to FILE [standard output]\n"
+"  -U, --unoutput FILE, --output-unselected FILE\n"
+"                             Output reads not selected by filters to FILE\n"
+"Input options:\n"
+"  -t, --fai-reference FILE   FILE listing reference names and lengths\n"
+"  -M, --use-index            Use index and multi-region iterator for regions\n"
+"      --region[s]-file FILE  Use index to include only reads overlapping FILE\n"
+"  -X, --customized-index     Expect extra index file argument after <in.bam>\n"
+"\n"
+"Filtering options (Only include in output reads that...):\n"
+"  -L, --target[s]-file FILE  ...overlap (BED) regions in FILE\n"
+"  -r, --read-group STR       ...are in read group STR\n"
+"  -R, --read-group-file FILE ...are in a read group listed in FILE\n"
+"  -N, --qname-file FILE      ...whose read name is listed in FILE\n"
+"  -d, --tag STR1[:STR2]      ...have a tag STR1 (with associated value STR2)\n"
+"  -D, --tag-file STR:FILE    ...have a tag STR whose value is listed in FILE\n"
+"  -q, --min-MQ INT           ...have mapping quality >= INT\n"
+"  -l, --library STR          ...are in library STR\n"
+"  -m, --min-qlen INT         ...cover >= INT query bases (as measured via CIGAR)\n"
+"  -e, --expr STR             ...match the filter expression STR\n"
+"  -f, --require-flags FLAG   ...have all of the FLAGs present\n"             //   F&x == x
+"  -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n"            //   F&x == 0
+"  -G FLAG                    EXCLUDE reads with all of the FLAGs present\n"  // !(F&x == x)  TODO long option
+"      --subsample FLOAT      Keep only FLOAT fraction of templates/read pairs\n"
+"      --subsample-seed INT   Influence WHICH reads are kept in subsampling [0]\n"
+"  -s INT.FRAC                Same as --subsample 0.FRAC --subsample-seed INT\n"
+"\n"
+"Processing options:\n"
+"      --add-flags FLAG       Add FLAGs to reads\n"
+"      --remove-flags FLAG    Remove FLAGs from reads\n"
+"  -x, --remove-tag STR       Strip tag STR from reads (option may be repeated)\n"
+"  -B, --remove-B             Collapse the backward CIGAR operation\n"
+"\n"
+"General options:\n"
+"  -?, --help   Print long help, including note about region specification\n"
+"  -S           Ignored (input format is auto-detected)\n"
+"      --no-PG  Do not add a PG line\n");
  
      sam_global_opt_help(fp, "-.O.T@..");
      fprintf(fp, "\n");
@@ -887,23 +1014,16 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
  "\n"
  "6. Option `-u' is preferred over `-b' when the output is piped to\n"
  "   another samtools command.\n"
+"\n"
+"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n"
+"   command-line region arguments to be computed using the multi-region iterator\n"
+"   and an index. This increases speed, omits duplicates, and outputs the reads\n"
+"   as they are ordered in the input SAM/BAM/CRAM file.\n"
+"\n"
+"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n"
+"   together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n"
+"   so using both causes one of the specified BED files to be ignored.\n"
  "\n");
  
      return exit_status;
  }
-
-int main_import(int argc, char *argv[])
-{
-    int argc2, ret;
-    char **argv2;
-    if (argc != 4) {
-        fprintf(stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
-        return 1;
-    }
-    argc2 = 6;
-    argv2 = calloc(6, sizeof(char*));
-    argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2];
-    ret = main_samview(argc2, argv2);
-    free(argv2);
-    return ret;
-}
diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c

index 6153ee833ea79df621aa5fb4efb16808dec9f0e7..42c42e462e07ce0b4948ac3fb25744d7b5a6811a 100644 (file)
--- a/samtools/sam_view.c.pysam.c
+++ b/samtools/sam_view.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  sam_view.c -- SAM<->BAM<->CRAM conversion.
  
-    Copyright (C) 2009-2019 Genome Research Ltd.
+    Copyright (C) 2009-2021 Genome Research Ltd.
      Portions copyright (C) 2009, 2011, 2012 Broad Institute.
  
      Author: Heng Li <lh3@sanger.ac.uk>
@@ -39,20 +39,20 @@ DEALINGS IN THE SOFTWARE.  */
  #include "htslib/faidx.h"
  #include "htslib/khash.h"
  #include "htslib/thread_pool.h"
+#include "htslib/hts_expr.h"
  #include "samtools.h"
  #include "sam_opts.h"
  #include "bedidx.h"
  
-KHASH_SET_INIT_STR(rg)
-KHASH_SET_INIT_STR(tv)
+KHASH_SET_INIT_STR(str)
  
-typedef khash_t(rg) *rghash_t;
-typedef khash_t(tv) *tvhash_t;
+typedef khash_t(str) *strhash_t;
  
  // This structure contains the settings for a samview run
  typedef struct samview_settings {
-    rghash_t rghash;
-    tvhash_t tvhash;
+    strhash_t rghash;
+    strhash_t rnhash;
+    strhash_t tvhash;
      int min_mapQ;
      int flag_on;
      int flag_off;
@@ -67,13 +67,15 @@ typedef struct samview_settings {
      char** remove_aux;
      int multi_region;
      char* tag;
+    hts_filter_t *filter;
+    int remove_flag;
+    int add_flag;
  } samview_settings_t;
  
  
  // TODO Add declarations of these to a viable htslib or samtools header
  extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
  extern int bam_remove_B(bam1_t *b);
-extern char *samfaipath(const char *fn_ref);
  
  // Returns 0 to indicate read should be output 1 otherwise
  static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
@@ -100,19 +102,39 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
      if (settings->rghash) {
          uint8_t *s = bam_aux_get(b, "RG");
          if (s) {
-            khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1));
+            khint_t k = kh_get(str, settings->rghash, (char*)(s + 1));
              if (k == kh_end(settings->rghash)) return 1;
          }
      }
-    if (settings->tvhash && settings->tag) {
+    if (settings->tag) {
          uint8_t *s = bam_aux_get(b, settings->tag);
          if (s) {
-            khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1));
-            if (k == kh_end(settings->tvhash)) return 1;
+            if (settings->tvhash) {
+                char t[32], *val;
+                if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') {
+                    int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s));
+                    if (ret > 0) val = t;
+                    else return 1;
+                } else if (*s == 'A') {
+                    t[0] = *(s+1);
+                    t[1] = 0;
+                    val = t;
+                } else {
+                    val = (char *)(s+1);
+                }
+                khint_t k = kh_get(str, settings->tvhash, val);
+                if (k == kh_end(settings->tvhash)) return 1;
+            }
          } else {
              return 1;
          }
      }
+    if (settings->rnhash) {
+        const char* rn = bam_get_qname(b);
+        if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) {
+            return 1;
+        }
+    }
      if (settings->library) {
          const char *p = bam_get_library((sam_hdr_t*)h, b);
          if (!p || strcmp(p, settings->library) != 0) return 1;
@@ -126,11 +148,43 @@ static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settin
              }
          }
      }
+
+    if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+        return 1;
+
      return 0;
  }
  
  static int usage(FILE *fp, int exit_status, int is_long_help);
  
+static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn)
+{
+    FILE *fp;
+    char buf[1024];
+    int ret = 0;
+    fp = fopen(fn, "r");
+    if (fp == NULL) {
+        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
+        return -1;
+    }
+
+    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
+        char *d = strdup(buf);
+        if (d != NULL) {
+            kh_put(str, lookup, d, &ret);
+            if (ret == 0) free(d); /* Duplicate */
+        } else {
+            ret = -1;
+        }
+    }
+    if (ferror(fp)) ret = -1;
+    if (ret == -1) {
+        print_error_errno(subcmd, "failed to read \"%s\"", fn);
+    }
+    fclose(fp);
+    return (ret != -1) ? 0 : -1;
+}
+
  static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name)
  {
      char *d = strdup(name);
@@ -139,11 +193,11 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
      if (d == NULL) goto err;
  
      if (settings->rghash == NULL) {
-        settings->rghash = kh_init(rg);
+        settings->rghash = kh_init(str);
          if (settings->rghash == NULL) goto err;
      }
  
-    kh_put(rg, settings->rghash, d, &ret);
+    kh_put(str, settings->rghash, d, &ret);
      if (ret == -1) goto err;
      if (ret ==  0) free(d); /* Duplicate */
      return 0;
@@ -154,40 +208,28 @@ static int add_read_group_single(const char *subcmd, samview_settings_t *setting
      return -1;
  }
  
-static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn)
  {
-    FILE *fp;
-    char buf[1024];
-    int ret = 0;
-    if (settings->rghash == NULL) {
-        settings->rghash = kh_init(rg);
-        if (settings->rghash == NULL) {
+    if (settings->rnhash == NULL) {
+        settings->rnhash = kh_init(str);
+        if (settings->rnhash == NULL) {
              perror(NULL);
              return -1;
          }
      }
+    return populate_lookup_from_file(subcmd, settings->rnhash, fn);
+}
  
-    fp = fopen(fn, "r");
-    if (fp == NULL) {
-        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
-        return -1;
-    }
-
-    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
-        char *d = strdup(buf);
-        if (d != NULL) {
-            kh_put(rg, settings->rghash, d, &ret);
-            if (ret == 0) free(d); /* Duplicate */
-        } else {
-            ret = -1;
+static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn)
+{
+    if (settings->rghash == NULL) {
+        settings->rghash = kh_init(str);
+        if (settings->rghash == NULL) {
+            perror(NULL);
+            return -1;
          }
      }
-    if (ferror(fp)) ret = -1;
-    if (ret == -1) {
-        print_error_errno(subcmd, "failed to read \"%s\"", fn);
-    }
-    fclose(fp);
-    return (ret != -1) ? 0 : -1;
+    return populate_lookup_from_file(subcmd, settings->rghash, fn);
  }
  
  static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name)
@@ -198,11 +240,11 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings
      if (d == NULL) goto err;
  
      if (settings->tvhash == NULL) {
-        settings->tvhash = kh_init(tv);
+        settings->tvhash = kh_init(str);
          if (settings->tvhash == NULL) goto err;
      }
  
-    kh_put(tv, settings->tvhash, d, &ret);
+    kh_put(str, settings->tvhash, d, &ret);
      if (ret == -1) goto err;
      if (ret ==  0) free(d); /* Duplicate */
      return 0;
@@ -215,38 +257,14 @@ static int add_tag_value_single(const char *subcmd, samview_settings_t *settings
  
  static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn)
  {
-    FILE *fp;
-    char buf[1024];
-    int ret = 0;
      if (settings->tvhash == NULL) {
-        settings->tvhash = kh_init(tv);
+        settings->tvhash = kh_init(str);
          if (settings->tvhash == NULL) {
              perror(NULL);
              return -1;
          }
      }
-
-    fp = fopen(fn, "r");
-    if (fp == NULL) {
-        print_error_errno(subcmd, "failed to open \"%s\" for reading", fn);
-        return -1;
-    }
-
-    while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) {
-        char *d = strdup(buf);
-        if (d != NULL) {
-            kh_put(tv, settings->tvhash, d, &ret);
-            if (ret == 0) free(d); /* Duplicate */
-        } else {
-            ret = -1;
-        }
-    }
-    if (ferror(fp)) ret = -1;
-    if (ret == -1) {
-        print_error_errno(subcmd, "failed to read \"%s\"", fn);
-    }
-    fclose(fp);
-    return (ret != -1) ? 0 : -1;
+    return populate_lookup_from_file(subcmd, settings->tvhash, fn);
  }
  
  static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp)
@@ -261,6 +279,18 @@ static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t
      return r;
  }
  
+static inline void change_flag(bam1_t *b, samview_settings_t *settings)
+{
+    if (settings->add_flag)
+        b->core.flag |= settings->add_flag;
+
+    if (settings->remove_flag)
+        b->core.flag &= ~settings->remove_flag;
+}
+
+// Make mnemonic distinct values for longoption-only options
+#define LONGOPT(c)  ((c) + 128)
+
  int main_samview(int argc, char *argv[])
  {
      int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0;
@@ -268,8 +298,8 @@ int main_samview(int argc, char *argv[])
      samFile *in = 0, *out = 0, *un_out=0;
      FILE *fp_out = NULL;
      sam_hdr_t *header = NULL;
-    char out_mode[5], out_un_mode[5], *out_format = "";
-    char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
+    char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = "";
+    char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0;
      char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL;
      sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
      htsThreadPool p = {NULL, 0};
@@ -290,12 +320,59 @@ int main_samview(int argc, char *argv[])
          .library = NULL,
          .bed = NULL,
          .multi_region = 0,
-        .tag = NULL
+        .tag = NULL,
+        .filter = NULL,
+        .remove_flag = 0,
+        .add_flag = 0
      };
  
      static const struct option lopts[] = {
          SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
-        {"no-PG", no_argument, NULL, 1},
+        {"add-flags", required_argument, NULL, LONGOPT('a')},
+        {"bam", no_argument, NULL, 'b'},
+        {"count", no_argument, NULL, 'c'},
+        {"cram", no_argument, NULL, 'C'},
+        {"customised-index", no_argument, NULL, 'X'},
+        {"customized-index", no_argument, NULL, 'X'},
+        {"excl-flags", required_argument, NULL, 'F'},
+        {"exclude-flags", required_argument, NULL, 'F'},
+        {"expr", required_argument, NULL, 'e'},
+        {"expression", required_argument, NULL, 'e'},
+        {"fai-reference", required_argument, NULL, 't'},
+        {"fast", no_argument, NULL, '1'},
+        {"header-only", no_argument, NULL, 'H'},
+        {"help", no_argument, NULL, LONGOPT('?')},
+        {"library", required_argument, NULL, 'l'},
+        {"min-mapq", required_argument, NULL, 'q'},
+        {"min-MQ", required_argument, NULL, 'q'},
+        {"min-mq", required_argument, NULL, 'q'},
+        {"min-qlen", required_argument, NULL, 'm'},
+        {"no-header", no_argument, NULL, LONGOPT('H')},
+        {"no-PG", no_argument, NULL, LONGOPT('P')},
+        {"output", required_argument, NULL, 'o'},
+        {"output-unselected", required_argument, NULL, 'U'},
+        {"QNAME-file", required_argument, NULL, 'N'},
+        {"qname-file", required_argument, NULL, 'N'},
+        {"read-group", required_argument, NULL, 'r'},
+        {"read-group-file", required_argument, NULL, 'R'},
+        {"readgroup", required_argument, NULL, 'r'},
+        {"readgroup-file", required_argument, NULL, 'R'},
+        {"region-file", required_argument, NULL, LONGOPT('L')},
+        {"regions-file", required_argument, NULL, LONGOPT('L')},
+        {"remove-B", no_argument, NULL, 'B'},
+        {"remove-flags", required_argument, NULL, LONGOPT('r')},
+        {"remove-tag", required_argument, NULL, 'x'},
+        {"require-flags", required_argument, NULL, 'f'},
+        {"subsample", required_argument, NULL, LONGOPT('s')},
+        {"subsample-seed", required_argument, NULL, LONGOPT('S')},
+        {"tag", required_argument, NULL, 'd'},
+        {"tag-file", required_argument, NULL, 'D'},
+        {"target-file", required_argument, NULL, 'L'},
+        {"targets-file", required_argument, NULL, 'L'},
+        {"uncompressed", no_argument, NULL, 'u'},
+        {"unoutput", required_argument, NULL, 'U'},
+        {"use-index", no_argument, NULL, 'M'},
+        {"with-header", no_argument, NULL, 'h'},
          { NULL, 0, NULL, 0 }
      };
  
@@ -312,16 +389,11 @@ int main_samview(int argc, char *argv[])
      opterr = 0;
  
      while ((c = getopt_long(argc, argv,
-                            "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX",
+                            "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
                              lopts, NULL)) >= 0) {
          switch (c) {
          case 's':
-            if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
-                // Convert likely user input 0,1,2,... to pseudo-random
-                // values with more entropy and more bits set
-                srand(settings.subsam_seed);
-                settings.subsam_seed = rand();
-            }
+            settings.subsam_seed = strtol(optarg, &q, 10);
              if (q && *q == '.') {
                  settings.subsam_frac = strtod(q, &q);
                  if (*q) ret = 1;
@@ -334,24 +406,36 @@ int main_samview(int argc, char *argv[])
                  goto view_end;
              }
              break;
+        case LONGOPT('s'):
+            settings.subsam_frac = strtod(optarg, &q);
+            if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) {
+                print_error("view", "Incorrect sampling argument \"%s\"", optarg);
+                goto view_end;
+            }
+            break;
+        case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
          case 'm': settings.min_qlen = atoi(optarg); break;
          case 'c': is_count = 1; break;
          case 'S': break;
          case 'b': out_format = "b"; break;
          case 'C': out_format = "c"; break;
-        case 't': fn_list = strdup(optarg); break;
+        case 't': fn_fai = strdup(optarg); break;
          case 'h': is_header = 1; break;
          case 'H': is_header_only = 1; break;
+        case LONGOPT('H'): is_header = is_header_only = 0; break;
          case 'o': fn_out = strdup(optarg); break;
          case 'U': fn_un_out = strdup(optarg); break;
          case 'X': has_index_file = 1; break;
-        case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
-        case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
-        case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
+        case 'f': settings.flag_on |= bam_str2flag(optarg); break;
+        case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+        case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
          case 'q': settings.min_mapQ = atoi(optarg); break;
          case 'u': compress_level = 0; break;
          case '1': compress_level = 1; break;
          case 'l': settings.library = strdup(optarg); break;
+        case LONGOPT('L'):
+            settings.multi_region = 1;
+            // fall through
          case 'L':
              if ((settings.bed = bed_read(optarg)) == NULL) {
                  print_error_errno("view", "Could not read file \"%s\"", optarg);
@@ -371,8 +455,14 @@ int main_samview(int argc, char *argv[])
                  goto view_end;
              }
              break;
+        case 'N':
+            if (add_read_names_file("view", &settings, optarg) != 0) {
+                ret = 1;
+                goto view_end;
+            }
+            break;
          case 'd':
-            if (strlen(optarg) < 4 || optarg[2] != ':') {
+            if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
                  print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
                  ret = 1;
                  goto view_end;
@@ -393,7 +483,8 @@ int main_samview(int argc, char *argv[])
                  memcpy(settings.tag, optarg, 2);
              }
  
-            if (add_tag_value_single("view", &settings, optarg+3) != 0) {
+            if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) {
+                print_error("view", "Could not add tag:value \"%s\"", optarg);
                  ret = 1;
                  goto view_end;
              }
@@ -401,7 +492,7 @@ int main_samview(int argc, char *argv[])
          case 'D':
              // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
              // path translation as described at:
-            //   http://www.mingw.org/wiki/Posix_path_conversion
+            // http://www.mingw.org/wiki/Posix_path_conversion
              if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
                  print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
                  ret = 1;
@@ -432,6 +523,8 @@ int main_samview(int argc, char *argv[])
          //case 'x': out_format = "x"; break;
          //case 'X': out_format = "X"; break;
                   */
+        case LONGOPT('?'):
+            return usage(samtools_stdout, EXIT_SUCCESS, 1);
          case '?':
              if (optopt == '?') {  // '-?' appeared on command line
                  return usage(samtools_stdout, EXIT_SUCCESS, 1);
@@ -453,7 +546,7 @@ int main_samview(int argc, char *argv[])
          case 'x':
              {
                  if (strlen(optarg) != 2) {
-                    fprintf(samtools_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n");
+                    print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
                      return usage(samtools_stderr, EXIT_FAILURE, 0);
                  }
                  settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
@@ -461,13 +554,22 @@ int main_samview(int argc, char *argv[])
              }
              break;
          case 'M': settings.multi_region = 1; break;
-        case 1: no_pg = 1; break;
+        case LONGOPT('P'): no_pg = 1; break;
+        case 'e':
+            // Compile the -e/--expr filter expression; NULL means it could
+            // not be initialised.
+            if (!(settings.filter = hts_filter_init(optarg))) {
+                print_error("main_samview", "Couldn't initialise filter");
+                // Leave through view_end, like the other option errors in
+                // this switch, so that anything allocated by options parsed
+                // earlier is released instead of leaked by a bare return.
+                ret = 1;
+                goto view_end;
+            }
+            break;
+        case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
+        case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
          default:
              if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
                  return usage(samtools_stderr, EXIT_FAILURE, 0);
              break;
          }
      }
+    if (fn_fai == 0 && ga.reference) fn_fai = fai_path(ga.reference);
      if (compress_level >= 0 && !*out_format) out_format = "b";
      if (is_header_only) is_header = 1;
      // File format auto-detection first
@@ -476,8 +578,7 @@ int main_samview(int argc, char *argv[])
      // Overridden by manual -b, -C
      if (*out_format)
          out_mode[1] = out_un_mode[1] = *out_format;
-    out_mode[2] = out_un_mode[2] = '\0';
-    // out_(un_)mode now 1 or 2 bytes long, followed by nul.
+    // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul.
      if (compress_level >= 0) {
          char tmp[2];
          tmp[0] = compress_level + '0'; tmp[1] = '\0';
@@ -488,20 +589,23 @@ int main_samview(int argc, char *argv[])
          print_error("view", "No input provided or missing option argument.");
          return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak...
      }
+    if (settings.subsam_seed != 0) {
+        // Convert likely user input 1,2,... to pseudo-random
+        // values with more entropy and more bits set
+        srand(settings.subsam_seed);
+        settings.subsam_seed = rand();
+    }
  
      fn_in = (optind < argc)? argv[optind] : "-";
-    // generate the fn_list if necessary
-    if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference);
-    // open file handlers
      if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) {
          print_error_errno("view", "failed to open \"%s\" for reading", fn_in);
          ret = 1;
          goto view_end;
      }
  
-    if (fn_list) {
-        if (hts_set_fai_filename(in, fn_list) != 0) {
-            fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+    if (fn_fai) {
+        if (hts_set_fai_filename(in, fn_fai) != 0) {
+            fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
              ret = 1;
              goto view_end;
          }
@@ -520,9 +624,9 @@ int main_samview(int argc, char *argv[])
              ret = 1;
              goto view_end;
          }
-        if (fn_list) {
-            if (hts_set_fai_filename(out, fn_list) != 0) {
-                fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+        if (fn_fai) {
+            if (hts_set_fai_filename(out, fn_fai) != 0) {
+                fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
                  ret = 1;
                  goto view_end;
              }
@@ -567,9 +671,9 @@ int main_samview(int argc, char *argv[])
                  ret = 1;
                  goto view_end;
              }
-            if (fn_list) {
-                if (hts_set_fai_filename(un_out, fn_list) != 0) {
-                    fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list);
+            if (fn_fai) {
+                if (hts_set_fai_filename(un_out, fn_fai) != 0) {
+                    fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_fai);
                      ret = 1;
                      goto view_end;
                  }
@@ -656,7 +760,10 @@ int main_samview(int argc, char *argv[])
                          // fetch alignments
                          while ((result = sam_itr_multi_next(in, iter, b)) >= 0) {
                              if (!process_aln(header, b, &settings)) {
-                                if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                                if (!is_count) {
+                                    change_flag(b, &settings);
+                                    if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                                }
                                  count++;
                              } else {
                                  if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
@@ -684,16 +791,20 @@ int main_samview(int argc, char *argv[])
          if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file
              bam1_t *b = bam_init1();
              int r;
+            errno = 0;
              while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in'
                  if (!process_aln(header, b, &settings)) {
-                    if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                    if (!is_count) {
+                        change_flag(b, &settings);
+                        if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                    }
                      count++;
                  } else {
                      if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
                  }
              }
              if (r < -1) {
-                fprintf(samtools_stderr, "[main_samview] truncated file.\n");
+                print_error_errno("view", "error reading file \"%s\"", fn_in);
                  ret = 1;
              }
              bam_destroy1(b);
@@ -724,7 +835,10 @@ int main_samview(int argc, char *argv[])
                  // fetch alignments
                  while ((result = sam_itr_next(in, iter, b)) >= 0) {
                      if (!process_aln(header, b, &settings)) {
-                        if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; }
+                        if (!is_count) {
+                            change_flag(b, &settings);
+                            if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
+                        }
                          count++;
                      } else {
                          if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
@@ -768,7 +882,7 @@ view_end:
      if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
      if (fp_out) fclose(fp_out);
  
-    free(fn_list); free(fn_out); free(settings.library);  free(fn_un_out);
+    free(fn_fai); free(fn_out); free(settings.library);  free(fn_un_out);
      sam_global_args_free(&ga);
      if ( header ) sam_hdr_destroy(header);
      if (settings.bed) bed_destroy(settings.bed);
@@ -776,13 +890,19 @@ view_end:
          khint_t k;
          for (k = 0; k < kh_end(settings.rghash); ++k)
              if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k));
-        kh_destroy(rg, settings.rghash);
+        kh_destroy(str, settings.rghash);
+    }
+    if (settings.rnhash) {
+        khint_t k;
+        for (k = 0; k < kh_end(settings.rnhash); ++k)
+            if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k));
+        kh_destroy(str, settings.rnhash);
      }
      if (settings.tvhash) {
          khint_t k;
          for (k = 0; k < kh_end(settings.tvhash); ++k)
              if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
-        kh_destroy(tv, settings.tvhash);
+        kh_destroy(str, settings.tvhash);
      }
      if (settings.remove_aux_len) {
          free(settings.remove_aux);
@@ -790,6 +910,8 @@ view_end:
      if (settings.tag) {
          free(settings.tag);
      }
+    if (settings.filter)
+        hts_filter_free(settings.filter);
  
      if (p.pool)
          hts_tpool_destroy(p.pool);
@@ -809,47 +931,52 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
  "\n"
  "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
  "\n"
-"Options:\n"
-// output options
-"  -b       output BAM\n"
-"  -C       output CRAM (requires -T)\n"
-"  -1       use fast BAM compression (implies -b)\n"
-"  -u       uncompressed BAM output (implies -b)\n"
-"  -h       include header in SAM output\n"
-"  -H       print SAM header only (no alignments)\n"
-"  -c       print only the count of matching records\n"
-"  -o FILE  output file name [samtools_stdout]\n"
-"  -U FILE  output reads not selected by filters to FILE [null]\n"
-// extra input
-"  -t FILE  FILE listing reference names and lengths (see long help) [null]\n"
-"  -X       include customized index file\n"
-// read filters
-"  -L FILE  only include reads overlapping this BED FILE [null]\n"
-"  -r STR   only include reads in read group STR [null]\n"
-"  -R FILE  only include reads with read group listed in FILE [null]\n"
-"  -d STR:STR\n"
-"           only include reads with tag STR and associated value STR [null]\n"
-"  -D STR:FILE\n"
-"           only include reads with tag STR and associated values listed in\n"
-"           FILE [null]\n"
-"  -q INT   only include reads with mapping quality >= INT [0]\n"
-"  -l STR   only include reads in library STR [null]\n"
-"  -m INT   only include reads with number of CIGAR operations consuming\n"
-"           query sequence >= INT [0]\n"
-"  -f INT   only include reads with all  of the FLAGs in INT present [0]\n"       //   F&x == x
-"  -F INT   only include reads with none of the FLAGS in INT present [0]\n"       //   F&x == 0
-"  -G INT   only EXCLUDE reads with all  of the FLAGs in INT present [0]\n"       // !(F&x == x)
-"  -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
-"           fraction of templates/read pairs to keep; INT part sets seed)\n"
-"  -M       use the multi-region iterator (increases the speed, removes\n"
-"           duplicates and outputs the reads as they are ordered in the file)\n"
-// read processing
-"  -x STR   read tag to strip (repeatable) [null]\n"
-"  -B       collapse the backward CIGAR operation\n"
-// general options
-"  -?       print long help, including note about region specification\n"
-"  -S       ignored (input format is auto-detected)\n"
-"  --no-PG  do not add a PG line\n");
+"Output options:\n"
+"  -b, --bam                  Output BAM\n"
+"  -C, --cram                 Output CRAM (requires -T)\n"
+"  -1, --fast                 Use fast BAM compression (implies --bam)\n"
+"  -u, --uncompressed         Uncompressed BAM output (implies --bam)\n"
+"  -h, --with-header          Include header in SAM output\n"
+"  -H, --header-only          Print SAM header only (no alignments)\n"
+"      --no-header            Print SAM alignment records only [default]\n"
+"  -c, --count                Print only the count of matching records\n"
+"  -o, --output FILE          Write output to FILE [standard output]\n"
+"  -U, --unoutput FILE, --output-unselected FILE\n"
+"                             Output reads not selected by filters to FILE\n"
+"Input options:\n"
+"  -t, --fai-reference FILE   FILE listing reference names and lengths\n"
+"  -M, --use-index            Use index and multi-region iterator for regions\n"
+"      --region[s]-file FILE  Use index to include only reads overlapping FILE\n"
+"  -X, --customized-index     Expect extra index file argument after <in.bam>\n"
+"\n"
+"Filtering options (Only include in output reads that...):\n"
+"  -L, --target[s]-file FILE  ...overlap (BED) regions in FILE\n"
+"  -r, --read-group STR       ...are in read group STR\n"
+"  -R, --read-group-file FILE ...are in a read group listed in FILE\n"
+"  -N, --qname-file FILE      ...whose read name is listed in FILE\n"
+"  -d, --tag STR1[:STR2]      ...have a tag STR1 (with associated value STR2)\n"
+"  -D, --tag-file STR:FILE    ...have a tag STR whose value is listed in FILE\n"
+"  -q, --min-MQ INT           ...have mapping quality >= INT\n"
+"  -l, --library STR          ...are in library STR\n"
+"  -m, --min-qlen INT         ...cover >= INT query bases (as measured via CIGAR)\n"
+"  -e, --expr STR             ...match the filter expression STR\n"
+"  -f, --require-flags FLAG   ...have all of the FLAGs present\n"             //   F&x == x
+"  -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n"            //   F&x == 0
+"  -G FLAG                    EXCLUDE reads with all of the FLAGs present\n"  // !(F&x == x)  TODO long option
+"      --subsample FLOAT      Keep only FLOAT fraction of templates/read pairs\n"
+"      --subsample-seed INT   Influence WHICH reads are kept in subsampling [0]\n"
+"  -s INT.FRAC                Same as --subsample 0.FRAC --subsample-seed INT\n"
+"\n"
+"Processing options:\n"
+"      --add-flags FLAG       Add FLAGs to reads\n"
+"      --remove-flags FLAG    Remove FLAGs from reads\n"
+"  -x, --remove-tag STR       Strip tag STR from reads (option may be repeated)\n"
+"  -B, --remove-B             Collapse the backward CIGAR operation\n"
+"\n"
+"General options:\n"
+"  -?, --help   Print long help, including note about region specification\n"
+"  -S           Ignored (input format is auto-detected)\n"
+"      --no-PG  Do not add a PG line\n");
  
      sam_global_opt_help(fp, "-.O.T@..");
      fprintf(fp, "\n");
@@ -889,23 +1016,16 @@ static int usage(FILE *fp, int exit_status, int is_long_help)
  "\n"
  "6. Option `-u' is preferred over `-b' when the output is piped to\n"
  "   another samtools command.\n"
+"\n"
+"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n"
+"   command-line region arguments to be computed using the multi-region iterator\n"
+"   and an index. This increases speed, omits duplicates, and outputs the reads\n"
+"   as they are ordered in the input SAM/BAM/CRAM file.\n"
+"\n"
+"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n"
+"   together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n"
+"   so using both causes one of the specified BED files to be ignored.\n"
  "\n");
  
      return exit_status;
  }
-
-int main_import(int argc, char *argv[])
-{
-    int argc2, ret;
-    char **argv2;
-    if (argc != 4) {
-        fprintf(samtools_stderr, "Usage: samtools import <in.ref_list> <in.sam> <out.bam>\n");
-        return 1;
-    }
-    argc2 = 6;
-    argv2 = calloc(6, sizeof(char*));
-    argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2];
-    ret = main_samview(argc2, argv2);
-    free(argv2);
-    return ret;
-}
diff --git a/samtools/samtools.pysam.c b/samtools/samtools.pysam.c

index b26f892a93c77f51244b8e26481c99df5c0df90c..70446032bae1801c106b4a2df0ad4d850eaf53e7 100644 (file)
--- a/samtools/samtools.pysam.c
+++ b/samtools/samtools.pysam.c
@@ -1,6 +1,7 @@
  #include <ctype.h>
  #include <assert.h>
  #include <unistd.h>
+#include <setjmp.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
@@ -55,6 +56,25 @@ int samtools_puts(const char *s)
    return putc('\n', samtools_stdout);
  }
  
+
+static jmp_buf samtools_jmpbuf;
+static int samtools_status = 0;
+
+/*
+ * Run samtools_main() under a setjmp() checkpoint.
+ *
+ * Returns whatever samtools_main() returns when it finishes normally.  If
+ * samtools_exit() is called while samtools_main() is running, control comes
+ * back here through longjmp() and the status stored by samtools_exit() is
+ * returned instead.
+ */
+int samtools_dispatch(int argc, char *argv[])
+{
+  /* Non-zero means we were resumed by the longjmp() in samtools_exit().
+     Keep setjmp() as the entire controlling expression of the if: that is
+     one of the few contexts in which the C standard allows it. */
+  if (setjmp(samtools_jmpbuf) != 0)
+    return samtools_status;
+
+  /* Direct return from setjmp(): checkpoint armed, run the command. */
+  return samtools_main(argc, argv);
+}
+
+/*
+ * Abandon the current samtools_main() run with the given status.
+ *
+ * Stores STATUS in samtools_status and longjmp()s to the checkpoint set in
+ * samtools_dispatch(), which then returns STATUS to its own caller.  This
+ * function does not return.
+ *
+ * NOTE(review): in this file samtools_jmpbuf is only seen being set by
+ * samtools_dispatch().  Calling this function while no samtools_dispatch()
+ * call is active would longjmp() through an unset or stale buffer, which
+ * is undefined behaviour -- confirm every caller runs underneath
+ * samtools_dispatch().
+ */
+void samtools_exit(int status)
+{
+  /* Record the status first: after the jump it is the only way the value
+     reaches samtools_dispatch(). */
+  samtools_status = status;
+  longjmp(samtools_jmpbuf, 1);
+}
+
+
  void samtools_set_optind(int val)
  {
    // setting this in cython via 
diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h

index df8fd0179234306c3d83ccafbd0596b9d2886f0c..9d20ecb1cc7d02d410064e75282967f08f0bda18 100644 (file)
--- a/samtools/samtools.pysam.h
+++ b/samtools/samtools.pysam.h
@@ -3,6 +3,17 @@
  
  #include <stdio.h>
  
+/* Compilers without the __has_attribute() feature test get a stub that
+   answers "not supported" for every attribute, so the #if below still
+   parses. */
+#ifndef __has_attribute
+#define __has_attribute(attribute) 0
+#endif
+/* PYSAM_NORETURN annotates functions that never return to their caller.
+   It expands to __attribute__((__noreturn__)) when the compiler reports
+   support for that attribute or identifies itself as GCC 3 or later, and
+   to nothing otherwise.  A definition supplied before this point (e.g. on
+   the compiler command line) is left untouched. */
+#ifndef PYSAM_NORETURN
+#if __has_attribute(__noreturn__) || __GNUC__ >= 3
+#define PYSAM_NORETURN __attribute__((__noreturn__))
+#else
+#define PYSAM_NORETURN
+#endif
+#endif
+
  extern FILE * samtools_stderr;
  
  extern FILE * samtools_stdout;
@@ -40,6 +51,8 @@ int samtools_puts(const char *s);
  
  int samtools_dispatch(int argc, char *argv[]);
  
+void PYSAM_NORETURN samtools_exit(int status);
+
  void samtools_set_optind(int);
  
  extern int samtools_main(int argc, char *argv[]);
diff --git a/samtools/stats.c b/samtools/stats.c

index 55ede4c2834d7e10f726b85d90a3221fb6e75854..f030cf57de2cfc98d03a3bd400c48d1a3c5b251a 100644 (file)
--- a/samtools/stats.c
+++ b/samtools/stats.c
@@ -1,6 +1,6 @@
  /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
      Author: Sam Nicholls <sam@samnicholls.net>
@@ -175,8 +175,8 @@ typedef struct
      // Arrays for the histogram data
      uint64_t *quals_1st, *quals_2nd;
      uint64_t *gc_1st, *gc_2nd;
-    acgtno_count_t *acgtno_cycles_1st;
-    acgtno_count_t *acgtno_cycles_2nd;
+    acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd;
+    acgtno_count_t *acgtno_revcomp;
      uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd;
      uint64_t *insertions, *deletions;
      uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
@@ -210,7 +210,7 @@ typedef struct
      uint64_t nbases_mapped_cigar;
      uint64_t nbases_trimmed;  // bwa trimmed bases
      uint64_t nmismatches;
-    uint64_t nreads_QCfailed, nreads_secondary;
+    uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary;
      struct {
          uint32_t names, reads, quals;
      } checksum;
@@ -250,7 +250,7 @@ typedef struct
      uint32_t nchunks;
  
      uint32_t pair_count;          // Number of active pairs in the pairing hash table
-    uint32_t target_count;        // Number of bases covered by the target file
+    uint64_t target_count;        // Number of bases covered by the target file
      uint32_t last_pair_tid;
      uint32_t last_read_flush;
  
@@ -647,6 +647,11 @@ void realloc_buffers(stats_t *stats, int seq_len)
          error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
      memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
  
+    stats->acgtno_revcomp = realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t));
+    if ( !stats->acgtno_revcomp )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+    memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+
      stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
      if ( !stats->read_lengths )
          error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
@@ -870,16 +875,20 @@ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out
              switch (bam_seqi(seq, i)) {
              case 1:
                  acgtno_cycles[ read_cycle ].a++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++;
                  break;
              case 2:
                  acgtno_cycles[ read_cycle ].c++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++;
                  gc_count++;
                  break;
              case 4:
                  acgtno_cycles[ read_cycle ].g++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++;
                  gc_count++;
                  break;
              case 8:
+                reverse ? stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++;
                  acgtno_cycles[ read_cycle ].t++;
                  break;
              case 15:
@@ -1129,6 +1138,8 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat
  
  void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs)
  {
+    if ( !is_in_regions(bam_line,stats) )
+        return;
      if ( stats->rg_hash )
      {
          const uint8_t *rg = bam_aux_get(bam_line, "RG");
@@ -1145,8 +1156,6 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
          stats->nreads_filtered++;
          return;
      }
-    if ( !is_in_regions(bam_line,stats) )
-        return;
      if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
          return;
  
@@ -1159,6 +1168,11 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
          return;
      }
  
+    if ( bam_line->core.flag & BAM_FSUPPLEMENTARY )
+    {
+        stats->nreads_supplementary++;
+    }
+
      // If line has no sequence cannot continue
      int seq_len = bam_line->core.l_qseq;
      if ( !seq_len ) return;
@@ -1187,8 +1201,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
  
      // These stats should only be calculated for the original reads ignoring supplementary artificial reads
      // otherwise we'll accidentally double count
-    if ( IS_ORIGINAL(bam_line) )
-    {
+    if ( IS_ORIGINAL(bam_line) ) {
          stats->read_lengths[read_len]++;
          if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++;
          if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++;
@@ -1200,7 +1213,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
  
      count_indels(stats, bam_line);
  
-    if ( IS_PAIRED_AND_MAPPED(bam_line) )
+    if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) )
      {
          // The insert size is tricky, because for long inserts the libraries are
          // prepared differently and the pairs point in other direction. BWA does
@@ -1495,7 +1508,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
      fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
      fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
-    fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));  // not counting excluded seqs (and none of the below)
+    fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));  // not counting excluded seqs (and none of the below)
      fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
      fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));
      fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
@@ -1510,6 +1523,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
      fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
      fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+    fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary);
      fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
      fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st);
      fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd);
@@ -1535,7 +1549,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
      fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0);
      if ( stats->target_count ) {
-        fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count);
+        fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count);
          for (icov=stats->info->cov_threshold+1; icov<stats->ncov; icov++)
              cov_sum += stats->cov[icov];
          fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count);
@@ -1612,7 +1626,18 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
                  100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum,
                  100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum,
                  100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum);
-
+    }
+    fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n");
+    for (ibase=0; ibase<stats->max_len; ibase++)
+    {
+        acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]);
+        uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+        if ( ! acgt_sum ) continue;
+        fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,
+                100.*(acgtno_count->a)/acgt_sum,
+                100.*(acgtno_count->c)/acgt_sum,
+                100.*(acgtno_count->g)/acgt_sum,
+                100.*(acgtno_count->t)/acgt_sum);
      }
  
      uint64_t tA=0, tC=0, tG=0, tT=0, tN=0;
@@ -1800,7 +1825,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      }
  }
  
-static void init_regions(stats_t *stats, const char *file)
+static void init_regions(stats_t *stats, const char *file, stats_info_t* info)
  {
      FILE *fp = fopen(file,"r");
      if ( !fp ) error("%s: %s\n",file,strerror(errno));
@@ -1877,8 +1902,15 @@ static void init_regions(stats_t *stats, const char *file)
              }
              reg->npos = ++new_p;
          }
-        for (p = 0; p < reg->npos; p++)
-            stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+        for (p = 0; p < reg->npos; p++) {
+            if (reg->pos[p].end < HTS_POS_MAX) {
+                stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+            } else {
+                uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r);
+                if (hdr_end)
+                    stats->target_count += (hdr_end - reg->pos[p].beg + 1);
+            }
+        }
      }
  
      if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t))))
@@ -1941,7 +1973,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats)
      return 1;
  }
  
-int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
+int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) {
      if ( !stats || !iter)
          return 1;
  
@@ -1975,8 +2007,13 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
          for (j = 0; j < stats->regions[tid].npos; j++) {
              stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1;
              stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end;
-
-            stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+            if (stats->regions[tid].pos[j].end < HTS_POS_MAX) {
+                stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+            } else {
+                uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid);
+                if (hdr_end)
+                    stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1);
+            }
          }
      }
  
@@ -2073,6 +2110,7 @@ void cleanup_stats(stats_t* stats)
      free(stats->mpc_buf);
      free(stats->acgtno_cycles_1st);
      free(stats->acgtno_cycles_2nd);
+    free(stats->acgtno_revcomp);
      free(stats->read_lengths);
      free(stats->read_lengths_1st);
      free(stats->read_lengths_2nd);
@@ -2257,6 +2295,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
      if (!stats->acgtno_cycles_1st) goto nomem;
      stats->acgtno_cycles_2nd  = calloc(stats->nbases,sizeof(acgtno_count_t));
      if (!stats->acgtno_cycles_2nd) goto nomem;
+    stats->acgtno_revcomp  = calloc(stats->nbases,sizeof(acgtno_count_t));
+    if (!stats->acgtno_revcomp) goto nomem;
      stats->read_lengths   = calloc(stats->nbases,sizeof(uint64_t));
      if (!stats->read_lengths)     goto nomem;
      stats->read_lengths_1st   = calloc(stats->nbases,sizeof(uint64_t));
@@ -2279,7 +2319,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
          goto nomem;
      realloc_rseq_buffer(stats);
      if ( targets )
-        init_regions(stats, targets);
+        init_regions(stats, targets, info);
      return;
   nomem:
      error("Out of memory");
@@ -2459,7 +2499,7 @@ int main_stats(int argc, char *argv[])
              if (iter) {
                  if (!targets) {
                      all_stats->nchunks = argc-optind;
-                    if (replicate_regions(all_stats, iter))
+                    if (replicate_regions(all_stats, iter, info))
                          fprintf(stderr, "Replications of the regions failed\n");
                  }
  
diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c

index 3d126a70142ee2b325124350a11177adf388b7e9..9e8165d171120558a2d1c3b7471b4b1d888b9c53 100644 (file)
--- a/samtools/stats.c.pysam.c
+++ b/samtools/stats.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  stats.c -- This is the former bamcheck integrated into samtools/htslib.
  
-    Copyright (C) 2012-2019 Genome Research Ltd.
+    Copyright (C) 2012-2021 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
      Author: Sam Nicholls <sam@samnicholls.net>
@@ -177,8 +177,8 @@ typedef struct
      // Arrays for the histogram data
      uint64_t *quals_1st, *quals_2nd;
      uint64_t *gc_1st, *gc_2nd;
-    acgtno_count_t *acgtno_cycles_1st;
-    acgtno_count_t *acgtno_cycles_2nd;
+    acgtno_count_t *acgtno_cycles_1st, *acgtno_cycles_2nd;
+    acgtno_count_t *acgtno_revcomp;
      uint64_t *read_lengths, *read_lengths_1st, *read_lengths_2nd;
      uint64_t *insertions, *deletions;
      uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
@@ -212,7 +212,7 @@ typedef struct
      uint64_t nbases_mapped_cigar;
      uint64_t nbases_trimmed;  // bwa trimmed bases
      uint64_t nmismatches;
-    uint64_t nreads_QCfailed, nreads_secondary;
+    uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary;
      struct {
          uint32_t names, reads, quals;
      } checksum;
@@ -252,7 +252,7 @@ typedef struct
      uint32_t nchunks;
  
      uint32_t pair_count;          // Number of active pairs in the pairing hash table
-    uint32_t target_count;        // Number of bases covered by the target file
+    uint64_t target_count;        // Number of bases covered by the target file
      uint32_t last_pair_tid;
      uint32_t last_read_flush;
  
@@ -649,6 +649,11 @@ void realloc_buffers(stats_t *stats, int seq_len)
          error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
      memset(stats->acgtno_cycles_2nd + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
  
+    stats->acgtno_revcomp = realloc(stats->acgtno_revcomp, n*sizeof(acgtno_count_t));
+    if ( !stats->acgtno_revcomp )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len, n*sizeof(acgtno_count_t));
+    memset(stats->acgtno_revcomp + stats->nbases, 0, (n-stats->nbases)*sizeof(acgtno_count_t));
+
      stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
      if ( !stats->read_lengths )
          error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
@@ -872,16 +877,20 @@ void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out
              switch (bam_seqi(seq, i)) {
              case 1:
                  acgtno_cycles[ read_cycle ].a++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].t++ : stats->acgtno_revcomp[ read_cycle ].a++;
                  break;
              case 2:
                  acgtno_cycles[ read_cycle ].c++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].g++ : stats->acgtno_revcomp[ read_cycle ].c++;
                  gc_count++;
                  break;
              case 4:
                  acgtno_cycles[ read_cycle ].g++;
+                reverse ? stats->acgtno_revcomp[ read_cycle ].c++ : stats->acgtno_revcomp[ read_cycle ].g++;
                  gc_count++;
                  break;
              case 8:
+                reverse ? stats->acgtno_revcomp[ read_cycle ].a++ : stats->acgtno_revcomp[ read_cycle ].t++;
                  acgtno_cycles[ read_cycle ].t++;
                  break;
              case 15:
@@ -1131,6 +1140,8 @@ static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stat
  
  void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs)
  {
+    if ( !is_in_regions(bam_line,stats) )
+        return;
      if ( stats->rg_hash )
      {
          const uint8_t *rg = bam_aux_get(bam_line, "RG");
@@ -1147,8 +1158,6 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
          stats->nreads_filtered++;
          return;
      }
-    if ( !is_in_regions(bam_line,stats) )
-        return;
      if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen )
          return;
  
@@ -1161,6 +1170,11 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
          return;
      }
  
+    if ( bam_line->core.flag & BAM_FSUPPLEMENTARY )
+    {
+        stats->nreads_supplementary++;
+    }
+
      // If line has no sequence cannot continue
      int seq_len = bam_line->core.l_qseq;
      if ( !seq_len ) return;
@@ -1189,8 +1203,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
  
      // These stats should only be calculated for the original reads ignoring supplementary artificial reads
      // otherwise we'll accidentally double count
-    if ( IS_ORIGINAL(bam_line) )
-    {
+    if ( IS_ORIGINAL(bam_line) ) {
          stats->read_lengths[read_len]++;
          if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++;
          if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++;
@@ -1202,7 +1215,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair
  
      count_indels(stats, bam_line);
  
-    if ( IS_PAIRED_AND_MAPPED(bam_line) )
+    if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) )
      {
          // The insert size is tricky, because for long inserts the libraries are
          // prepared differently and the pairs point in other direction. BWA does
@@ -1497,7 +1510,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n");
      fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals);
      fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
-    fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));  // not counting excluded seqs (and none of the below)
+    fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));  // not counting excluded seqs (and none of the below)
      fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
      fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other));
      fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
@@ -1512,6 +1525,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0);
      fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
      fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
+    fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary);
      fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len);
      fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st);
      fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd);
@@ -1537,7 +1551,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
      fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0);
      if ( stats->target_count ) {
-        fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count);
+        fprintf(to, "SN\tbases inside the target:\t%" PRIu64 "\n", stats->target_count);
          for (icov=stats->info->cov_threshold+1; icov<stats->ncov; icov++)
              cov_sum += stats->cov[icov];
          fprintf(to, "SN\tpercentage of target genome with coverage > %d (%%):\t%.2f\n", stats->info->cov_threshold, (float)(100*cov_sum)/stats->target_count);
@@ -1614,7 +1628,18 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
                  100.*(acgtno_count_1st->t + acgtno_count_2nd->t)/acgt_sum,
                  100.*(acgtno_count_1st->n + acgtno_count_2nd->n)/acgt_sum,
                  100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum);
-
+    }
+    fprintf(to, "# ACGT content per cycle, read oriented. Use `grep ^GCT | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]\n");
+    for (ibase=0; ibase<stats->max_len; ibase++)
+    {
+        acgtno_count_t *acgtno_count = &(stats->acgtno_revcomp[ibase]);
+        uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t;
+        if ( ! acgt_sum ) continue;
+        fprintf(to, "GCT\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1,
+                100.*(acgtno_count->a)/acgt_sum,
+                100.*(acgtno_count->c)/acgt_sum,
+                100.*(acgtno_count->g)/acgt_sum,
+                100.*(acgtno_count->t)/acgt_sum);
      }
  
      uint64_t tA=0, tC=0, tG=0, tT=0, tN=0;
@@ -1802,7 +1827,7 @@ void output_stats(FILE *to, stats_t *stats, int sparse)
      }
  }
  
-static void init_regions(stats_t *stats, const char *file)
+static void init_regions(stats_t *stats, const char *file, stats_info_t* info)
  {
      FILE *fp = fopen(file,"r");
      if ( !fp ) error("%s: %s\n",file,strerror(errno));
@@ -1879,8 +1904,15 @@ static void init_regions(stats_t *stats, const char *file)
              }
              reg->npos = ++new_p;
          }
-        for (p = 0; p < reg->npos; p++)
-            stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+        for (p = 0; p < reg->npos; p++) {
+            if (reg->pos[p].end < HTS_POS_MAX) {
+                stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1);
+            } else {
+                uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, r);
+                if (hdr_end)
+                    stats->target_count += (hdr_end - reg->pos[p].beg + 1);
+            }
+        }
      }
  
      if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t))))
@@ -1943,7 +1975,7 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats)
      return 1;
  }
  
-int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
+int replicate_regions(stats_t *stats, hts_itr_multi_t *iter, stats_info_t *info) {
      if ( !stats || !iter)
          return 1;
  
@@ -1977,8 +2009,13 @@ int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
          for (j = 0; j < stats->regions[tid].npos; j++) {
              stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1;
              stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end;
-
-            stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+            if (stats->regions[tid].pos[j].end < HTS_POS_MAX) {
+                stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1);
+            } else {
+                uint64_t hdr_end = sam_hdr_tid2len(info->sam_header, tid);
+                if (hdr_end)
+                    stats->target_count += (hdr_end - stats->regions[tid].pos[j].beg + 1);
+            }
          }
      }
  
@@ -2054,7 +2091,7 @@ static void HTS_NORETURN error(const char *format, ...)
          vfprintf(samtools_stderr, format, ap);
          va_end(ap);
      }
-    exit(1);
+    samtools_exit(1);
  }
  
  void cleanup_stats_info(stats_info_t* info){
@@ -2075,6 +2112,7 @@ void cleanup_stats(stats_t* stats)
      free(stats->mpc_buf);
      free(stats->acgtno_cycles_1st);
      free(stats->acgtno_cycles_2nd);
+    free(stats->acgtno_revcomp);
      free(stats->read_lengths);
      free(stats->read_lengths_1st);
      free(stats->read_lengths_2nd);
@@ -2259,6 +2297,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
      if (!stats->acgtno_cycles_1st) goto nomem;
      stats->acgtno_cycles_2nd  = calloc(stats->nbases,sizeof(acgtno_count_t));
      if (!stats->acgtno_cycles_2nd) goto nomem;
+    stats->acgtno_revcomp  = calloc(stats->nbases,sizeof(acgtno_count_t));
+    if (!stats->acgtno_revcomp) goto nomem;
      stats->read_lengths   = calloc(stats->nbases,sizeof(uint64_t));
      if (!stats->read_lengths)     goto nomem;
      stats->read_lengths_1st   = calloc(stats->nbases,sizeof(uint64_t));
@@ -2281,7 +2321,7 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr
          goto nomem;
      realloc_rseq_buffer(stats);
      if ( targets )
-        init_regions(stats, targets);
+        init_regions(stats, targets, info);
      return;
   nomem:
      error("Out of memory");
@@ -2461,7 +2501,7 @@ int main_stats(int argc, char *argv[])
              if (iter) {
                  if (!targets) {
                      all_stats->nchunks = argc-optind;
-                    if (replicate_regions(all_stats, iter))
+                    if (replicate_regions(all_stats, iter, info))
                          fprintf(samtools_stderr, "Replications of the regions failed\n");
                  }
  
diff --git a/samtools/stats_isize.c.pysam.c b/samtools/stats_isize.c.pysam.c

index 96feb90675272ce1fb28401a2783007bd329f68e..1bb2bd4630cca4525f5a89ece5b67481079050ad 100644 (file)
--- a/samtools/stats_isize.c.pysam.c
+++ b/samtools/stats_isize.c.pysam.c
@@ -97,7 +97,7 @@ static void sparse_set_f(isize_data_t data, int at, isize_insert_t field, uint64
              a->max = max(at, a->max);
          } else {
              fprintf(samtools_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t");
-            exit(11);
+            samtools_exit(11);
          }
      } else {
          return;
diff --git a/samtools/tmp_file.h b/samtools/tmp_file.h

index 15d088e77319450ac388282c78cf8c06ae8b8765..4f2647cb7e43fe26fd89935729d5941c6cdca982 100644 (file)
--- a/samtools/tmp_file.h
+++ b/samtools/tmp_file.h
@@ -31,7 +31,7 @@ DEALINGS IN THE SOFTWARE
  #include <lz4.h>
  #include "htslib/sam.h"
  
-#ifdef _cplusplus
+#ifdef __cplusplus
  extern "C" {
  #endif
  
diff --git a/samtools/version.sh b/samtools/version.sh

index 5ccd9bb96d5ca633369b0b179da60365b0edd300..9d28100df452e2646d873042094166f5c60356f4 100755 (executable)
--- a/samtools/version.sh
+++ b/samtools/version.sh
@@ -24,7 +24,7 @@
  # DEALINGS IN THE SOFTWARE.
  
  # Master version, for use in tarballs or non-git source copies
-VERSION=1.10
+VERSION=1.13
  
  # If we have a git clone, then check against the current tag
  if [ -e .git ]
diff --git a/setup.py b/setup.py

index 072ed8ad75c6c6fca2eb7f9a5c8e4b66d1fce0e7..5f2bb00d78369ca6ff5af5ede1fe68e9b5030c52 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,10 @@ import subprocess
  import sys
  import sysconfig
  from contextlib import contextmanager
-from setuptools import setup
+from distutils import log
+from setuptools import setup, Command
+from setuptools.command.sdist import sdist
+
  from cy_build import CyExtension as Extension, cy_build_ext as build_ext
  try:
      import cython
@@ -79,6 +82,61 @@ def run_make_print_config():
      return make_print_config
  
  
+# This function emulates the way distutils combines settings from sysconfig,
+# environment variables, and the extension being built. It returns a dictionary
+# representing the usual set of variables, suitable for writing to a generated
+# file or for running configure (provided the returned LIBS is ignored).
+def build_config_dict(ext):
+    def env(var):
+        return [os.environ[var]] if var in os.environ else []
+
+    def sc(var):
+        value = sysconfig.get_config_var(var)
+        return [value] if value is not None else []
+
+    def optionise(option, valuelist):
+        def quote(s): return "'"+s+"'" if " " in s else s
+        return list(quote(option+v) for v in valuelist)
+
+    def kvtuples(pairlist):
+        def appendoptvalue(t): return t[0] if t[1] is None else t[0]+"="+t[1]
+        return map(appendoptvalue, pairlist)
+
+    # For CC, select the first of these that is set
+    cc = (env('CC') + sc('CC') + ['gcc'])[0]
+
+    # distutils ignores sysconfig for CPPFLAGS
+    cppflags = " ".join(env('CPPFLAGS') + optionise('-I', ext.include_dirs) +
+                        optionise('-D', kvtuples(ext.define_macros)) +
+                        optionise('-U', ext.undef_macros))
+
+    cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + ext.extra_compile_args)
+
+    # distutils actually includes $CPPFLAGS here too, but that's weird and
+    # unnecessary for us as we know the output LDFLAGS will be used correctly
+    ldflags = " ".join(sc('LDFLAGS') + env('LDFLAGS') + env('CFLAGS') +
+                       optionise('-L', ext.library_dirs) +
+                       ext.extra_link_args)
+
+    # ext.libraries is computed (incorporating $LIBS etc) during configure
+    libs = " ".join(optionise('-l', ext.libraries))
+
+    return { 'CC': cc, 'CPPFLAGS': cppflags, 'CFLAGS': cflags,
+             'LDFLAGS': ldflags, 'LIBS': libs }
+
+
+def write_configvars_header(filename, ext, prefix):
+    config = build_config_dict(ext)
+    if prefix != 'HTS':
+        config['HTSDIR'] = '(unused)'
+        config['CURSES_LIB'] = '(unused)'
+
+    log.info("creating %s for '%s' extension", filename, ext.name)
+    with open(filename, "w") as outf:
+        for var, value in config.items():
+            outf.write('#define {}_{} "{}"\n'.format(prefix, var, value))
+
+
  @contextmanager
  def set_compiler_envvars():
      tmp_vars = []
@@ -140,6 +198,46 @@ def get_pysam_version():
      return version.__version__
  
  
+# Override sdist command to ensure Cythonized *.c files are included.
+class cythonize_sdist(sdist):
+    # Remove when setuptools (as installed on GH runners) has these options
+    if not any(opt[0] == 'owner=' for opt in sdist.user_options):
+        sdist.user_options.append(('owner=', 'u', 'Specify owner inside tar'))
+    if not any(opt[0] == 'group=' for opt in sdist.user_options):
+        sdist.user_options.append(('group=', 'g', 'Specify group inside tar'))
+
+    def run(self):
+        from Cython.Build import cythonize
+        cythonize(self.distribution.ext_modules)
+        super().run()
+
+
+class clean_ext(Command):
+    description = "clean up Cython temporary files"
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        objs = glob.glob(os.path.join("pysam", "libc*.c"))
+        if objs:
+            log.info("removing 'pysam/libc*.c' (%s Cython objects)", len(objs))
+        for obj in objs:
+            os.remove(obj)
+
+        headers = (glob.glob(os.path.join("htslib",   "*config*.h")) +
+                   glob.glob(os.path.join("samtools", "*config*.h")) +
+                   glob.glob(os.path.join("bcftools", "*config*.h")))
+        if headers:
+            log.info("removing '*/*config*.h' (%s generated headers)", len(headers))
+        for header in headers:
+            os.remove(header)
+
+
  # How to link against HTSLIB
  # shared:   build shared chtslib from builtin htslib code.
  # external: use shared libhts.so compiled outside of
@@ -170,8 +268,6 @@ package_dirs = {'pysam': 'pysam',
  config_headers = ["samtools/config.h",
                    "bcftools/config.h"]
  
-cmdclass = {'build_ext': build_ext}
-
  # If cython is available, the pysam will be built using cython from
  # the .pyx files. If no cython is available, the C-files included in the
  # distribution will be used.
@@ -191,22 +287,6 @@ if not os.path.exists(fn):
          "from the repository"
          .format(fn))
  
-# exclude sources that contain a main function
-EXCLUDE = {
-    "samtools": (
-    ),
-    "bcftools": (
-        "test", "plugins", "peakfit.c",
-        "peakfit.h",
-        # needs to renamed, name conflict with samtools reheader
-        "reheader.c",
-        "polysomy.c"),
-    "htslib": (
-        'htslib/tabix.c',
-        'htslib/bgzip.c',
-        'htslib/htsfile.c'),
-}
-
  print ("# pysam: htslib mode is {}".format(HTSLIB_MODE))
  print ("# pysam: HTSLIB_CONFIGURE_OPTIONS={}".format(
      HTSLIB_CONFIGURE_OPTIONS))
@@ -364,11 +444,20 @@ libraries_for_pysam_module = external_htslib_libraries + internal_htslib_librari
  # The list below uses the union of include_dirs and library_dirs for
  # reasons of simplicity.
  
+def prebuild_libchtslib(ext, force):
+    if HTSLIB_MODE not in ['shared', 'separate']: return
+    write_configvars_header("htslib/config_vars.h", ext, "HTS")
+
+def prebuild_libcsamtools(ext, force):
+    write_configvars_header("samtools/samtools_config_vars.h", ext, "SAMTOOLS")
+
  modules = [
      dict(name="pysam.libchtslib",
+         prebuild_func=prebuild_libchtslib,
           sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files,
           libraries=external_htslib_libraries),
      dict(name="pysam.libcsamtools",
+         prebuild_func=prebuild_libcsamtools,
           sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) +
           [os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files,
           libraries=external_htslib_libraries + internal_htslib_libraries),
@@ -447,12 +536,11 @@ metadata = {
      'packages': package_list,
      'requires': ['cython (>=0.29.12)'],
      'ext_modules': [Extension(**opts) for opts in modules],
-    'cmdclass': cmdclass,
+    'cmdclass': {'build_ext': build_ext, 'clean_ext': clean_ext, 'sdist': cythonize_sdist},
      'package_dir': package_dirs,
      'package_data': {'': ['*.pxd', '*.h'], },
      # do not pack in order to permit linking to csamtools.so
      'zip_safe': False,
-    'use_2to3': True,
  }
  
  if __name__ == '__main__':
diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py

index 3c5dda59567559d0ed7fa34a8733980dd806b2b8..8fb1971e306b0789249684ee22bc7dcfc13a0130 100644 (file)
--- a/tests/AlignedSegment_test.py
+++ b/tests/AlignedSegment_test.py
@@ -7,7 +7,7 @@ import string
  import copy
  import array
  
-from TestUtils import checkFieldEqual, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3
+from TestUtils import checkFieldEqual, make_data_files, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3
  
  
  if IS_PYTHON3:
@@ -15,6 +15,11 @@ if IS_PYTHON3:
  else:
      maketrans = string.maketrans
  
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
  class ReadTest(unittest.TestCase):
  
      def build_read(self):
@@ -65,7 +70,7 @@ class TestAlignedSegment(ReadTest):
          a = pysam.AlignedSegment()
          s = str(a)
          self.assertEqual(
-            "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
+            "None\t0\t*\t0\t0\tNone\t*\t0\t0\tNone\tNone\t[]",
              s)
  
      def testSettingTagInEmptyRead(self):
@@ -525,13 +530,13 @@ class TestAlignedSegment(ReadTest):
      def test_query_length_is_limited(self):
          a = self.build_read()
          a.query_name = "A" * 1
-        a.query_name = "A" * 251
+        a.query_name = "A" * 254
          self.assertRaises(
              ValueError,
              setattr,
              a,
              "query_name",
-            "A" * 252)
+            "A" * 255)
  
      def test_header_accessible(self):
          a = self.build_read()
diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py

index e6c428754d770a134820f91190a85e6f3e2b9798..a665f430f290d77cb855dd28361075115a21772b 100644 (file)
--- a/tests/AlignmentFileHeader_test.py
+++ b/tests/AlignmentFileHeader_test.py
@@ -13,7 +13,7 @@ import copy
  from collections import OrderedDict as odict
  import pysam
  import pysam.samtools
-from TestUtils import get_temp_filename, BAM_DATADIR
+from TestUtils import get_temp_filename, make_data_files, BAM_DATADIR
  
  if sys.version_info.major >= 3:
      from io import StringIO
@@ -21,6 +21,10 @@ else:
      from StringIO import StringIO
  
  
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
  class TestHeaderConstruction(unittest.TestCase):
      """testing header construction."""
  
diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py

index 43072fa761826f2cd24de05f5c6c449a6631bb00..8e75a52dd4ca6c9e389af10ee921eb3ddd419166 100644 (file)
--- a/tests/AlignmentFilePileup_test.py
+++ b/tests/AlignmentFilePileup_test.py
@@ -2,10 +2,14 @@
  import os
  import pysam
  import unittest
-from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
+from TestUtils import make_data_files, BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
  import PileupTestUtils
  
  
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
  class TestPileupReadSelection(unittest.TestCase):
      '''test pileup functionality.'''
  
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py

index 28de4205959e2fb3b7d49b03e135e2ed335836b7..3a6cafc88dd73cec6b8a5b448b7a2c3bb35992a6 100644 (file)
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -24,7 +24,11 @@ import pysam
  import pysam.samtools
  from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
      check_samtools_view_equal, checkFieldEqual, force_str, \
-    get_temp_filename, BAM_DATADIR
+    get_temp_filename, make_data_files, BAM_DATADIR
+
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
  
  
  ##################################################
@@ -723,7 +727,7 @@ class TestIO(unittest.TestCase):
          read = load_bam()
          self.assertEqual(read.reference_name, "chr1")
          
-    # TOOD
+    # TODO
      # def testReadingFromSamFileWithoutHeader(self):
      #     '''read from samfile without header.
      #     '''
@@ -1391,12 +1395,12 @@ class TestEmptyHeader(unittest.TestCase):
          self.assertEqual(s.header.to_dict(), {'SQ': [{'LN': 1000, 'SN': 'chr1'}]})
  
      def test_bam_without_seq_in_header(self):
-        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header.bam"))
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header.bam"))
          self.assertTrue("SQ" in s.header.to_dict())
          self.assertTrue("@SQ" in str(s.header))
  
      def test_bam_without_seq_with_null_bytes_in_header(self):
-        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam"))
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "0example_no_seq_in_header_null_bytes.bam"))
          self.assertTrue("SQ" in s.header.to_dict())
          self.assertTrue("@SQ" in str(s.header))
  
@@ -1460,6 +1464,24 @@ class TestTruncatedBAM(unittest.TestCase):
              return len([a for a in x])
          self.assertRaises(IOError, iterall, s)
  
+        # Ignore closing errors, as s is now in an error state
+        try:
+            s.close()
+        except IOError:
+            pass
+
+
+class TestCorruptBAM(unittest.TestCase):
+    """See pull request 1035."""
+
+    def testCorruptBamIterator(self):
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex2_corrupt.bam"))
+
+        def iterall(x):
+            return len([a for a in x])
+
+        self.assertRaises(IOError, iterall, s)
+
  
  COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
                  0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
@@ -2316,26 +2338,6 @@ class TestSanityCheckingBAM(unittest.TestCase):
          self.check_write(read)
  
  
-class TestHeader1000Genomes(unittest.TestCase):
-
-    '''see issue 110'''
-    bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"  # noqa
-    bambase = "HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"  # noqa
-
-    def testRead(self):
-
-        if not check_url(self.bamfile):
-            return
-
-        f = pysam.AlignmentFile(self.bamfile, "rb")
-        data = f.header.copy()
-        self.assertTrue(data)
-
-    def tearDown(self):
-        if os.path.exists(self.bambase + ".bai"):
-            os.unlink(self.bambase + ".bai")
-
-
  class TestLargeCigar(unittest.TestCase):
  
      def setUp(self):
@@ -2422,9 +2424,6 @@ class TestLargeCigar(unittest.TestCase):
  #     mode = "w"
  
  if __name__ == "__main__":
-    # build data files
-    print("building data files")
-    subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
      print("starting tests")
      unittest.main()
      print("completed tests")
diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py

index f09ef372a628b7910b5734869a32b28672754b28..07adea81d53e0c452cceb899136f6623c33af92f 100644 (file)
--- a/tests/StreamFiledescriptors_test.py
+++ b/tests/StreamFiledescriptors_test.py
@@ -5,11 +5,15 @@ import threading
  import errno
  import unittest
  from pysam import AlignmentFile
-from TestUtils import BAM_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR
  
  IS_PYTHON2 = sys.version_info[0] == 2
  
  
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
  def alignmentfile_writer_thread(infile, outfile):
      def _writer_thread(infile, outfile):
          """read from infile and write to outfile"""
diff --git a/tests/TestUtils.py b/tests/TestUtils.py

index f33761e5836887e400710bfa3fc36c6fcd141244..97bd2edbfde42aa17e2e02fd4ab24109b244585f 100644 (file)
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -5,6 +5,7 @@ import difflib
  import gzip
  import contextlib
  import inspect
+import subprocess
  import tempfile
  import pysam
  
@@ -251,6 +252,18 @@ def get_temp_context(suffix="", keep=False):
              os.unlink(f)
  
  
+def make_data_files(directory):
+    what = None
+    try:
+        if not os.path.exists(os.path.join(directory, "all.stamp")):
+            subprocess.check_output(["make", "-C", directory], stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        what = "Making test data in '%s' failed:\n%s" % (directory, force_str(e.output))
+
+    if what is not None:
+        raise RuntimeError(what)
+
+
  def load_and_convert(filename, encode=True):
      '''load data from filename and convert all fields to string.
  
diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py

index 4458d1f163569e186b023bf5cedc161a6ce228fb..fcc39a63eb5d50b8fbd81b6c0f0699c492d6e45c 100644 (file)
--- a/tests/VariantFile_test.py
+++ b/tests/VariantFile_test.py
@@ -7,14 +7,17 @@ import unittest
  import pysam
  import shutil
  import gzip
-import subprocess
  
  try:
      from pathlib import Path
  except ImportError:
      Path = None
  
-from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context
+
+
+def setUpModule():
+    make_data_files(CBCF_DATADIR)
  
  
  def read_header(filename):
@@ -33,6 +36,12 @@ def read_header(filename):
      return data
  
  
+def read_index_header(filename):
+    with gzip.open(filename) as infile:
+        magic = infile.read(4)
+    return magic
+
+
  class TestMissingGenotypes(unittest.TestCase):
  
      filename = "missing_genotypes.vcf"
@@ -199,6 +208,7 @@ class TestIndexFormatsVCF(unittest.TestCase):
              shutil.copyfile(self.vcf_filename, fn)
              pysam.tabix_index(fn, preset="vcf", force=True)
              self.assertTrue(os.path.exists(fn + ".gz" + ".tbi"))
+            self.assertEqual(read_index_header(fn + ".gz.tbi"), b"TBI\1")
              self.assertFalse(os.path.exists(fn + ".gz" + ".csi"))
              
              with pysam.VariantFile(fn + ".gz") as inf:
@@ -210,6 +220,7 @@ class TestIndexFormatsVCF(unittest.TestCase):
  
              pysam.tabix_index(fn, preset="vcf", force=True, csi=True)
              self.assertTrue(os.path.exists(fn + ".gz" + ".csi"))
+            self.assertEqual(read_index_header(fn + ".gz.csi"), b"CSI\1")
              self.assertFalse(os.path.exists(fn + ".gz" + ".tbi"))
              
              with pysam.VariantFile(fn + ".gz") as inf:
@@ -221,6 +232,7 @@ class TestIndexFormatsVCF(unittest.TestCase):
              shutil.copyfile(self.bcf_filename + ".csi", fn + ".csi")
  
              self.assertTrue(os.path.exists(fn + ".csi"))
+            self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
              self.assertFalse(os.path.exists(fn + ".tbi"))
              
              with pysam.VariantFile(fn) as inf:
@@ -232,6 +244,7 @@ class TestIndexFormatsVCF(unittest.TestCase):
  
              pysam.tabix_index(fn, preset="bcf", force=True, csi=False)
              self.assertTrue(os.path.exists(fn + ".csi"))
+            self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
              self.assertFalse(os.path.exists(fn + ".tbi"))
              
              with pysam.VariantFile(fn) as inf:
@@ -244,6 +257,7 @@ class TestIndexFormatsVCF(unittest.TestCase):
              pysam.tabix_index(fn, preset="vcf", force=True, csi=True)
              
              self.assertTrue(os.path.exists(fn + ".csi"))
+            self.assertEqual(read_index_header(fn + ".csi"), b"CSI\1")
              self.assertFalse(os.path.exists(fn + ".tbi"))
              
              with pysam.VariantFile(fn) as inf:
@@ -668,9 +682,6 @@ class TestUnicode(unittest.TestCase):
                  
  
  if __name__ == "__main__":
-    # build data files
-    print("building data files")
-    subprocess.call("make -C %s" % CBCF_DATADIR, shell=True)
      print("starting tests")
      unittest.main()
      print("completed tests")
diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py

index fd80a80dd7e57130e28a485b08e79f8c52b901c8..5043d1f596d1b8ba70781c0a1baf183c0ef8dcaa 100644 (file)
--- a/tests/VariantRecord_test.py
+++ b/tests/VariantRecord_test.py
@@ -13,7 +13,11 @@ try:
  except ImportError:
      Path = None
  
-from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, make_data_files, CBCF_DATADIR, get_temp_context
+
+
+def setUpModule():
+    make_data_files(CBCF_DATADIR)
  
  
  @pytest.fixture
diff --git a/tests/cbcf_data/Makefile b/tests/cbcf_data/Makefile

index 796c3a66220589cd5171dee15ccccec096c79d07..9c3fe757328248b67bb4f1a1988f1b98b5e3abe2 100644 (file)
--- a/tests/cbcf_data/Makefile
+++ b/tests/cbcf_data/Makefile
@@ -4,7 +4,10 @@ VCF=$(filter-out example_empty.vcf,$(ALL_VCF))
  VCFGZ=$(VCF:%.vcf=%.vcf.gz)
  BCF=$(VCF:%.vcf=%.bcf)
  
-all: $(VCFGZ) $(BCF)
+all: all.stamp
+
+all.stamp: $(VCFGZ) $(BCF)
+       touch $@
  
  %.vcf.gz: %.vcf
         bgzip < $< > $@
@@ -19,5 +22,4 @@ example_empty.bcf: example_empty.vcf.gz
         touch $@
  
  clean:
-       rm -f *.gz *.tbi *.csi *.bcf
-
+       -rm -f all.stamp *.gz *.tbi *.csi *.bcf
diff --git a/tests/compile_test.py b/tests/compile_test.py

index f56adb77fba7e1327a495a1cfa5822876c2c817b..300ab92caf89f4705149ca5bd24ca64d7835d503 100644 (file)
--- a/tests/compile_test.py
+++ b/tests/compile_test.py
@@ -10,7 +10,13 @@ pysam and tabix works.
  import os
  import unittest
  import pysam
-from TestUtils import BAM_DATADIR, TABIX_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR
+
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+    make_data_files(TABIX_DATADIR)
+
  
  try:
      os.unlink('tests/_compile_test.c')
diff --git a/tests/faidx_test.py b/tests/faidx_test.py

index 171fae331576e1aa46c3773b1c117d7bc959c2eb..72520e70f7d469116608d03c043a082ae14dc505 100644 (file)
--- a/tests/faidx_test.py
+++ b/tests/faidx_test.py
@@ -6,7 +6,11 @@ import gzip
  import copy
  import shutil
  
-from TestUtils import check_url, BAM_DATADIR, get_temp_filename
+from TestUtils import check_url, make_data_files, BAM_DATADIR, get_temp_filename
+
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
  
  
  class TestFastaFile(unittest.TestCase):
diff --git a/tests/pysam_data/0example_no_seq_in_header.bam b/tests/pysam_data/0example_no_seq_in_header.bam

new file mode 100644 (file)

index 0000000..72de636

Binary files /dev/null and b/tests/pysam_data/0example_no_seq_in_header.bam differ
diff --git a/tests/pysam_data/0example_no_seq_in_header_null_bytes.bam b/tests/pysam_data/0example_no_seq_in_header_null_bytes.bam

new file mode 100644 (file)

index 0000000..aaf4b84

Binary files /dev/null and b/tests/pysam_data/0example_no_seq_in_header_null_bytes.bam differ
diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile

index 3921e8a5c7675236e51b14bfd40e343ba6e41fc2..c6ad88418c830bf7a58b8c5f919ee981d8a7ae18 100644 (file)
--- a/tests/pysam_data/Makefile
+++ b/tests/pysam_data/Makefile
@@ -3,11 +3,13 @@ BAM=$(SAM:%.sam=%.bam)
  BAI=$(BAM:%.bam=%.bam.bai)
  CRAM=ex1.cram ex2.cram ex3.cram
  CRAI=$(CRAM:%.cram=%.cram.crai)
-NO_PG:=$(findstring --no-PG,$(shell samtools view))
+NO_PG:=$(findstring --no-PG,$(shell samtools view '-?'))
  
  # ex2.bam - bam file without index
  
-all: ex1.pileup.gz \
+all: all.stamp
+
+all.stamp: ex1.pileup.gz \
         ex1.sam ex1.bam \
         ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \
         with_md.sam.gz with_md.bam with_md.bam.bai \
@@ -17,13 +19,15 @@ all: ex1.pileup.gz \
         example_bai.bam \
          rg_with_tab.bam \
         ex2_truncated.bam \
+       ex2_corrupt.bam \
         empty.bam empty.bam.bai \
         explicit_index.bam explicit_index.cram \
         faidx_empty_seq.fq.gz \
-       ex1.fa.gz ex1.fa.gz.csi \
+       ex1.fa.gz ex1.fa.gz.fai ex1.fa.gz.gzi \
         ex1_csi.bam \
         example_reverse_complement.bam \
         example_dash_in_chr.bam
+       touch $@
  
  # ex2.sam - as ex1.sam, but with header
  ex2.sam.gz: ex1.bam ex1.bam.bai
@@ -36,13 +40,13 @@ with_md.sam.gz: ex2.bam ex1.fa
  #      samtools view $(NO_PG) -bo $@ -t ex1.fa.fai $<
  
  uncompressed.bam: ex2.sam
-       samtools view $(NO_PG) -buS $< > $@
+       samtools view $(NO_PG) -bu -o $@ $<
  
  %.bam: %.sam
-       samtools view $(NO_PG) -bS $< > $@
+       samtools view $(NO_PG) -bo $@ $<
  
  %.cram: %.sam
-       samtools view $(NO_PG) -bC -T ex1.fa $< > $@
+       samtools view $(NO_PG) -Co $@ -T ex1.fa $<
  
  %.cram.crai: %.cram
         samtools index $<
@@ -50,8 +54,11 @@ uncompressed.bam: ex2.sam
  %.sam: %.sam.gz
         gunzip < $< > $@
  
-ex1.fa.fai:ex1.fa
-               samtools faidx ex1.fa
+%.fa.fai: %.fa
+       samtools faidx $<
+
+%.fa.gz.fai %.fa.gz.gzi: %.fa.gz
+       samtools faidx $<
  
  ex1.bam:ex1.sam.gz ex1.fa.fai
                 samtools view $(NO_PG) -bo ex1.bam -t ex1.fa.fai ex1.sam.gz
@@ -65,12 +72,16 @@ ex1.pileup.gz:ex1.bam ex1.fa
  ex2_truncated.bam: ex2.bam
         head -c 124000 ex2.bam > ex2_truncated.bam
  
+# Append a corrupt read with block_size < sizeof(bam_core_t fields)
+ex2_corrupt.bam: ex2.bam
+       (bgzip -d < $<; printf '\37\0\0\0\1\0\0\0') | bgzip > $@
+
  ex1_csi.bam: ex1.bam
         cp ex1.bam ex1_csi.bam
         samtools index -c ex1_csi.bam
  
  empty.bam: ex2.sam
-       grep "^@" $< | samtools view $(NO_PG) -Sb - > $@
+       grep "^@" $< | samtools view $(NO_PG) -bo $@ -
  
  example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam
         touch tmp.list
@@ -89,9 +100,9 @@ explicit_index.cram: ex1.cram
         cp ex1.cram $@
  
  clean:
-       rm -fr *.bam *.bai *.fai *.pileup* *.cram \
-       *~ calDepth *.dSYM pysam_*.sam \
-       ex2.sam ex2.sam.gz ex1.sam \
+       rm -fr [a-z]*.bam *.bai *.csi *.fai *.gzi *.pileup* [a-z]*.cram *.crai \
+       all.stamp *~ calDepth *.dSYM pysam_*.sam \
+       ex2.sam ex2.sam.gz ex1.sam ex1.fa.gz \
         with_md.sam.gz \
         *.fq.gz
  
@@ -100,6 +111,3 @@ clean:
  
  %.fa.gz: %.fa
         bgzip < $< > $@
-
-%.fa.gz.csi: %.fa.gz
-       samtools faidx $<
diff --git a/tests/pysam_data/ex1.sam.gz b/tests/pysam_data/ex1.sam.gz

index 8dd2bc447cb504be23c29aa54d1a7b8ccfb8fa73..16044675f2473b5bccf026d374ac1ee04dfe6b40 100644 (file)

Binary files a/tests/pysam_data/ex1.sam.gz and b/tests/pysam_data/ex1.sam.gz differ
diff --git a/tests/pysam_data/example_no_seq_in_header.bam b/tests/pysam_data/example_no_seq_in_header.bam

deleted file mode 100644 (file)

index 72de636..0000000

Binary files a/tests/pysam_data/example_no_seq_in_header.bam and /dev/null differ
diff --git a/tests/pysam_data/example_no_seq_in_header_null_bytes.bam b/tests/pysam_data/example_no_seq_in_header_null_bytes.bam

deleted file mode 100644 (file)

index aaf4b84..0000000

Binary files a/tests/pysam_data/example_no_seq_in_header_null_bytes.bam and /dev/null differ
diff --git a/tests/refactoring.txt b/tests/refactoring.txt

index 75db3c7cccd6af9c55a3392df779ca7b3e53b190..bc7d0866dc3bf617c7c13b0ae202726d37976464 100644 (file)
--- a/tests/refactoring.txt
+++ b/tests/refactoring.txt
@@ -57,7 +57,7 @@ overlap() -> getOverlap()
  Backwards incompatible changes:
  ================================
  
-1. Empty cigarstring now returns None (intstead of '')
+1. Empty cigarstring now returns None (instead of '')
  
  2. Empty cigar now returns None (instead of [])
  
diff --git a/tests/samtools_test.py b/tests/samtools_test.py

index f0d52c9e287314581d54e0996d002b82e1686b87..7c4023768f3af4361a4215b794e7f3e9c732ccd9 100644 (file)
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -18,12 +18,16 @@ import pysam.samtools
  import pysam.bcftools
  from TestUtils import checkBinaryEqual, check_lines_equal, \
      check_samtools_view_equal, get_temp_filename, force_bytes, WORKDIR, \
-    BAM_DATADIR
+    make_data_files, BAM_DATADIR
  
  
  IS_PYTHON3 = sys.version_info[0] >= 3
  
  
+def setUpModule():
+    make_data_files(BAM_DATADIR)
+
+
  def run_command(cmd):
      '''run a samtools command'''
      try:
@@ -93,7 +97,7 @@ class SamtoolsTest(unittest.TestCase):
          # Samtools-htslib-API: bam_get_library() not yet implemented
          # causes downstream problems
          # TODO: The following cause subsequent commands to fail
-        # unknow option
+        # unknown option
          # "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
          # "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
          "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam",
@@ -242,9 +246,10 @@ class SamtoolsTest(unittest.TestCase):
      def testStatements(self):
          for statement in self.statements:
              command = self.get_command(statement, map_to_internal=False)
-            # bam2fq differs between version 1.5 and 1.6 - reenable if
+            # bam2fq differs between version 1.5 and 1.6 - re-enable if
              # bioconda samtools will be available.
-            if command in ("bedcov", "stats", "dict", "bam2fq"):
+            # flagstat differs between version <=1.12 and >=1.13
+            if command in ("bedcov", "stats", "dict", "bam2fq", "flagstat"):
                  continue
  
              if (command == "calmd" and
@@ -401,7 +406,7 @@ if sys.platform != "darwin":
  #         # "filter -s A ex1.vcf.gz  > %(out)s_ex1.filter",
  #         # exit
  #         # "gtcheck -s A ex1.vcf.gz  > %(out)s_ex1.gtcheck",
-#         # segfauld, used to work wit bcftools 1.3
+#         # segfault, used to work with bcftools 1.3
  #         # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
  #         "stats ex1.vcf.gz > %(out)s_ex1.stats",
  #     ]
@@ -415,9 +420,6 @@ if sys.platform != "darwin":
  
  
  if __name__ == "__main__":
-    # build data files
-    print("building data files")
-    subprocess.call("make -C %s" % BAM_DATADIR, shell=True)
      print("starting tests")
      unittest.main()
      print("completed tests")
diff --git a/tests/tabix_data/Makefile b/tests/tabix_data/Makefile

new file mode 100644 (file)

index 0000000..22e5f55
--- /dev/null
+++ b/tests/tabix_data/Makefile
@@ -0,0 +1,7 @@
+all: all.stamp
+
+all.stamp:
+       touch $@
+
+clean:
+       -rm -f all.stamp
diff --git a/tests/tabix_data/example.bed.gz.tbi b/tests/tabix_data/example.bed.gz.tbi

index a529607bb4551fa61b3ed359b8e33c2d9e92bf5d..cf79b95ae232e068968aa9d863a03164350253bd 100644 (file)

Binary files a/tests/tabix_data/example.bed.gz.tbi and b/tests/tabix_data/example.bed.gz.tbi differ
diff --git a/tests/tabix_data/example.gff3.gz.tbi b/tests/tabix_data/example.gff3.gz.tbi

index 855e13926d35975f3b8d38b5cba65228dd4ecd00..d23afbb6fc39c07e57879c620a1b4b113dd33f9c 100644 (file)

Binary files a/tests/tabix_data/example.gff3.gz.tbi and b/tests/tabix_data/example.gff3.gz.tbi differ
diff --git a/tests/tabix_data/example.gtf.gz.tbi b/tests/tabix_data/example.gtf.gz.tbi

index 6e4fb0bfea07421dacec207d3f59c87e0a3a76c4..aa5009d670651dcf2d21e9c62982396099705b52 100644 (file)

Binary files a/tests/tabix_data/example.gtf.gz.tbi and b/tests/tabix_data/example.gtf.gz.tbi differ
diff --git a/tests/tabix_data/example.vcf.gz.tbi b/tests/tabix_data/example.vcf.gz.tbi

index ddb120e44132d17fd5b151c9c6af8351dbd22398..97c80efec315040f37a7d0c230ce868d17739b1b 100644 (file)

Binary files a/tests/tabix_data/example.vcf.gz.tbi and b/tests/tabix_data/example.vcf.gz.tbi differ
diff --git a/tests/tabix_data/example_badcomments.bed.gz.tbi b/tests/tabix_data/example_badcomments.bed.gz.tbi

index 04631805ed33127ce32fccbbfe15d90115e442f4..0ab947f612fe56e7947ef2af0285085eebdba6dc 100644 (file)

Binary files a/tests/tabix_data/example_badcomments.bed.gz.tbi and b/tests/tabix_data/example_badcomments.bed.gz.tbi differ
diff --git a/tests/tabix_data/example_badcomments.gtf.gz.tbi b/tests/tabix_data/example_badcomments.gtf.gz.tbi

index c7731fc26fc6d9d607c81bbae0a274f1e775cc29..16fb1355db063d4ecb95544d85244910b42d1f89 100644 (file)

Binary files a/tests/tabix_data/example_badcomments.gtf.gz.tbi and b/tests/tabix_data/example_badcomments.gtf.gz.tbi differ
diff --git a/tests/tabix_data/example_badcomments.vcf.gz.tbi b/tests/tabix_data/example_badcomments.vcf.gz.tbi

index 366004b49e71da8b701f63febfc6b5e4a5930b21..38f4b591f25cea7ada5d617a324070f784b2cfd2 100644 (file)

Binary files a/tests/tabix_data/example_badcomments.vcf.gz.tbi and b/tests/tabix_data/example_badcomments.vcf.gz.tbi differ
diff --git a/tests/tabix_data/example_comments.bed.gz.tbi b/tests/tabix_data/example_comments.bed.gz.tbi

index 42544b2390d8b8a725b4fd64161c47dff4d6ff36..89b1bb3384da9062c4055b64eb115855b3170981 100644 (file)

Binary files a/tests/tabix_data/example_comments.bed.gz.tbi and b/tests/tabix_data/example_comments.bed.gz.tbi differ
diff --git a/tests/tabix_data/example_comments.gtf.gz.tbi b/tests/tabix_data/example_comments.gtf.gz.tbi

index 2f33d40ff588ca01095f089f0d184bd4aca2cd9d..54f5389f960f8bb24e6375befeb007509d2bc953 100644 (file)

Binary files a/tests/tabix_data/example_comments.gtf.gz.tbi and b/tests/tabix_data/example_comments.gtf.gz.tbi differ
diff --git a/tests/tabix_data/example_comments.vcf.gz.tbi b/tests/tabix_data/example_comments.vcf.gz.tbi

index 366004b49e71da8b701f63febfc6b5e4a5930b21..38f4b591f25cea7ada5d617a324070f784b2cfd2 100644 (file)

Binary files a/tests/tabix_data/example_comments.vcf.gz.tbi and b/tests/tabix_data/example_comments.vcf.gz.tbi differ
diff --git a/tests/tabix_test.py b/tests/tabix_test.py

index c17f7ffe40b7c746144cc1dfe980632af3dc4e8e..754617538e13ee9be41c6576f8c4f461d5bce48b 100644 (file)
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -11,15 +11,18 @@ import shutil
  import gzip
  import pysam
  import unittest
-import subprocess
  import glob
  import re
  from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
-    load_and_convert, TABIX_DATADIR, get_temp_filename
+    load_and_convert, make_data_files, TABIX_DATADIR, get_temp_filename
  
  IS_PYTHON3 = sys.version_info[0] >= 3
  
  
+def setUpModule():
+    make_data_files(TABIX_DATADIR)
+
+
  def myzip_open(infile, mode="r"):
      '''open compressed file and decode.'''
  
@@ -1239,5 +1242,4 @@ class TestMultithreadTabixFile(unittest.TestCase):
  
  
  if __name__ == "__main__":
-    subprocess.call("make -C %s" % TABIX_DATADIR, shell=True)
      unittest.main()
diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py

index 7ad7db0d1a218ab49255b309d559f7fa14dbf03c..180690940c9f1829216e24fc1fdf213c02ee70d5 100644 (file)
--- a/tests/tabixproxies_test.py
+++ b/tests/tabixproxies_test.py
@@ -5,7 +5,11 @@ import sys
  import re
  import copy
  import gzip
-from TestUtils import load_and_convert, TABIX_DATADIR
+from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR
+
+
+def setUpModule():
+    make_data_files(TABIX_DATADIR)
  
  
  class TestParser(unittest.TestCase):
diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py

index f30ff9c5c2b23f11daebac6384240fb17e672ddf..da4d332623dcac672d1dade228f9ec6c032a96e8 100644 (file)
--- a/tests/test_samtools_python.py
+++ b/tests/test_samtools_python.py
@@ -1,7 +1,11 @@
  import pysam
  import os
  import pytest
-from TestUtils import BAM_DATADIR
+from TestUtils import make_data_files, BAM_DATADIR
+
+
+def setUpModule():
+    make_data_files(BAM_DATADIR)
  
  
  def test_idxstats_parse_split_lines():
author	Nilesh Patra <nilesh@debian.org>
	Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)
committer	Nilesh Patra <nilesh@debian.org>
	Thu, 14 Oct 2021 19:28:59 +0000 (00:58 +0530)
.github/workflows/ci.yaml	[new file with mode: 0644]	patch \| blob
.github/workflows/release.yaml	[new file with mode: 0644]	patch \| blob
.gitignore		patch \| blob \| history
.travis.disabled.yml	[new file with mode: 0644]	patch \| blob
.travis.yml	[deleted file]	patch \| blob \| history
AUTHORS		patch \| blob \| history
INSTALL		patch \| blob \| history
MANIFEST.in		patch \| blob \| history
NEWS		patch \| blob \| history
README.rst		patch \| blob \| history
bcftools/HMM.c		patch \| blob \| history
bcftools/HMM.c.pysam.c		patch \| blob \| history
bcftools/HMM.h		patch \| blob \| history
bcftools/LICENSE		patch \| blob \| history
bcftools/README		patch \| blob \| history
bcftools/abuf.c	[new file with mode: 0644]	patch \| blob
bcftools/abuf.c.pysam.c	[new file with mode: 0644]	patch \| blob
bcftools/abuf.h	[new file with mode: 0644]	patch \| blob
bcftools/bam2bcf.c		patch \| blob \| history
bcftools/bam2bcf.c.pysam.c		patch \| blob \| history
bcftools/bam2bcf.h		patch \| blob \| history
bcftools/bam2bcf_indel.c		patch \| blob \| history
bcftools/bam2bcf_indel.c.pysam.c		patch \| blob \| history
bcftools/bcftools.h		patch \| blob \| history
bcftools/bcftools.pysam.c		patch \| blob \| history
bcftools/bcftools.pysam.h		patch \| blob \| history
bcftools/bin.c		patch \| blob \| history
bcftools/bin.c.pysam.c		patch \| blob \| history
bcftools/call.h		patch \| blob \| history
bcftools/ccall.c		patch \| blob \| history
bcftools/ccall.c.pysam.c		patch \| blob \| history
bcftools/consensus.c		patch \| blob \| history
bcftools/consensus.c.pysam.c		patch \| blob \| history
bcftools/convert.c		patch \| blob \| history
bcftools/convert.c.pysam.c		patch \| blob \| history
bcftools/csq.c		patch \| blob \| history
bcftools/csq.c.pysam.c		patch \| blob \| history
bcftools/dist.c	[new file with mode: 0644]	patch \| blob
bcftools/dist.c.pysam.c	[new file with mode: 0644]	patch \| blob
bcftools/dist.h	[new file with mode: 0644]	patch \| blob
bcftools/em.c		patch \| blob \| history
bcftools/em.c.pysam.c		patch \| blob \| history
bcftools/extsort.c	[new file with mode: 0644]	patch \| blob
bcftools/extsort.c.pysam.c	[new file with mode: 0644]	patch \| blob
bcftools/extsort.h	[new file with mode: 0644]	patch \| blob
bcftools/filter.c		patch \| blob \| history
bcftools/filter.c.pysam.c		patch \| blob \| history
bcftools/filter.h		patch \| blob \| history
bcftools/hclust.c		patch \| blob \| history
bcftools/hclust.c.pysam.c		patch \| blob \| history
bcftools/htslib-1.10.2/LICENSE	[deleted file]	patch \| blob \| history
bcftools/htslib-1.10.2/README	[deleted file]	patch \| blob \| history
bcftools/main.c		patch \| blob \| history
bcftools/main.c.pysam.c		patch \| blob \| history
bcftools/mcall.c		patch \| blob \| history
bcftools/mcall.c.pysam.c		patch \| blob \| history
bcftools/mpileup.c		patch \| blob \| history
bcftools/mpileup.c.pysam.c		patch \| blob \| history
bcftools/ploidy.h		patch \| blob \| history
bcftools/prob1.c		patch \| blob \| history
bcftools/prob1.c.pysam.c		patch \| blob \| history
bcftools/prob1.h		patch \| blob \| history
bcftools/rbuf.h		patch \| blob \| history
bcftools/regidx.c		patch \| blob \| history
bcftools/regidx.c.pysam.c		patch \| blob \| history
bcftools/regidx.h		patch \| blob \| history
bcftools/reheader.c		patch \| blob \| history
bcftools/reheader.c.pysam.c		patch \| blob \| history
bcftools/smpl_ilist.c		patch \| blob \| history
bcftools/smpl_ilist.c.pysam.c		patch \| blob \| history
bcftools/str_finder.c	[new file with mode: 0644]	patch \| blob
bcftools/str_finder.c.pysam.c	[new file with mode: 0644]	patch \| blob
bcftools/str_finder.h	[new file with mode: 0644]	patch \| blob
bcftools/utlist.h	[new file with mode: 0644]	patch \| blob
bcftools/vcfannotate.c		patch \| blob \| history
bcftools/vcfannotate.c.pysam.c		patch \| blob \| history
bcftools/vcfbuf.c		patch \| blob \| history
bcftools/vcfbuf.c.pysam.c		patch \| blob \| history
bcftools/vcfbuf.h		patch \| blob \| history
bcftools/vcfcall.c		patch \| blob \| history
bcftools/vcfcall.c.pysam.c		patch \| blob \| history
bcftools/vcfcnv.c		patch \| blob \| history
bcftools/vcfcnv.c.pysam.c		patch \| blob \| history
bcftools/vcfconcat.c		patch \| blob \| history
bcftools/vcfconcat.c.pysam.c		patch \| blob \| history
bcftools/vcfconvert.c		patch \| blob \| history
bcftools/vcfconvert.c.pysam.c		patch \| blob \| history
bcftools/vcffilter.c		patch \| blob \| history
bcftools/vcffilter.c.pysam.c		patch \| blob \| history
bcftools/vcfgtcheck.c		patch \| blob \| history
bcftools/vcfgtcheck.c.pysam.c		patch \| blob \| history
bcftools/vcfindex.c		patch \| blob \| history
bcftools/vcfindex.c.pysam.c		patch \| blob \| history
bcftools/vcfisec.c		patch \| blob \| history
bcftools/vcfisec.c.pysam.c		patch \| blob \| history
bcftools/vcfmerge.c		patch \| blob \| history
bcftools/vcfmerge.c.pysam.c		patch \| blob \| history
bcftools/vcfnorm.c		patch \| blob \| history
bcftools/vcfnorm.c.pysam.c		patch \| blob \| history
bcftools/vcfplugin.c		patch \| blob \| history
bcftools/vcfplugin.c.pysam.c		patch \| blob \| history
bcftools/vcfquery.c		patch \| blob \| history
bcftools/vcfquery.c.pysam.c		patch \| blob \| history
bcftools/vcfroh.c		patch \| blob \| history
bcftools/vcfroh.c.pysam.c		patch \| blob \| history
bcftools/vcfsom.c		patch \| blob \| history
bcftools/vcfsom.c.pysam.c		patch \| blob \| history
bcftools/vcfsort.c		patch \| blob \| history
bcftools/vcfsort.c.pysam.c		patch \| blob \| history
bcftools/vcfstats.c		patch \| blob \| history
bcftools/vcfstats.c.pysam.c		patch \| blob \| history
bcftools/vcfview.c		patch \| blob \| history
bcftools/vcfview.c.pysam.c		patch \| blob \| history
bcftools/vcmp.c		patch \| blob \| history
bcftools/vcmp.c.pysam.c		patch \| blob \| history
bcftools/vcmp.h		patch \| blob \| history
bcftools/version.c		patch \| blob \| history
bcftools/version.c.pysam.c		patch \| blob \| history
bcftools/version.sh		patch \| blob \| history
cy_build.py		patch \| blob \| history
devtools/import.py		patch \| blob \| history
devtools/install-CGAT-tools.sh		patch \| blob \| history
devtools/run_tests_travis.sh		patch \| blob \| history
doc/api.rst		patch \| blob \| history
doc/benchmarking.rst		patch \| blob \| history
doc/conf.py		patch \| blob \| history
doc/developer.rst		patch \| blob \| history
doc/faq.rst		patch \| blob \| history
doc/glossary.rst		patch \| blob \| history
doc/index.rst		patch \| blob \| history
doc/installation.rst		patch \| blob \| history
doc/release.rst		patch \| blob \| history
doc/usage.rst		patch \| blob \| history
import/pysam.c		patch \| blob \| history
import/pysam.h		patch \| blob \| history
pysam.py	[deleted file]	patch \| blob \| history
pysam/__init__.py		patch \| blob \| history
pysam/libcalignedsegment.pxd		patch \| blob \| history
pysam/libcalignedsegment.pyx		patch \| blob \| history
pysam/libcalignmentfile.pxd		patch \| blob \| history
pysam/libcalignmentfile.pyx		patch \| blob \| history
pysam/libcbcf.pyx		patch \| blob \| history
pysam/libcbcftools.pxd		patch \| blob \| history
pysam/libcfaidx.pyx		patch \| blob \| history
pysam/libchtslib.pxd		patch \| blob \| history
pysam/libchtslib.pyx		patch \| blob \| history
pysam/libcsamtools.pxd		patch \| blob \| history
pysam/libctabix.pyx		patch \| blob \| history
pysam/libcutils.pxd		patch \| blob \| history
pysam/libcutils.pyx		patch \| blob \| history
pysam/samtools.py		patch \| blob \| history
pysam/version.h		patch \| blob \| history
pysam/version.py		patch \| blob \| history
samtools/LICENSE		patch \| blob \| history
samtools/README		patch \| blob \| history
samtools/amplicon_stats.c	[new file with mode: 0644]	patch \| blob
samtools/amplicon_stats.c.pysam.c	[new file with mode: 0644]	patch \| blob
samtools/bam.c		patch \| blob \| history
samtools/bam.c.pysam.c		patch \| blob \| history
samtools/bam.h		patch \| blob \| history
samtools/bam2bcf_indel.c		patch \| blob \| history
samtools/bam2bcf_indel.c.pysam.c		patch \| blob \| history
samtools/bam2depth.c		patch \| blob \| history
samtools/bam2depth.c.pysam.c		patch \| blob \| history
samtools/bam_addrprg.c		patch \| blob \| history
samtools/bam_addrprg.c.pysam.c		patch \| blob \| history
samtools/bam_ampliconclip.c	[new file with mode: 0644]	patch \| blob
samtools/bam_ampliconclip.c.pysam.c	[new file with mode: 0644]	patch \| blob
samtools/bam_ampliconclip.h	[new file with mode: 0644]	patch \| blob
samtools/bam_aux.c		patch \| blob \| history
samtools/bam_aux.c.pysam.c		patch \| blob \| history
samtools/bam_cat.c		patch \| blob \| history
samtools/bam_cat.c.pysam.c		patch \| blob \| history
samtools/bam_color.c		patch \| blob \| history
samtools/bam_color.c.pysam.c		patch \| blob \| history
samtools/bam_fastq.c		patch \| blob \| history
samtools/bam_fastq.c.pysam.c		patch \| blob \| history
samtools/bam_flags.c		patch \| blob \| history
samtools/bam_flags.c.pysam.c		patch \| blob \| history
samtools/bam_import.c	[new file with mode: 0644]	patch \| blob
samtools/bam_import.c.pysam.c	[new file with mode: 0644]	patch \| blob
samtools/bam_index.c.pysam.c		patch \| blob \| history
samtools/bam_markdup.c		patch \| blob \| history
samtools/bam_markdup.c.pysam.c		patch \| blob \| history
samtools/bam_mate.c		patch \| blob \| history
samtools/bam_mate.c.pysam.c		patch \| blob \| history
samtools/bam_md.c		patch \| blob \| history
samtools/bam_md.c.pysam.c		patch \| blob \| history
samtools/bam_plcmd.c		patch \| blob \| history
samtools/bam_plcmd.c.pysam.c		patch \| blob \| history
samtools/bam_reheader.c.pysam.c		patch \| blob \| history
samtools/bam_rmdupse.c.pysam.c		patch \| blob \| history
samtools/bam_sort.c		patch \| blob \| history
samtools/bam_sort.c.pysam.c		patch \| blob \| history
samtools/bam_stat.c		patch \| blob \| history
samtools/bam_stat.c.pysam.c		patch \| blob \| history
samtools/bamtk.c		patch \| blob \| history
samtools/bamtk.c.pysam.c		patch \| blob \| history
samtools/bedcov.c		patch \| blob \| history
samtools/bedcov.c.pysam.c		patch \| blob \| history
samtools/bedidx.c		patch \| blob \| history
samtools/bedidx.c.pysam.c		patch \| blob \| history
samtools/coverage.c		patch \| blob \| history
samtools/coverage.c.pysam.c		patch \| blob \| history
samtools/cut_target.c		patch \| blob \| history
samtools/cut_target.c.pysam.c		patch \| blob \| history
samtools/dict.c		patch \| blob \| history
samtools/dict.c.pysam.c		patch \| blob \| history
samtools/faidx.c		patch \| blob \| history
samtools/faidx.c.pysam.c		patch \| blob \| history
samtools/htslib-1.10/LICENSE	[deleted file]	patch \| blob \| history
samtools/htslib-1.10/README	[deleted file]	patch \| blob \| history
samtools/padding.c		patch \| blob \| history
samtools/padding.c.pysam.c		patch \| blob \| history
samtools/phase.c		patch \| blob \| history
samtools/phase.c.pysam.c		patch \| blob \| history
samtools/sam_view.c		patch \| blob \| history
samtools/sam_view.c.pysam.c		patch \| blob \| history
samtools/samtools.pysam.c		patch \| blob \| history
samtools/samtools.pysam.h		patch \| blob \| history
samtools/stats.c		patch \| blob \| history
samtools/stats.c.pysam.c		patch \| blob \| history
samtools/stats_isize.c.pysam.c		patch \| blob \| history
samtools/tmp_file.h		patch \| blob \| history
samtools/version.sh		patch \| blob \| history
setup.py		patch \| blob \| history
tests/AlignedSegment_test.py		patch \| blob \| history
tests/AlignmentFileHeader_test.py		patch \| blob \| history
tests/AlignmentFilePileup_test.py		patch \| blob \| history
tests/AlignmentFile_test.py		patch \| blob \| history
tests/StreamFiledescriptors_test.py		patch \| blob \| history
tests/TestUtils.py		patch \| blob \| history
tests/VariantFile_test.py		patch \| blob \| history
tests/VariantRecord_test.py		patch \| blob \| history
tests/cbcf_data/Makefile		patch \| blob \| history
tests/compile_test.py		patch \| blob \| history
tests/faidx_test.py		patch \| blob \| history
tests/pysam_data/0example_no_seq_in_header.bam	[new file with mode: 0644]	patch \| blob
tests/pysam_data/0example_no_seq_in_header_null_bytes.bam	[new file with mode: 0644]	patch \| blob
tests/pysam_data/Makefile		patch \| blob \| history
tests/pysam_data/ex1.sam.gz		patch \| blob \| history
tests/pysam_data/example_no_seq_in_header.bam	[deleted file]	patch \| blob \| history
tests/pysam_data/example_no_seq_in_header_null_bytes.bam	[deleted file]	patch \| blob \| history
tests/refactoring.txt		patch \| blob \| history
tests/samtools_test.py		patch \| blob \| history
tests/tabix_data/Makefile	[new file with mode: 0644]	patch \| blob
tests/tabix_data/example.bed.gz.tbi		patch \| blob \| history
tests/tabix_data/example.gff3.gz.tbi		patch \| blob \| history
tests/tabix_data/example.gtf.gz.tbi		patch \| blob \| history
tests/tabix_data/example.vcf.gz.tbi		patch \| blob \| history
tests/tabix_data/example_badcomments.bed.gz.tbi		patch \| blob \| history
tests/tabix_data/example_badcomments.gtf.gz.tbi		patch \| blob \| history
tests/tabix_data/example_badcomments.vcf.gz.tbi		patch \| blob \| history
tests/tabix_data/example_comments.bed.gz.tbi		patch \| blob \| history
tests/tabix_data/example_comments.gtf.gz.tbi		patch \| blob \| history
tests/tabix_data/example_comments.vcf.gz.tbi		patch \| blob \| history
tests/tabix_test.py		patch \| blob \| history
tests/tabixproxies_test.py		patch \| blob \| history
tests/test_samtools_python.py		patch \| blob \| history